{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998751404669747, "eval_steps": 1000, "global_step": 2002, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000499438132101386, "grad_norm": 0.22265625, "learning_rate": 2.4875621890547265e-08, "logits/chosen": -0.3009346127510071, "logits/rejected": -0.224898099899292, "logps/chosen": -43.235816955566406, "logps/rejected": -65.95542907714844, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.00499438132101386, "grad_norm": 0.1904296875, "learning_rate": 2.4875621890547267e-07, "logits/chosen": -0.4162670373916626, "logits/rejected": -0.31764352321624756, "logps/chosen": -43.73904037475586, "logps/rejected": -88.3354263305664, "loss": 0.4999, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.0004759904695674777, "rewards/margins": 0.0009994357824325562, "rewards/rejected": -0.0005234453710727394, "step": 10 }, { "epoch": 0.00998876264202772, "grad_norm": 0.16796875, "learning_rate": 4.975124378109453e-07, "logits/chosen": -0.41128048300743103, "logits/rejected": -0.3287343382835388, "logps/chosen": -43.18193054199219, "logps/rejected": -69.37371063232422, "loss": 0.4999, "rewards/accuracies": 0.625, "rewards/chosen": 8.649445953778923e-05, "rewards/margins": 0.000692047062329948, "rewards/rejected": -0.0006055526318959892, "step": 20 }, { "epoch": 0.014983143963041578, "grad_norm": 0.25, "learning_rate": 7.462686567164179e-07, "logits/chosen": -0.4024788439273834, "logits/rejected": -0.3096240162849426, "logps/chosen": -42.980751037597656, "logps/rejected": -73.10075378417969, "loss": 0.4999, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0004937038174830377, "rewards/margins": 0.0007297725533135235, "rewards/rejected": -0.00023606869217474014, "step": 30 }, { "epoch": 0.01997752528405544, "grad_norm": 0.208984375, "learning_rate": 9.950248756218907e-07, "logits/chosen": -0.41356319189071655, "logits/rejected": -0.34054869413375854, "logps/chosen": -43.257789611816406, "logps/rejected": -69.32649230957031, "loss": 0.4998, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0002575941034592688, "rewards/margins": 0.0018243074882775545, "rewards/rejected": -0.0015667133266106248, "step": 40 }, { "epoch": 0.024971906605069295, "grad_norm": 0.26953125, "learning_rate": 1.2437810945273632e-06, "logits/chosen": -0.4217616021633148, "logits/rejected": -0.3440130352973938, "logps/chosen": -44.67601776123047, "logps/rejected": -78.74809265136719, "loss": 0.4995, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0004624166467692703, "rewards/margins": 0.002841049339622259, "rewards/rejected": -0.00237863278016448, "step": 50 }, { "epoch": 0.029966287926083156, "grad_norm": 0.158203125, "learning_rate": 1.4925373134328358e-06, "logits/chosen": -0.4335503578186035, "logits/rejected": -0.3408567011356354, "logps/chosen": -43.363746643066406, "logps/rejected": -77.2335433959961, "loss": 0.4991, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0020871085580438375, "rewards/margins": 0.0050177304074168205, "rewards/rejected": -0.0029306220822036266, "step": 60 }, { "epoch": 0.034960669247097016, "grad_norm": 0.1572265625, "learning_rate": 1.7412935323383088e-06, "logits/chosen": -0.4507155418395996, "logits/rejected": -0.35845330357551575, "logps/chosen": -42.748069763183594, "logps/rejected": -73.00779724121094, "loss": 0.4982, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.004088289104402065, "rewards/margins": 0.009887892752885818, "rewards/rejected": -0.005799603182822466, "step": 70 }, { "epoch": 0.03995505056811088, "grad_norm": 0.1943359375, "learning_rate": 1.9900497512437813e-06, "logits/chosen": -0.41265735030174255, "logits/rejected": -0.32930153608322144, "logps/chosen": -42.023521423339844, "logps/rejected": -82.02639770507812, "loss": 0.497, "rewards/accuracies": 1.0, "rewards/chosen": 0.008388923481106758, "rewards/margins": 0.015186095610260963, "rewards/rejected": -0.006797172129154205, "step": 80 }, { "epoch": 0.04494943188912474, "grad_norm": 0.181640625, "learning_rate": 2.238805970149254e-06, "logits/chosen": -0.42406344413757324, "logits/rejected": -0.32654517889022827, "logps/chosen": -43.199241638183594, "logps/rejected": -79.2525405883789, "loss": 0.4952, "rewards/accuracies": 1.0, "rewards/chosen": 0.015419301576912403, "rewards/margins": 0.025097712874412537, "rewards/rejected": -0.009678413160145283, "step": 90 }, { "epoch": 0.04994381321013859, "grad_norm": 0.232421875, "learning_rate": 2.4875621890547264e-06, "logits/chosen": -0.4186275601387024, "logits/rejected": -0.31876617670059204, "logps/chosen": -41.526851654052734, "logps/rejected": -73.03739929199219, "loss": 0.4934, "rewards/accuracies": 1.0, "rewards/chosen": 0.020998705178499222, "rewards/margins": 0.03922630846500397, "rewards/rejected": -0.018227603286504745, "step": 100 }, { "epoch": 0.05493819453115245, "grad_norm": 0.1630859375, "learning_rate": 2.736318407960199e-06, "logits/chosen": -0.3820754289627075, "logits/rejected": -0.3049188256263733, "logps/chosen": -40.626625061035156, "logps/rejected": -77.20478057861328, "loss": 0.4909, "rewards/accuracies": 1.0, "rewards/chosen": 0.026516741141676903, "rewards/margins": 0.0446377769112587, "rewards/rejected": -0.018121037632226944, "step": 110 }, { "epoch": 0.05993257585216631, "grad_norm": 0.1455078125, "learning_rate": 2.9850746268656716e-06, "logits/chosen": -0.4118029475212097, "logits/rejected": -0.3379635214805603, "logps/chosen": -40.367244720458984, "logps/rejected": -72.1778564453125, "loss": 0.4878, "rewards/accuracies": 1.0, "rewards/chosen": 0.034462034702301025, "rewards/margins": 0.07104991376399994, "rewards/rejected": -0.036587879061698914, "step": 120 }, { "epoch": 0.06492695717318017, "grad_norm": 0.1416015625, "learning_rate": 3.233830845771145e-06, "logits/chosen": -0.38510891795158386, "logits/rejected": -0.2871672511100769, "logps/chosen": -39.84120559692383, "logps/rejected": -77.8514175415039, "loss": 0.4845, "rewards/accuracies": 1.0, "rewards/chosen": 0.033992547541856766, "rewards/margins": 0.09460695832967758, "rewards/rejected": -0.06061442568898201, "step": 130 }, { "epoch": 0.06992133849419403, "grad_norm": 0.2197265625, "learning_rate": 3.4825870646766175e-06, "logits/chosen": -0.3648582696914673, "logits/rejected": -0.2659669816493988, "logps/chosen": -40.6865234375, "logps/rejected": -84.98823547363281, "loss": 0.4808, "rewards/accuracies": 1.0, "rewards/chosen": 0.04041652753949165, "rewards/margins": 0.11247670650482178, "rewards/rejected": -0.07206018269062042, "step": 140 }, { "epoch": 0.07491571981520789, "grad_norm": 0.173828125, "learning_rate": 3.73134328358209e-06, "logits/chosen": -0.4185262620449066, "logits/rejected": -0.31970107555389404, "logps/chosen": -40.132545471191406, "logps/rejected": -80.09419250488281, "loss": 0.4784, "rewards/accuracies": 1.0, "rewards/chosen": 0.04452138394117355, "rewards/margins": 0.12835349142551422, "rewards/rejected": -0.08383210748434067, "step": 150 }, { "epoch": 0.07991010113622175, "grad_norm": 0.130859375, "learning_rate": 3.980099502487563e-06, "logits/chosen": -0.37403732538223267, "logits/rejected": -0.27564138174057007, "logps/chosen": -39.31542205810547, "logps/rejected": -90.21852111816406, "loss": 0.4747, "rewards/accuracies": 1.0, "rewards/chosen": 0.044510699808597565, "rewards/margins": 0.15536533296108246, "rewards/rejected": -0.11085464060306549, "step": 160 }, { "epoch": 0.08490448245723561, "grad_norm": 0.1572265625, "learning_rate": 4.228855721393035e-06, "logits/chosen": -0.35230112075805664, "logits/rejected": -0.2606234848499298, "logps/chosen": -38.46342086791992, "logps/rejected": -85.07556915283203, "loss": 0.4689, "rewards/accuracies": 1.0, "rewards/chosen": 0.04909727722406387, "rewards/margins": 0.20509441196918488, "rewards/rejected": -0.1559971272945404, "step": 170 }, { "epoch": 0.08989886377824947, "grad_norm": 0.2421875, "learning_rate": 4.477611940298508e-06, "logits/chosen": -0.3421555161476135, "logits/rejected": -0.2503698766231537, "logps/chosen": -39.706153869628906, "logps/rejected": -85.4745101928711, "loss": 0.4643, "rewards/accuracies": 1.0, "rewards/chosen": 0.0517905056476593, "rewards/margins": 0.2295013666152954, "rewards/rejected": -0.1777108609676361, "step": 180 }, { "epoch": 0.09489324509926333, "grad_norm": 0.2197265625, "learning_rate": 4.72636815920398e-06, "logits/chosen": -0.2977878451347351, "logits/rejected": -0.17351695895195007, "logps/chosen": -38.165069580078125, "logps/rejected": -104.08354187011719, "loss": 0.4516, "rewards/accuracies": 1.0, "rewards/chosen": 0.0498540997505188, "rewards/margins": 0.33853739500045776, "rewards/rejected": -0.28868329524993896, "step": 190 }, { "epoch": 0.09988762642027718, "grad_norm": 0.484375, "learning_rate": 4.975124378109453e-06, "logits/chosen": -0.2946663498878479, "logits/rejected": -0.17343321442604065, "logps/chosen": -37.89108657836914, "logps/rejected": -119.83811950683594, "loss": 0.4244, "rewards/accuracies": 1.0, "rewards/chosen": 0.06007291004061699, "rewards/margins": 0.5597599148750305, "rewards/rejected": -0.49968695640563965, "step": 200 }, { "epoch": 0.10488200774129104, "grad_norm": 0.271484375, "learning_rate": 4.999691923599309e-06, "logits/chosen": -0.24224761128425598, "logits/rejected": -0.10646134614944458, "logps/chosen": -38.592735290527344, "logps/rejected": -158.98190307617188, "loss": 0.3761, "rewards/accuracies": 1.0, "rewards/chosen": 0.052448518574237823, "rewards/margins": 0.9627677202224731, "rewards/rejected": -0.9103191494941711, "step": 210 }, { "epoch": 0.1098763890623049, "grad_norm": 0.369140625, "learning_rate": 4.998627065620946e-06, "logits/chosen": -0.20557060837745667, "logits/rejected": -0.019889693707227707, "logps/chosen": -39.04503631591797, "logps/rejected": -309.40240478515625, "loss": 0.3162, "rewards/accuracies": 1.0, "rewards/chosen": 0.05903823301196098, "rewards/margins": 2.2989630699157715, "rewards/rejected": -2.239924907684326, "step": 220 }, { "epoch": 0.11487077038331876, "grad_norm": 0.1435546875, "learning_rate": 4.996801946581365e-06, "logits/chosen": -0.08062759786844254, "logits/rejected": 0.12686052918434143, "logps/chosen": -38.807472229003906, "logps/rejected": -391.716552734375, "loss": 0.2979, "rewards/accuracies": 1.0, "rewards/chosen": 0.03988034278154373, "rewards/margins": 3.2561073303222656, "rewards/rejected": -3.216226577758789, "step": 230 }, { "epoch": 0.11986515170433262, "grad_norm": 0.173828125, "learning_rate": 4.99421712181231e-06, "logits/chosen": -0.09656897932291031, "logits/rejected": 0.20183369517326355, "logps/chosen": -40.211158752441406, "logps/rejected": -563.9237060546875, "loss": 0.2787, "rewards/accuracies": 1.0, "rewards/chosen": 0.03454852104187012, "rewards/margins": 4.958992958068848, "rewards/rejected": -4.924445152282715, "step": 240 }, { "epoch": 0.12485953302534648, "grad_norm": 0.19140625, "learning_rate": 4.990873377802351e-06, "logits/chosen": -0.04213310405611992, "logits/rejected": 0.26729267835617065, "logps/chosen": -38.64299011230469, "logps/rejected": -672.4085693359375, "loss": 0.2738, "rewards/accuracies": 1.0, "rewards/chosen": 0.04478804022073746, "rewards/margins": 6.028790473937988, "rewards/rejected": -5.984002113342285, "step": 250 }, { "epoch": 0.12985391434636034, "grad_norm": 0.1591796875, "learning_rate": 4.986771731957569e-06, "logits/chosen": -0.013924488797783852, "logits/rejected": 0.32576116919517517, "logps/chosen": -38.04896926879883, "logps/rejected": -677.6256713867188, "loss": 0.269, "rewards/accuracies": 1.0, "rewards/chosen": 0.04615269601345062, "rewards/margins": 6.120830535888672, "rewards/rejected": -6.07467794418335, "step": 260 }, { "epoch": 0.1348482956673742, "grad_norm": 0.1240234375, "learning_rate": 4.981913432291989e-06, "logits/chosen": -0.0022221256513148546, "logits/rejected": 0.3353291451931, "logps/chosen": -36.26408004760742, "logps/rejected": -657.3780517578125, "loss": 0.2603, "rewards/accuracies": 1.0, "rewards/chosen": 0.07836371660232544, "rewards/margins": 5.838679790496826, "rewards/rejected": -5.760315895080566, "step": 270 }, { "epoch": 0.13984267698838806, "grad_norm": 0.138671875, "learning_rate": 4.976299957047846e-06, "logits/chosen": -0.008776476606726646, "logits/rejected": 0.35888582468032837, "logps/chosen": -34.863895416259766, "logps/rejected": -787.3309326171875, "loss": 0.2619, "rewards/accuracies": 1.0, "rewards/chosen": 0.0779188945889473, "rewards/margins": 7.239710330963135, "rewards/rejected": -7.1617913246154785, "step": 280 }, { "epoch": 0.1448370583094019, "grad_norm": 0.2373046875, "learning_rate": 4.9699330142458e-06, "logits/chosen": -0.005239410791546106, "logits/rejected": 0.3836653232574463, "logps/chosen": -30.54348373413086, "logps/rejected": -689.7301635742188, "loss": 0.2517, "rewards/accuracies": 1.0, "rewards/chosen": 0.1276303231716156, "rewards/margins": 6.348529815673828, "rewards/rejected": -6.2208991050720215, "step": 290 }, { "epoch": 0.14983143963041579, "grad_norm": 0.15625, "learning_rate": 4.96281454116523e-06, "logits/chosen": -0.018074408173561096, "logits/rejected": 0.34884509444236755, "logps/chosen": -19.059213638305664, "logps/rejected": -759.7054443359375, "loss": 0.2355, "rewards/accuracies": 1.0, "rewards/chosen": 0.24109096825122833, "rewards/margins": 7.104301452636719, "rewards/rejected": -6.863211154937744, "step": 300 }, { "epoch": 0.15482582095142963, "grad_norm": 0.14453125, "learning_rate": 4.954946703754777e-06, "logits/chosen": -0.022661946713924408, "logits/rejected": 0.36587223410606384, "logps/chosen": -14.688285827636719, "logps/rejected": -652.6716918945312, "loss": 0.2303, "rewards/accuracies": 1.0, "rewards/chosen": 0.2812207341194153, "rewards/margins": 6.1413116455078125, "rewards/rejected": -5.860090255737305, "step": 310 }, { "epoch": 0.1598202022724435, "grad_norm": 0.06591796875, "learning_rate": 4.946331895973308e-06, "logits/chosen": 0.027700275182724, "logits/rejected": 0.4779927134513855, "logps/chosen": -13.264841079711914, "logps/rejected": -853.7429809570312, "loss": 0.2306, "rewards/accuracies": 1.0, "rewards/chosen": 0.29832005500793457, "rewards/margins": 7.9851837158203125, "rewards/rejected": -7.686862945556641, "step": 320 }, { "epoch": 0.16481458359345735, "grad_norm": 0.1298828125, "learning_rate": 4.936972739061503e-06, "logits/chosen": 0.028876056894659996, "logits/rejected": 0.4520367980003357, "logps/chosen": -14.747647285461426, "logps/rejected": -819.2418212890625, "loss": 0.2308, "rewards/accuracies": 1.0, "rewards/chosen": 0.29899871349334717, "rewards/margins": 7.518294334411621, "rewards/rejected": -7.219296455383301, "step": 330 }, { "epoch": 0.16980896491447123, "grad_norm": 0.2353515625, "learning_rate": 4.926872080744284e-06, "logits/chosen": 0.09099732339382172, "logits/rejected": 0.6329769492149353, "logps/chosen": -14.6112699508667, "logps/rejected": -978.19482421875, "loss": 0.2205, "rewards/accuracies": 1.0, "rewards/chosen": 0.290539413690567, "rewards/margins": 9.353235244750977, "rewards/rejected": -9.06269645690918, "step": 340 }, { "epoch": 0.17480334623548507, "grad_norm": 0.0673828125, "learning_rate": 4.9160329943643335e-06, "logits/chosen": 0.10238673537969589, "logits/rejected": 0.6276119947433472, "logps/chosen": -13.135534286499023, "logps/rejected": -862.1193237304688, "loss": 0.2233, "rewards/accuracies": 1.0, "rewards/chosen": 0.2938804030418396, "rewards/margins": 8.220571517944336, "rewards/rejected": -7.92669153213501, "step": 350 }, { "epoch": 0.17979772755649895, "grad_norm": 0.064453125, "learning_rate": 4.904458777946967e-06, "logits/chosen": 0.023329418152570724, "logits/rejected": 0.6091981530189514, "logps/chosen": -13.777295112609863, "logps/rejected": -1096.3214111328125, "loss": 0.221, "rewards/accuracies": 1.0, "rewards/chosen": 0.2997492849826813, "rewards/margins": 10.409059524536133, "rewards/rejected": -10.1093111038208, "step": 360 }, { "epoch": 0.1847921088775128, "grad_norm": 0.1201171875, "learning_rate": 4.892152953196633e-06, "logits/chosen": 0.029097210615873337, "logits/rejected": 0.650887131690979, "logps/chosen": -13.820713996887207, "logps/rejected": -1171.0504150390625, "loss": 0.2235, "rewards/accuracies": 1.0, "rewards/chosen": 0.2969459295272827, "rewards/margins": 11.168619155883789, "rewards/rejected": -10.871672630310059, "step": 370 }, { "epoch": 0.18978649019852667, "grad_norm": 0.06640625, "learning_rate": 4.879119264425366e-06, "logits/chosen": 0.11170516163110733, "logits/rejected": 0.7552271485328674, "logps/chosen": -13.031651496887207, "logps/rejected": -990.927734375, "loss": 0.2191, "rewards/accuracies": 1.0, "rewards/chosen": 0.30227065086364746, "rewards/margins": 9.518648147583008, "rewards/rejected": -9.216377258300781, "step": 380 }, { "epoch": 0.19478087151954052, "grad_norm": 0.0272216796875, "learning_rate": 4.865361677413489e-06, "logits/chosen": 0.10295484960079193, "logits/rejected": 0.6912266612052917, "logps/chosen": -14.148368835449219, "logps/rejected": -973.99365234375, "loss": 0.2236, "rewards/accuracies": 1.0, "rewards/chosen": 0.30019253492355347, "rewards/margins": 9.242959022521973, "rewards/rejected": -8.942765235900879, "step": 390 }, { "epoch": 0.19977525284055436, "grad_norm": 0.048583984375, "learning_rate": 4.850884378202947e-06, "logits/chosen": 0.12218449264764786, "logits/rejected": 0.7848892211914062, "logps/chosen": -13.857281684875488, "logps/rejected": -1093.4356689453125, "loss": 0.2224, "rewards/accuracies": 1.0, "rewards/chosen": 0.30381980538368225, "rewards/margins": 10.443190574645996, "rewards/rejected": -10.139370918273926, "step": 400 }, { "epoch": 0.20476963416156824, "grad_norm": 0.10888671875, "learning_rate": 4.8356917718236125e-06, "logits/chosen": 0.16129298508167267, "logits/rejected": 0.83033287525177, "logps/chosen": -13.215472221374512, "logps/rejected": -1056.891357421875, "loss": 0.2254, "rewards/accuracies": 1.0, "rewards/chosen": 0.30162861943244934, "rewards/margins": 10.163104057312012, "rewards/rejected": -9.861475944519043, "step": 410 }, { "epoch": 0.20976401548258208, "grad_norm": 0.05712890625, "learning_rate": 4.8197884809529575e-06, "logits/chosen": 0.18466398119926453, "logits/rejected": 0.8971255421638489, "logps/chosen": -14.178210258483887, "logps/rejected": -1095.867431640625, "loss": 0.2192, "rewards/accuracies": 1.0, "rewards/chosen": 0.29163575172424316, "rewards/margins": 10.56762409210205, "rewards/rejected": -10.275988578796387, "step": 420 }, { "epoch": 0.21475839680359596, "grad_norm": 0.078125, "learning_rate": 4.803179344509505e-06, "logits/chosen": 0.17180819809436798, "logits/rejected": 0.9859398603439331, "logps/chosen": -14.1710844039917, "logps/rejected": -1132.571044921875, "loss": 0.2206, "rewards/accuracies": 1.0, "rewards/chosen": 0.2903401851654053, "rewards/margins": 10.91908073425293, "rewards/rejected": -10.628740310668945, "step": 430 }, { "epoch": 0.2197527781246098, "grad_norm": 0.0341796875, "learning_rate": 4.785869416180489e-06, "logits/chosen": 0.18128976225852966, "logits/rejected": 0.9951160550117493, "logps/chosen": -13.373617172241211, "logps/rejected": -1239.9248046875, "loss": 0.2185, "rewards/accuracies": 1.0, "rewards/chosen": 0.3014771342277527, "rewards/margins": 11.977083206176758, "rewards/rejected": -11.675604820251465, "step": 440 }, { "epoch": 0.22474715944562368, "grad_norm": 0.035400390625, "learning_rate": 4.767863962884156e-06, "logits/chosen": 0.19665592908859253, "logits/rejected": 1.0053622722625732, "logps/chosen": -12.768911361694336, "logps/rejected": -1196.0384521484375, "loss": 0.217, "rewards/accuracies": 1.0, "rewards/chosen": 0.30236831307411194, "rewards/margins": 11.53145694732666, "rewards/rejected": -11.229089736938477, "step": 450 }, { "epoch": 0.22974154076663753, "grad_norm": 0.06982421875, "learning_rate": 4.74916846316719e-06, "logits/chosen": 0.2026137411594391, "logits/rejected": 0.9967263340950012, "logps/chosen": -14.13359546661377, "logps/rejected": -1080.839111328125, "loss": 0.2196, "rewards/accuracies": 1.0, "rewards/chosen": 0.29357805848121643, "rewards/margins": 10.319085121154785, "rewards/rejected": -10.025506973266602, "step": 460 }, { "epoch": 0.2347359220876514, "grad_norm": 0.047607421875, "learning_rate": 4.7297886055377525e-06, "logits/chosen": 0.22456324100494385, "logits/rejected": 0.9802171587944031, "logps/chosen": -13.012743949890137, "logps/rejected": -1017.5540161132812, "loss": 0.2208, "rewards/accuracies": 1.0, "rewards/chosen": 0.29690033197402954, "rewards/margins": 9.80536937713623, "rewards/rejected": -9.508468627929688, "step": 470 }, { "epoch": 0.23973030340866525, "grad_norm": 0.02734375, "learning_rate": 4.709730286734631e-06, "logits/chosen": 0.2183937281370163, "logits/rejected": 1.0708736181259155, "logps/chosen": -12.663009643554688, "logps/rejected": -1249.983642578125, "loss": 0.2169, "rewards/accuracies": 1.0, "rewards/chosen": 0.3058857023715973, "rewards/margins": 12.078222274780273, "rewards/rejected": -11.772336959838867, "step": 480 }, { "epoch": 0.24472468472967912, "grad_norm": 0.055908203125, "learning_rate": 4.688999609933023e-06, "logits/chosen": 0.22988371551036835, "logits/rejected": 1.0844902992248535, "logps/chosen": -12.956899642944336, "logps/rejected": -1156.6395263671875, "loss": 0.2176, "rewards/accuracies": 1.0, "rewards/chosen": 0.30138763785362244, "rewards/margins": 11.153468132019043, "rewards/rejected": -10.852079391479492, "step": 490 }, { "epoch": 0.24971906605069297, "grad_norm": 0.049072265625, "learning_rate": 4.6676028828875195e-06, "logits/chosen": 0.19053277373313904, "logits/rejected": 1.1232895851135254, "logps/chosen": -13.526689529418945, "logps/rejected": -1328.9798583984375, "loss": 0.2173, "rewards/accuracies": 1.0, "rewards/chosen": 0.2967537045478821, "rewards/margins": 12.862408638000488, "rewards/rejected": -12.565653800964355, "step": 500 }, { "epoch": 0.2547134473717068, "grad_norm": 0.00811767578125, "learning_rate": 4.645546616012835e-06, "logits/chosen": 0.19936171174049377, "logits/rejected": 1.1598930358886719, "logps/chosen": -13.963285446166992, "logps/rejected": -1301.6494140625, "loss": 0.217, "rewards/accuracies": 1.0, "rewards/chosen": 0.29725441336631775, "rewards/margins": 12.456524848937988, "rewards/rejected": -12.159271240234375, "step": 510 }, { "epoch": 0.2597078286927207, "grad_norm": 0.022216796875, "learning_rate": 4.622837520402869e-06, "logits/chosen": 0.2132669985294342, "logits/rejected": 1.1716349124908447, "logps/chosen": -13.427679061889648, "logps/rejected": -1347.979248046875, "loss": 0.216, "rewards/accuracies": 1.0, "rewards/chosen": 0.30607202649116516, "rewards/margins": 13.056879043579102, "rewards/rejected": -12.75080680847168, "step": 520 }, { "epoch": 0.26470221001373456, "grad_norm": 0.0654296875, "learning_rate": 4.599482505788715e-06, "logits/chosen": 0.1745399534702301, "logits/rejected": 1.1154874563217163, "logps/chosen": -13.649249076843262, "logps/rejected": -1377.6204833984375, "loss": 0.2163, "rewards/accuracies": 1.0, "rewards/chosen": 0.2969611883163452, "rewards/margins": 13.256256103515625, "rewards/rejected": -12.959295272827148, "step": 530 }, { "epoch": 0.2696965913347484, "grad_norm": 0.040771484375, "learning_rate": 4.575488678436228e-06, "logits/chosen": 0.20975852012634277, "logits/rejected": 1.2858575582504272, "logps/chosen": -13.548286437988281, "logps/rejected": -1526.4365234375, "loss": 0.2161, "rewards/accuracies": 1.0, "rewards/chosen": 0.30410271883010864, "rewards/margins": 14.82691764831543, "rewards/rejected": -14.522814750671387, "step": 540 }, { "epoch": 0.27469097265576226, "grad_norm": 0.0235595703125, "learning_rate": 4.550863338983784e-06, "logits/chosen": 0.23238572478294373, "logits/rejected": 1.2929937839508057, "logps/chosen": -12.818222045898438, "logps/rejected": -1398.010009765625, "loss": 0.2146, "rewards/accuracies": 1.0, "rewards/chosen": 0.3006662130355835, "rewards/margins": 13.579294204711914, "rewards/rejected": -13.2786283493042, "step": 550 }, { "epoch": 0.27968535397677613, "grad_norm": 0.045654296875, "learning_rate": 4.525613980220909e-06, "logits/chosen": 0.21401552855968475, "logits/rejected": 1.2280786037445068, "logps/chosen": -13.640890121459961, "logps/rejected": -1425.2213134765625, "loss": 0.2165, "rewards/accuracies": 1.0, "rewards/chosen": 0.30365556478500366, "rewards/margins": 13.8203706741333, "rewards/rejected": -13.516714096069336, "step": 560 }, { "epoch": 0.28467973529779, "grad_norm": 0.031494140625, "learning_rate": 4.499748284808433e-06, "logits/chosen": 0.2350139617919922, "logits/rejected": 1.237275242805481, "logps/chosen": -13.027705192565918, "logps/rejected": -1249.4954833984375, "loss": 0.2147, "rewards/accuracies": 1.0, "rewards/chosen": 0.3124990463256836, "rewards/margins": 11.985966682434082, "rewards/rejected": -11.673466682434082, "step": 570 }, { "epoch": 0.2896741166188038, "grad_norm": 0.037109375, "learning_rate": 4.473274122940879e-06, "logits/chosen": 0.26041245460510254, "logits/rejected": 1.2588506937026978, "logps/chosen": -12.92595100402832, "logps/rejected": -1345.543212890625, "loss": 0.2145, "rewards/accuracies": 1.0, "rewards/chosen": 0.3006027638912201, "rewards/margins": 13.026535034179688, "rewards/rejected": -12.725933074951172, "step": 580 }, { "epoch": 0.2946684979398177, "grad_norm": 0.023193359375, "learning_rate": 4.446199549951782e-06, "logits/chosen": 0.2726953327655792, "logits/rejected": 1.2741527557373047, "logps/chosen": -13.607648849487305, "logps/rejected": -1350.020263671875, "loss": 0.2164, "rewards/accuracies": 1.0, "rewards/chosen": 0.2966735064983368, "rewards/margins": 13.099435806274414, "rewards/rejected": -12.802760124206543, "step": 590 }, { "epoch": 0.29966287926083157, "grad_norm": 0.060302734375, "learning_rate": 4.418532803862684e-06, "logits/chosen": 0.24927139282226562, "logits/rejected": 1.2164738178253174, "logps/chosen": -13.667009353637695, "logps/rejected": -1226.8724365234375, "loss": 0.2161, "rewards/accuracies": 1.0, "rewards/chosen": 0.3081030249595642, "rewards/margins": 11.710563659667969, "rewards/rejected": -11.402461051940918, "step": 600 }, { "epoch": 0.30465726058184545, "grad_norm": 0.01220703125, "learning_rate": 4.39028230287654e-06, "logits/chosen": 0.24915924668312073, "logits/rejected": 1.2275068759918213, "logps/chosen": -14.152711868286133, "logps/rejected": -1383.346923828125, "loss": 0.2153, "rewards/accuracies": 1.0, "rewards/chosen": 0.301180899143219, "rewards/margins": 13.357465744018555, "rewards/rejected": -13.05628490447998, "step": 610 }, { "epoch": 0.30965164190285926, "grad_norm": 0.056884765625, "learning_rate": 4.361456642816292e-06, "logits/chosen": 0.18370430171489716, "logits/rejected": 1.187785267829895, "logps/chosen": -14.063751220703125, "logps/rejected": -1465.3687744140625, "loss": 0.2156, "rewards/accuracies": 1.0, "rewards/chosen": 0.3011600375175476, "rewards/margins": 14.13781452178955, "rewards/rejected": -13.836652755737305, "step": 620 }, { "epoch": 0.31464602322387314, "grad_norm": 0.036376953125, "learning_rate": 4.332064594509413e-06, "logits/chosen": 0.19446897506713867, "logits/rejected": 1.427197813987732, "logps/chosen": -14.258028030395508, "logps/rejected": -1825.0166015625, "loss": 0.2145, "rewards/accuracies": 1.0, "rewards/chosen": 0.2957887053489685, "rewards/margins": 17.80067253112793, "rewards/rejected": -17.504884719848633, "step": 630 }, { "epoch": 0.319640404544887, "grad_norm": 0.025634765625, "learning_rate": 4.302115101119186e-06, "logits/chosen": 0.19377607107162476, "logits/rejected": 1.1977471113204956, "logps/chosen": -13.28663158416748, "logps/rejected": -1551.3272705078125, "loss": 0.2146, "rewards/accuracies": 1.0, "rewards/chosen": 0.2949761748313904, "rewards/margins": 14.840034484863281, "rewards/rejected": -14.545059204101562, "step": 640 }, { "epoch": 0.3246347858659009, "grad_norm": 0.028076171875, "learning_rate": 4.271617275423564e-06, "logits/chosen": 0.18471740186214447, "logits/rejected": 1.2049682140350342, "logps/chosen": -14.22096061706543, "logps/rejected": -1509.560791015625, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 0.2953301966190338, "rewards/margins": 14.56297779083252, "rewards/rejected": -14.267648696899414, "step": 650 }, { "epoch": 0.3296291671869147, "grad_norm": 0.031494140625, "learning_rate": 4.2405803970423995e-06, "logits/chosen": 0.21741405129432678, "logits/rejected": 1.3314052820205688, "logps/chosen": -13.835968017578125, "logps/rejected": -1617.0235595703125, "loss": 0.2147, "rewards/accuracies": 1.0, "rewards/chosen": 0.29379233717918396, "rewards/margins": 15.649670600891113, "rewards/rejected": -15.355878829956055, "step": 660 }, { "epoch": 0.3346235485079286, "grad_norm": 0.11572265625, "learning_rate": 4.2090139096139306e-06, "logits/chosen": 0.16212065517902374, "logits/rejected": 1.2855770587921143, "logps/chosen": -14.017046928405762, "logps/rejected": -1740.139404296875, "loss": 0.2134, "rewards/accuracies": 1.0, "rewards/chosen": 0.3004634976387024, "rewards/margins": 16.887771606445312, "rewards/rejected": -16.58730697631836, "step": 670 }, { "epoch": 0.33961792982894246, "grad_norm": 0.043701171875, "learning_rate": 4.176927417921343e-06, "logits/chosen": 0.326777845621109, "logits/rejected": 1.3592358827590942, "logps/chosen": -13.120327949523926, "logps/rejected": -1251.674560546875, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 0.30190396308898926, "rewards/margins": 12.16067123413086, "rewards/rejected": -11.858766555786133, "step": 680 }, { "epoch": 0.3446123111499563, "grad_norm": 0.03173828125, "learning_rate": 4.144330684970314e-06, "logits/chosen": 0.22485598921775818, "logits/rejected": 1.238599181175232, "logps/chosen": -14.03515625, "logps/rejected": -1432.62060546875, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 0.30531880259513855, "rewards/margins": 13.757417678833008, "rewards/rejected": -13.45209789276123, "step": 690 }, { "epoch": 0.34960669247097015, "grad_norm": 0.064453125, "learning_rate": 4.111233629018404e-06, "logits/chosen": 0.2409452497959137, "logits/rejected": 1.3199043273925781, "logps/chosen": -13.525833129882812, "logps/rejected": -1432.090087890625, "loss": 0.2146, "rewards/accuracies": 1.0, "rewards/chosen": 0.29940056800842285, "rewards/margins": 13.923855781555176, "rewards/rejected": -13.624455451965332, "step": 700 }, { "epoch": 0.354601073791984, "grad_norm": 0.044677734375, "learning_rate": 4.077646320557215e-06, "logits/chosen": 0.25844550132751465, "logits/rejected": 1.4347895383834839, "logps/chosen": -13.414407730102539, "logps/rejected": -1587.720947265625, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 0.29702991247177124, "rewards/margins": 15.390164375305176, "rewards/rejected": -15.093134880065918, "step": 710 }, { "epoch": 0.3595954551129979, "grad_norm": 0.017822265625, "learning_rate": 4.043578979248228e-06, "logits/chosen": 0.24548295140266418, "logits/rejected": 1.3877404928207397, "logps/chosen": -12.786032676696777, "logps/rejected": -1540.37939453125, "loss": 0.2147, "rewards/accuracies": 1.0, "rewards/chosen": 0.3044242262840271, "rewards/margins": 14.986343383789062, "rewards/rejected": -14.681918144226074, "step": 720 }, { "epoch": 0.3645898364340117, "grad_norm": 0.0203857421875, "learning_rate": 4.009041970813247e-06, "logits/chosen": 0.2618701457977295, "logits/rejected": 1.432408332824707, "logps/chosen": -12.812631607055664, "logps/rejected": -1661.7718505859375, "loss": 0.2146, "rewards/accuracies": 1.0, "rewards/chosen": 0.29122787714004517, "rewards/margins": 16.198129653930664, "rewards/rejected": -15.906901359558105, "step": 730 }, { "epoch": 0.3695842177550256, "grad_norm": 0.030029296875, "learning_rate": 3.9740458038804075e-06, "logits/chosen": 0.25733712315559387, "logits/rejected": 1.3133214712142944, "logps/chosen": -14.166203498840332, "logps/rejected": -1473.091552734375, "loss": 0.2133, "rewards/accuracies": 1.0, "rewards/chosen": 0.3017811179161072, "rewards/margins": 14.06616497039795, "rewards/rejected": -13.764383316040039, "step": 740 }, { "epoch": 0.37457859907603946, "grad_norm": 0.02880859375, "learning_rate": 3.938601126786702e-06, "logits/chosen": 0.28963789343833923, "logits/rejected": 1.4084501266479492, "logps/chosen": -12.976341247558594, "logps/rejected": -1537.215087890625, "loss": 0.2137, "rewards/accuracies": 1.0, "rewards/chosen": 0.2965225577354431, "rewards/margins": 14.963714599609375, "rewards/rejected": -14.667192459106445, "step": 750 }, { "epoch": 0.37957298039705334, "grad_norm": 0.0306396484375, "learning_rate": 3.902718724337993e-06, "logits/chosen": 0.22370409965515137, "logits/rejected": 1.3502318859100342, "logps/chosen": -13.021102905273438, "logps/rejected": -1571.427978515625, "loss": 0.2141, "rewards/accuracies": 1.0, "rewards/chosen": 0.3001527190208435, "rewards/margins": 15.25025463104248, "rewards/rejected": -14.950100898742676, "step": 760 }, { "epoch": 0.38456736171806716, "grad_norm": 0.014404296875, "learning_rate": 3.8664095145274995e-06, "logits/chosen": 0.26876306533813477, "logits/rejected": 1.432448387145996, "logps/chosen": -13.371549606323242, "logps/rejected": -1552.1263427734375, "loss": 0.2136, "rewards/accuracies": 1.0, "rewards/chosen": 0.30356094241142273, "rewards/margins": 15.13947582244873, "rewards/rejected": -14.835916519165039, "step": 770 }, { "epoch": 0.38956174303908103, "grad_norm": 0.034912109375, "learning_rate": 3.829684545213768e-06, "logits/chosen": 0.23094145953655243, "logits/rejected": 1.379480242729187, "logps/chosen": -13.367365837097168, "logps/rejected": -1540.384765625, "loss": 0.2137, "rewards/accuracies": 1.0, "rewards/chosen": 0.30820637941360474, "rewards/margins": 14.93891716003418, "rewards/rejected": -14.630711555480957, "step": 780 }, { "epoch": 0.3945561243600949, "grad_norm": 0.03857421875, "learning_rate": 3.7925549907591252e-06, "logits/chosen": 0.17974331974983215, "logits/rejected": 1.3995566368103027, "logps/chosen": -13.248870849609375, "logps/rejected": -1851.462646484375, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 0.3005455434322357, "rewards/margins": 18.06133460998535, "rewards/rejected": -17.760787963867188, "step": 790 }, { "epoch": 0.3995505056811087, "grad_norm": 0.0272216796875, "learning_rate": 3.7550321486296303e-06, "logits/chosen": 0.1997009515762329, "logits/rejected": 1.2776936292648315, "logps/chosen": -13.081275939941406, "logps/rejected": -1519.75537109375, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 0.29953616857528687, "rewards/margins": 14.774045944213867, "rewards/rejected": -14.474508285522461, "step": 800 }, { "epoch": 0.4045448870021226, "grad_norm": 0.0252685546875, "learning_rate": 3.717127435957583e-06, "logits/chosen": 0.22182372212409973, "logits/rejected": 1.330664873123169, "logps/chosen": -13.001907348632812, "logps/rejected": -1577.052978515625, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 0.30547088384628296, "rewards/margins": 15.365854263305664, "rewards/rejected": -15.060381889343262, "step": 810 }, { "epoch": 0.4095392683231365, "grad_norm": 0.011474609375, "learning_rate": 3.6788523860676156e-06, "logits/chosen": 0.23856505751609802, "logits/rejected": 1.3909879922866821, "logps/chosen": -13.222851753234863, "logps/rejected": -1567.79931640625, "loss": 0.2142, "rewards/accuracies": 1.0, "rewards/chosen": 0.3013755977153778, "rewards/margins": 15.270452499389648, "rewards/rejected": -14.969076156616211, "step": 820 }, { "epoch": 0.41453364964415035, "grad_norm": 0.0267333984375, "learning_rate": 3.640218644967429e-06, "logits/chosen": 0.2593843638896942, "logits/rejected": 1.4300034046173096, "logps/chosen": -12.888224601745605, "logps/rejected": -1659.7015380859375, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 0.295782208442688, "rewards/margins": 16.122600555419922, "rewards/rejected": -15.826817512512207, "step": 830 }, { "epoch": 0.41952803096516417, "grad_norm": 0.0194091796875, "learning_rate": 3.601237967804245e-06, "logits/chosen": 0.264489084482193, "logits/rejected": 1.4015864133834839, "logps/chosen": -12.973742485046387, "logps/rejected": -1561.237548828125, "loss": 0.2142, "rewards/accuracies": 1.0, "rewards/chosen": 0.2937301695346832, "rewards/margins": 15.21537971496582, "rewards/rejected": -14.921648979187012, "step": 840 }, { "epoch": 0.42452241228617804, "grad_norm": 0.034912109375, "learning_rate": 3.5619222152880488e-06, "logits/chosen": 0.26485809683799744, "logits/rejected": 1.4641757011413574, "logps/chosen": -12.745534896850586, "logps/rejected": -1704.2015380859375, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 0.29673147201538086, "rewards/margins": 16.65966033935547, "rewards/rejected": -16.362926483154297, "step": 850 }, { "epoch": 0.4295167936071919, "grad_norm": 0.03662109375, "learning_rate": 3.522283350082713e-06, "logits/chosen": 0.27674156427383423, "logits/rejected": 1.3279974460601807, "logps/chosen": -13.220677375793457, "logps/rejected": -1268.2398681640625, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 0.30168816447257996, "rewards/margins": 12.249058723449707, "rewards/rejected": -11.947370529174805, "step": 860 }, { "epoch": 0.4345111749282058, "grad_norm": 0.0303955078125, "learning_rate": 3.482333433166101e-06, "logits/chosen": 0.2209288775920868, "logits/rejected": 1.239816427230835, "logps/chosen": -13.64061450958252, "logps/rejected": -1329.226806640625, "loss": 0.2133, "rewards/accuracies": 1.0, "rewards/chosen": 0.32054653763771057, "rewards/margins": 12.724926948547363, "rewards/rejected": -12.404378890991211, "step": 870 }, { "epoch": 0.4395055562492196, "grad_norm": 0.017578125, "learning_rate": 3.442084620160255e-06, "logits/chosen": 0.2859000563621521, "logits/rejected": 1.3617407083511353, "logps/chosen": -13.033666610717773, "logps/rejected": -1480.4652099609375, "loss": 0.2148, "rewards/accuracies": 1.0, "rewards/chosen": 0.3004864454269409, "rewards/margins": 14.408884048461914, "rewards/rejected": -14.1083984375, "step": 880 }, { "epoch": 0.4444999375702335, "grad_norm": 0.033203125, "learning_rate": 3.4015491576327813e-06, "logits/chosen": 0.2019362449645996, "logits/rejected": 1.4212459325790405, "logps/chosen": -13.03289794921875, "logps/rejected": -1749.9088134765625, "loss": 0.2134, "rewards/accuracies": 1.0, "rewards/chosen": 0.2995554804801941, "rewards/margins": 17.093223571777344, "rewards/rejected": -16.793670654296875, "step": 890 }, { "epoch": 0.44949431889124736, "grad_norm": 0.043701171875, "learning_rate": 3.3607393793705774e-06, "logits/chosen": 0.18301896750926971, "logits/rejected": 1.3750990629196167, "logps/chosen": -13.328268051147461, "logps/rejected": -1911.580322265625, "loss": 0.2131, "rewards/accuracies": 1.0, "rewards/chosen": 0.3058582544326782, "rewards/margins": 18.632659912109375, "rewards/rejected": -18.32680320739746, "step": 900 }, { "epoch": 0.45448870021226123, "grad_norm": 0.026611328125, "learning_rate": 3.319667702627004e-06, "logits/chosen": 0.251764714717865, "logits/rejected": 1.379320502281189, "logps/chosen": -13.333532333374023, "logps/rejected": -1562.4708251953125, "loss": 0.2145, "rewards/accuracies": 1.0, "rewards/chosen": 0.29535120725631714, "rewards/margins": 15.228157043457031, "rewards/rejected": -14.932805061340332, "step": 910 }, { "epoch": 0.45948308153327505, "grad_norm": 0.0301513671875, "learning_rate": 3.2783466243436728e-06, "logits/chosen": 0.2565325200557709, "logits/rejected": 1.3139139413833618, "logps/chosen": -12.679740905761719, "logps/rejected": -1520.677490234375, "loss": 0.2148, "rewards/accuracies": 1.0, "rewards/chosen": 0.29702773690223694, "rewards/margins": 14.843530654907227, "rewards/rejected": -14.546501159667969, "step": 920 }, { "epoch": 0.4644774628542889, "grad_norm": 0.032958984375, "learning_rate": 3.23678871734798e-06, "logits/chosen": 0.25534436106681824, "logits/rejected": 1.4121118783950806, "logps/chosen": -13.0289945602417, "logps/rejected": -1636.283447265625, "loss": 0.2134, "rewards/accuracies": 1.0, "rewards/chosen": 0.2977290153503418, "rewards/margins": 15.95555591583252, "rewards/rejected": -15.657827377319336, "step": 930 }, { "epoch": 0.4694718441753028, "grad_norm": 0.017578125, "learning_rate": 3.1950066265275563e-06, "logits/chosen": 0.22841012477874756, "logits/rejected": 1.4126774072647095, "logps/chosen": -13.12025260925293, "logps/rejected": -1663.779541015625, "loss": 0.2137, "rewards/accuracies": 1.0, "rewards/chosen": 0.2916621267795563, "rewards/margins": 16.221864700317383, "rewards/rejected": -15.930200576782227, "step": 940 }, { "epoch": 0.4744662254963166, "grad_norm": 0.0255126953125, "learning_rate": 3.1530130649827866e-06, "logits/chosen": 0.22560763359069824, "logits/rejected": 1.3270902633666992, "logps/chosen": -12.889676094055176, "logps/rejected": -1550.6907958984375, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 0.2999538779258728, "rewards/margins": 15.000715255737305, "rewards/rejected": -14.700759887695312, "step": 950 }, { "epoch": 0.4794606068173305, "grad_norm": 0.033203125, "learning_rate": 3.1108208101585737e-06, "logits/chosen": 0.2439723014831543, "logits/rejected": 1.362210988998413, "logps/chosen": -13.089398384094238, "logps/rejected": -1693.647216796875, "loss": 0.2136, "rewards/accuracies": 1.0, "rewards/chosen": 0.29442083835601807, "rewards/margins": 16.52579116821289, "rewards/rejected": -16.231369018554688, "step": 960 }, { "epoch": 0.48445498813834437, "grad_norm": 0.062255859375, "learning_rate": 3.068442699956526e-06, "logits/chosen": 0.2077961266040802, "logits/rejected": 1.3753139972686768, "logps/chosen": -14.569076538085938, "logps/rejected": -1670.797119140625, "loss": 0.2127, "rewards/accuracies": 1.0, "rewards/chosen": 0.30582067370414734, "rewards/margins": 16.229455947875977, "rewards/rejected": -15.92363452911377, "step": 970 }, { "epoch": 0.48944936945935824, "grad_norm": 0.05224609375, "learning_rate": 3.025891628828754e-06, "logits/chosen": 0.1842622458934784, "logits/rejected": 1.2995259761810303, "logps/chosen": -14.32885456085205, "logps/rejected": -1653.052001953125, "loss": 0.2126, "rewards/accuracies": 1.0, "rewards/chosen": 0.3057419955730438, "rewards/margins": 16.055688858032227, "rewards/rejected": -15.749944686889648, "step": 980 }, { "epoch": 0.49444375078037206, "grad_norm": 0.0269775390625, "learning_rate": 2.983180543854449e-06, "logits/chosen": 0.19390757381916046, "logits/rejected": 1.3017512559890747, "logps/chosen": -13.34800910949707, "logps/rejected": -1712.0159912109375, "loss": 0.2135, "rewards/accuracies": 1.0, "rewards/chosen": 0.30465009808540344, "rewards/margins": 16.635692596435547, "rewards/rejected": -16.331043243408203, "step": 990 }, { "epoch": 0.49943813210138593, "grad_norm": 0.0223388671875, "learning_rate": 2.9403224408004607e-06, "logits/chosen": 0.23906031250953674, "logits/rejected": 1.400268316268921, "logps/chosen": -13.12585735321045, "logps/rejected": -1704.7308349609375, "loss": 0.2129, "rewards/accuracies": 1.0, "rewards/chosen": 0.3049536347389221, "rewards/margins": 16.624217987060547, "rewards/rejected": -16.319265365600586, "step": 1000 }, { "epoch": 0.49943813210138593, "eval_logits/chosen": 0.20361633598804474, "eval_logits/rejected": 1.1080797910690308, "eval_logps/chosen": -12.131524085998535, "eval_logps/rejected": -998.1762084960938, "eval_loss": 0.21242494881153107, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 0.32524025440216064, "eval_rewards/margins": 9.676619529724121, "eval_rewards/rejected": -9.351378440856934, "eval_runtime": 0.4258, "eval_samples_per_second": 11.742, "eval_steps_per_second": 7.045, "step": 1000 }, { "epoch": 0.5044325134223998, "grad_norm": 0.0205078125, "learning_rate": 2.8973303601670537e-06, "logits/chosen": 0.23553326725959778, "logits/rejected": 1.356740117073059, "logps/chosen": -13.091280937194824, "logps/rejected": -1667.8570556640625, "loss": 0.2138, "rewards/accuracies": 1.0, "rewards/chosen": 0.3002287745475769, "rewards/margins": 16.279354095458984, "rewards/rejected": -15.979124069213867, "step": 1010 }, { "epoch": 0.5094268947434136, "grad_norm": 0.04833984375, "learning_rate": 2.8542173832200547e-06, "logits/chosen": 0.1925448775291443, "logits/rejected": 1.3025437593460083, "logps/chosen": -14.501489639282227, "logps/rejected": -1580.2572021484375, "loss": 0.2144, "rewards/accuracies": 1.0, "rewards/chosen": 0.2990257143974304, "rewards/margins": 15.303033828735352, "rewards/rejected": -15.004008293151855, "step": 1020 }, { "epoch": 0.5144212760644276, "grad_norm": 0.02587890625, "learning_rate": 2.810996628010594e-06, "logits/chosen": 0.2747485637664795, "logits/rejected": 1.341671109199524, "logps/chosen": -13.159135818481445, "logps/rejected": -1436.7039794921875, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 0.30046582221984863, "rewards/margins": 14.00048542022705, "rewards/rejected": -13.700021743774414, "step": 1030 }, { "epoch": 0.5194156573854414, "grad_norm": 0.0380859375, "learning_rate": 2.7676812453836617e-06, "logits/chosen": 0.2172623872756958, "logits/rejected": 1.389795184135437, "logps/chosen": -13.409383773803711, "logps/rejected": -1729.2418212890625, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 0.29702064394950867, "rewards/margins": 16.85504722595215, "rewards/rejected": -16.558027267456055, "step": 1040 }, { "epoch": 0.5244100387064552, "grad_norm": 0.059814453125, "learning_rate": 2.724284414976672e-06, "logits/chosen": 0.194356769323349, "logits/rejected": 1.3346575498580933, "logps/chosen": -13.100080490112305, "logps/rejected": -1775.0599365234375, "loss": 0.2141, "rewards/accuracies": 1.0, "rewards/chosen": 0.3071358799934387, "rewards/margins": 17.33095359802246, "rewards/rejected": -17.023818969726562, "step": 1050 }, { "epoch": 0.5294044200274691, "grad_norm": 0.0322265625, "learning_rate": 2.6808193412092823e-06, "logits/chosen": 0.27043357491493225, "logits/rejected": 1.2958372831344604, "logps/chosen": -13.240577697753906, "logps/rejected": -1308.380615234375, "loss": 0.212, "rewards/accuracies": 1.0, "rewards/chosen": 0.3128414452075958, "rewards/margins": 12.567632675170898, "rewards/rejected": -12.254792213439941, "step": 1060 }, { "epoch": 0.5343988013484829, "grad_norm": 0.018310546875, "learning_rate": 2.637299249265659e-06, "logits/chosen": 0.24779090285301208, "logits/rejected": 1.3237859010696411, "logps/chosen": -13.30639934539795, "logps/rejected": -1560.58740234375, "loss": 0.2129, "rewards/accuracies": 1.0, "rewards/chosen": 0.3085622787475586, "rewards/margins": 15.089482307434082, "rewards/rejected": -14.780920028686523, "step": 1070 }, { "epoch": 0.5393931826694968, "grad_norm": 0.049560546875, "learning_rate": 2.5937373810704352e-06, "logits/chosen": 0.20865114033222198, "logits/rejected": 1.3283964395523071, "logps/chosen": -12.944803237915039, "logps/rejected": -1628.0531005859375, "loss": 0.213, "rewards/accuracies": 1.0, "rewards/chosen": 0.3075375556945801, "rewards/margins": 15.755208969116211, "rewards/rejected": -15.447671890258789, "step": 1080 }, { "epoch": 0.5443875639905107, "grad_norm": 0.0247802734375, "learning_rate": 2.550146991259565e-06, "logits/chosen": 0.2642405331134796, "logits/rejected": 1.330570936203003, "logps/chosen": -12.811630249023438, "logps/rejected": -1518.474609375, "loss": 0.213, "rewards/accuracies": 1.0, "rewards/chosen": 0.309398889541626, "rewards/margins": 14.794235229492188, "rewards/rejected": -14.484835624694824, "step": 1090 }, { "epoch": 0.5493819453115245, "grad_norm": 0.0252685546875, "learning_rate": 2.5065413431473196e-06, "logits/chosen": 0.22386522591114044, "logits/rejected": 1.3922302722930908, "logps/chosen": -13.141294479370117, "logps/rejected": -1650.009765625, "loss": 0.2129, "rewards/accuracies": 1.0, "rewards/chosen": 0.30252230167388916, "rewards/margins": 16.12454605102539, "rewards/rejected": -15.822023391723633, "step": 1100 }, { "epoch": 0.5543763266325384, "grad_norm": 0.017822265625, "learning_rate": 2.462933704690635e-06, "logits/chosen": 0.23435406386852264, "logits/rejected": 1.2522070407867432, "logps/chosen": -13.335357666015625, "logps/rejected": -1444.448974609375, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 0.3048686385154724, "rewards/margins": 13.933290481567383, "rewards/rejected": -13.62842082977295, "step": 1110 }, { "epoch": 0.5593707079535523, "grad_norm": 0.0159912109375, "learning_rate": 2.4193373444520558e-06, "logits/chosen": 0.23952054977416992, "logits/rejected": 1.466230869293213, "logps/chosen": -13.09483528137207, "logps/rejected": -1737.9251708984375, "loss": 0.213, "rewards/accuracies": 1.0, "rewards/chosen": 0.3013126254081726, "rewards/margins": 16.979856491088867, "rewards/rejected": -16.67854118347168, "step": 1120 }, { "epoch": 0.5643650892745661, "grad_norm": 0.0277099609375, "learning_rate": 2.3757655275624826e-06, "logits/chosen": 0.20145916938781738, "logits/rejected": 1.4154694080352783, "logps/chosen": -12.834500312805176, "logps/rejected": -1683.4241943359375, "loss": 0.2142, "rewards/accuracies": 1.0, "rewards/chosen": 0.29736918210983276, "rewards/margins": 16.437068939208984, "rewards/rejected": -16.13970184326172, "step": 1130 }, { "epoch": 0.56935947059558, "grad_norm": 0.0235595703125, "learning_rate": 2.3322315116849747e-06, "logits/chosen": 0.18402531743049622, "logits/rejected": 1.3126466274261475, "logps/chosen": -13.314038276672363, "logps/rejected": -1694.0882568359375, "loss": 0.2131, "rewards/accuracies": 1.0, "rewards/chosen": 0.3090851306915283, "rewards/margins": 16.457656860351562, "rewards/rejected": -16.14857292175293, "step": 1140 }, { "epoch": 0.5743538519165938, "grad_norm": 0.0208740234375, "learning_rate": 2.2887485429808213e-06, "logits/chosen": 0.24247586727142334, "logits/rejected": 1.3415312767028809, "logps/chosen": -13.364187240600586, "logps/rejected": -1531.287353515625, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 0.31205669045448303, "rewards/margins": 14.862896919250488, "rewards/rejected": -14.5508394241333, "step": 1150 }, { "epoch": 0.5793482332376076, "grad_norm": 0.0257568359375, "learning_rate": 2.245329852079109e-06, "logits/chosen": 0.2564612329006195, "logits/rejected": 1.3275741338729858, "logps/chosen": -12.749259948730469, "logps/rejected": -1410.6912841796875, "loss": 0.2134, "rewards/accuracies": 1.0, "rewards/chosen": 0.3032976984977722, "rewards/margins": 13.754470825195312, "rewards/rejected": -13.451173782348633, "step": 1160 }, { "epoch": 0.5843426145586216, "grad_norm": 0.02099609375, "learning_rate": 2.2019886500510197e-06, "logits/chosen": 0.234290212392807, "logits/rejected": 1.3947325944900513, "logps/chosen": -12.72685432434082, "logps/rejected": -1736.8687744140625, "loss": 0.2138, "rewards/accuracies": 1.0, "rewards/chosen": 0.2989721894264221, "rewards/margins": 16.979671478271484, "rewards/rejected": -16.68069839477539, "step": 1170 }, { "epoch": 0.5893369958796354, "grad_norm": 0.037353515625, "learning_rate": 2.1587381243900777e-06, "logits/chosen": 0.26597389578819275, "logits/rejected": 1.3139584064483643, "logps/chosen": -14.154109001159668, "logps/rejected": -1483.2686767578125, "loss": 0.2125, "rewards/accuracies": 1.0, "rewards/chosen": 0.3113531470298767, "rewards/margins": 14.383076667785645, "rewards/rejected": -14.071722030639648, "step": 1180 }, { "epoch": 0.5943313772006493, "grad_norm": 0.0233154296875, "learning_rate": 2.115591434999573e-06, "logits/chosen": 0.277686208486557, "logits/rejected": 1.401039719581604, "logps/chosen": -12.76582145690918, "logps/rejected": -1506.6636962890625, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 0.30175262689590454, "rewards/margins": 14.693206787109375, "rewards/rejected": -14.391454696655273, "step": 1190 }, { "epoch": 0.5993257585216631, "grad_norm": 0.0294189453125, "learning_rate": 2.0725617101883726e-06, "logits/chosen": 0.25775861740112305, "logits/rejected": 1.345365285873413, "logps/chosen": -12.69609260559082, "logps/rejected": -1601.963623046875, "loss": 0.2147, "rewards/accuracies": 1.0, "rewards/chosen": 0.2992851138114929, "rewards/margins": 15.617083549499512, "rewards/rejected": -15.317797660827637, "step": 1200 }, { "epoch": 0.604320139842677, "grad_norm": 0.09033203125, "learning_rate": 2.0296620426763545e-06, "logits/chosen": 0.14509257674217224, "logits/rejected": 1.3586305379867554, "logps/chosen": -13.326802253723145, "logps/rejected": -1893.167724609375, "loss": 0.2128, "rewards/accuracies": 1.0, "rewards/chosen": 0.31738612055778503, "rewards/margins": 18.294166564941406, "rewards/rejected": -17.97677993774414, "step": 1210 }, { "epoch": 0.6093145211636909, "grad_norm": 0.0206298828125, "learning_rate": 1.9869054856106628e-06, "logits/chosen": 0.2093639373779297, "logits/rejected": 1.3707678318023682, "logps/chosen": -13.017126083374023, "logps/rejected": -1714.967529296875, "loss": 0.213, "rewards/accuracies": 1.0, "rewards/chosen": 0.305789053440094, "rewards/margins": 16.755239486694336, "rewards/rejected": -16.449451446533203, "step": 1220 }, { "epoch": 0.6143089024847047, "grad_norm": 0.027587890625, "learning_rate": 1.9443050485940118e-06, "logits/chosen": 0.29796817898750305, "logits/rejected": 1.4095687866210938, "logps/chosen": -13.055410385131836, "logps/rejected": -1501.1917724609375, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 0.30659452080726624, "rewards/margins": 14.630342483520508, "rewards/rejected": -14.323748588562012, "step": 1230 }, { "epoch": 0.6193032838057185, "grad_norm": 0.0205078125, "learning_rate": 1.9018736937262271e-06, "logits/chosen": 0.20551720261573792, "logits/rejected": 1.3722885847091675, "logps/chosen": -13.263589859008789, "logps/rejected": -1644.015380859375, "loss": 0.213, "rewards/accuracies": 1.0, "rewards/chosen": 0.3152340054512024, "rewards/margins": 15.940661430358887, "rewards/rejected": -15.62542724609375, "step": 1240 }, { "epoch": 0.6242976651267325, "grad_norm": 0.025146484375, "learning_rate": 1.859624331660253e-06, "logits/chosen": 0.19998934864997864, "logits/rejected": 1.380974531173706, "logps/chosen": -12.843725204467773, "logps/rejected": -1903.5989990234375, "loss": 0.2149, "rewards/accuracies": 1.0, "rewards/chosen": 0.293190598487854, "rewards/margins": 18.591859817504883, "rewards/rejected": -18.298667907714844, "step": 1250 }, { "epoch": 0.6292920464477463, "grad_norm": 0.0184326171875, "learning_rate": 1.817569817673806e-06, "logits/chosen": 0.20200982689857483, "logits/rejected": 1.3313050270080566, "logps/chosen": -13.815347671508789, "logps/rejected": -1682.6148681640625, "loss": 0.2137, "rewards/accuracies": 1.0, "rewards/chosen": 0.3007175624370575, "rewards/margins": 16.28915786743164, "rewards/rejected": -15.988439559936523, "step": 1260 }, { "epoch": 0.6342864277687601, "grad_norm": 0.0264892578125, "learning_rate": 1.7757229477578824e-06, "logits/chosen": 0.2238602340221405, "logits/rejected": 1.3182499408721924, "logps/chosen": -13.298685073852539, "logps/rejected": -1689.486328125, "loss": 0.2124, "rewards/accuracies": 1.0, "rewards/chosen": 0.31393638253211975, "rewards/margins": 16.413013458251953, "rewards/rejected": -16.099077224731445, "step": 1270 }, { "epoch": 0.639280809089774, "grad_norm": 0.0120849609375, "learning_rate": 1.7340964547232993e-06, "logits/chosen": 0.23566928505897522, "logits/rejected": 1.3794082403182983, "logps/chosen": -12.66304874420166, "logps/rejected": -1540.5882568359375, "loss": 0.2122, "rewards/accuracies": 1.0, "rewards/chosen": 0.30794182419776917, "rewards/margins": 15.042073249816895, "rewards/rejected": -14.734130859375, "step": 1280 }, { "epoch": 0.6442751904107878, "grad_norm": 0.0233154296875, "learning_rate": 1.6927030043264656e-06, "logits/chosen": 0.29966339468955994, "logits/rejected": 1.3575140237808228, "logps/chosen": -12.536134719848633, "logps/rejected": -1450.3671875, "loss": 0.2131, "rewards/accuracies": 1.0, "rewards/chosen": 0.3003261685371399, "rewards/margins": 14.137969970703125, "rewards/rejected": -13.83764362335205, "step": 1290 }, { "epoch": 0.6492695717318018, "grad_norm": 0.01904296875, "learning_rate": 1.6515551914155522e-06, "logits/chosen": 0.21864613890647888, "logits/rejected": 1.2960518598556519, "logps/chosen": -13.931121826171875, "logps/rejected": -1731.0433349609375, "loss": 0.2135, "rewards/accuracies": 1.0, "rewards/chosen": 0.29954832792282104, "rewards/margins": 16.782882690429688, "rewards/rejected": -16.483333587646484, "step": 1300 }, { "epoch": 0.6542639530528156, "grad_norm": 0.027587890625, "learning_rate": 1.6106655360982376e-06, "logits/chosen": 0.11391136795282364, "logits/rejected": 1.1829859018325806, "logps/chosen": -13.162788391113281, "logps/rejected": -1799.5191650390625, "loss": 0.2121, "rewards/accuracies": 1.0, "rewards/chosen": 0.3114808201789856, "rewards/margins": 17.426467895507812, "rewards/rejected": -17.114986419677734, "step": 1310 }, { "epoch": 0.6592583343738294, "grad_norm": 0.126953125, "learning_rate": 1.570046479932196e-06, "logits/chosen": 0.25235381722450256, "logits/rejected": 1.2799979448318481, "logps/chosen": -13.60442066192627, "logps/rejected": -1395.318115234375, "loss": 0.213, "rewards/accuracies": 1.0, "rewards/chosen": 0.3034665286540985, "rewards/margins": 13.48466968536377, "rewards/rejected": -13.18120288848877, "step": 1320 }, { "epoch": 0.6642527156948433, "grad_norm": 0.0306396484375, "learning_rate": 1.5297103821394876e-06, "logits/chosen": 0.2604614198207855, "logits/rejected": 1.4426438808441162, "logps/chosen": -12.831866264343262, "logps/rejected": -1725.9541015625, "loss": 0.2136, "rewards/accuracies": 1.0, "rewards/chosen": 0.30009177327156067, "rewards/margins": 16.85196876525879, "rewards/rejected": -16.551877975463867, "step": 1330 }, { "epoch": 0.6692470970158572, "grad_norm": 0.0291748046875, "learning_rate": 1.489669515845995e-06, "logits/chosen": 0.19801196455955505, "logits/rejected": 1.246242642402649, "logps/chosen": -12.573989868164062, "logps/rejected": -1484.0341796875, "loss": 0.2136, "rewards/accuracies": 1.0, "rewards/chosen": 0.3015730679035187, "rewards/margins": 14.43646240234375, "rewards/rejected": -14.134889602661133, "step": 1340 }, { "epoch": 0.674241478336871, "grad_norm": 0.020751953125, "learning_rate": 1.449936064347065e-06, "logits/chosen": 0.24275144934654236, "logits/rejected": 1.3506710529327393, "logps/chosen": -12.805200576782227, "logps/rejected": -1675.633544921875, "loss": 0.2131, "rewards/accuracies": 1.0, "rewards/chosen": 0.3000423312187195, "rewards/margins": 16.352081298828125, "rewards/rejected": -16.052040100097656, "step": 1350 }, { "epoch": 0.6792358596578849, "grad_norm": 0.01483154296875, "learning_rate": 1.4105221174004771e-06, "logits/chosen": 0.18348607420921326, "logits/rejected": 1.3507370948791504, "logps/chosen": -13.73768424987793, "logps/rejected": -1898.157470703125, "loss": 0.2124, "rewards/accuracies": 1.0, "rewards/chosen": 0.309729665517807, "rewards/margins": 18.553356170654297, "rewards/rejected": -18.24362564086914, "step": 1360 }, { "epoch": 0.6842302409788987, "grad_norm": 0.03515625, "learning_rate": 1.3714396675478714e-06, "logits/chosen": 0.29044079780578613, "logits/rejected": 1.34650719165802, "logps/chosen": -12.877673149108887, "logps/rejected": -1523.441162109375, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 0.2951946556568146, "rewards/margins": 14.83436107635498, "rewards/rejected": -14.539166450500488, "step": 1370 }, { "epoch": 0.6892246222999125, "grad_norm": 0.0115966796875, "learning_rate": 1.332700606465766e-06, "logits/chosen": 0.18918542563915253, "logits/rejected": 1.3702523708343506, "logps/chosen": -13.452165603637695, "logps/rejected": -1564.6370849609375, "loss": 0.2125, "rewards/accuracies": 1.0, "rewards/chosen": 0.3073347210884094, "rewards/margins": 15.239773750305176, "rewards/rejected": -14.9324369430542, "step": 1380 }, { "epoch": 0.6942190036209265, "grad_norm": 0.034423828125, "learning_rate": 1.294316721347254e-06, "logits/chosen": 0.23732297122478485, "logits/rejected": 1.3234728574752808, "logps/chosen": -13.713842391967773, "logps/rejected": -1537.1632080078125, "loss": 0.2131, "rewards/accuracies": 1.0, "rewards/chosen": 0.30026909708976746, "rewards/margins": 14.980550765991211, "rewards/rejected": -14.680282592773438, "step": 1390 }, { "epoch": 0.6992133849419403, "grad_norm": 0.07080078125, "learning_rate": 1.2562996913154952e-06, "logits/chosen": 0.150472030043602, "logits/rejected": 1.4298745393753052, "logps/chosen": -12.643513679504395, "logps/rejected": -2120.697509765625, "loss": 0.2133, "rewards/accuracies": 1.0, "rewards/chosen": 0.30036434531211853, "rewards/margins": 20.737525939941406, "rewards/rejected": -20.43716049194336, "step": 1400 }, { "epoch": 0.7042077662629542, "grad_norm": 0.034423828125, "learning_rate": 1.2186610838700958e-06, "logits/chosen": 0.30068179965019226, "logits/rejected": 1.335126280784607, "logps/chosen": -12.845601081848145, "logps/rejected": -1345.099365234375, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 0.2946023643016815, "rewards/margins": 13.083051681518555, "rewards/rejected": -12.788450241088867, "step": 1410 }, { "epoch": 0.709202147583968, "grad_norm": 0.033447265625, "learning_rate": 1.1814123513674465e-06, "logits/chosen": 0.18157488107681274, "logits/rejected": 1.380326509475708, "logps/chosen": -13.374841690063477, "logps/rejected": -1741.3206787109375, "loss": 0.2124, "rewards/accuracies": 1.0, "rewards/chosen": 0.30905359983444214, "rewards/margins": 16.993389129638672, "rewards/rejected": -16.684337615966797, "step": 1420 }, { "epoch": 0.7141965289049819, "grad_norm": 0.036865234375, "learning_rate": 1.1445648275360925e-06, "logits/chosen": 0.19126050174236298, "logits/rejected": 1.4904680252075195, "logps/chosen": -13.341898918151855, "logps/rejected": -1982.956298828125, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 0.30447131395339966, "rewards/margins": 19.409332275390625, "rewards/rejected": -19.104862213134766, "step": 1430 }, { "epoch": 0.7191909102259958, "grad_norm": 0.03955078125, "learning_rate": 1.1081297240282077e-06, "logits/chosen": 0.2438248097896576, "logits/rejected": 1.3988474607467651, "logps/chosen": -13.187724113464355, "logps/rejected": -1571.8028564453125, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 0.29910221695899963, "rewards/margins": 15.328478813171387, "rewards/rejected": -15.029376029968262, "step": 1440 }, { "epoch": 0.7241852915470096, "grad_norm": 0.0235595703125, "learning_rate": 1.0721181270082061e-06, "logits/chosen": 0.20241305232048035, "logits/rejected": 1.308318018913269, "logps/chosen": -12.698432922363281, "logps/rejected": -1740.4573974609375, "loss": 0.2125, "rewards/accuracies": 1.0, "rewards/chosen": 0.313620924949646, "rewards/margins": 16.896198272705078, "rewards/rejected": -16.582576751708984, "step": 1450 }, { "epoch": 0.7291796728680234, "grad_norm": 0.0164794921875, "learning_rate": 1.0365409937795385e-06, "logits/chosen": 0.20683518052101135, "logits/rejected": 1.2756832838058472, "logps/chosen": -13.767419815063477, "logps/rejected": -1547.2147216796875, "loss": 0.2129, "rewards/accuracies": 1.0, "rewards/chosen": 0.3099841773509979, "rewards/margins": 14.946782112121582, "rewards/rejected": -14.636796951293945, "step": 1460 }, { "epoch": 0.7341740541890374, "grad_norm": 0.0263671875, "learning_rate": 1.0014091494506962e-06, "logits/chosen": 0.17677463591098785, "logits/rejected": 1.3411346673965454, "logps/chosen": -13.297311782836914, "logps/rejected": -1913.2984619140625, "loss": 0.2128, "rewards/accuracies": 1.0, "rewards/chosen": 0.3021236062049866, "rewards/margins": 18.699064254760742, "rewards/rejected": -18.396942138671875, "step": 1470 }, { "epoch": 0.7391684355100512, "grad_norm": 0.033203125, "learning_rate": 9.667332836414368e-07, "logits/chosen": 0.15931569039821625, "logits/rejected": 1.2274577617645264, "logps/chosen": -13.343734741210938, "logps/rejected": -1609.459228515625, "loss": 0.2133, "rewards/accuracies": 1.0, "rewards/chosen": 0.3135462999343872, "rewards/margins": 15.410505294799805, "rewards/rejected": -15.096961975097656, "step": 1480 }, { "epoch": 0.744162816831065, "grad_norm": 0.013427734375, "learning_rate": 9.325239472302422e-07, "logits/chosen": 0.25666847825050354, "logits/rejected": 1.4118614196777344, "logps/chosen": -13.085573196411133, "logps/rejected": -1693.067626953125, "loss": 0.2127, "rewards/accuracies": 1.0, "rewards/chosen": 0.30241408944129944, "rewards/margins": 16.529926300048828, "rewards/rejected": -16.227514266967773, "step": 1490 }, { "epoch": 0.7491571981520789, "grad_norm": 0.032958984375, "learning_rate": 8.987915491439844e-07, "logits/chosen": 0.2501397132873535, "logits/rejected": 1.383455514907837, "logps/chosen": -12.903341293334961, "logps/rejected": -1736.0384521484375, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 0.3011986017227173, "rewards/margins": 16.931396484375, "rewards/rejected": -16.630199432373047, "step": 1500 }, { "epoch": 0.7541515794730927, "grad_norm": 0.021240234375, "learning_rate": 8.655463531907823e-07, "logits/chosen": 0.2224547117948532, "logits/rejected": 1.3173682689666748, "logps/chosen": -13.032022476196289, "logps/rejected": -1771.166259765625, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 0.29998722672462463, "rewards/margins": 17.284591674804688, "rewards/rejected": -16.984607696533203, "step": 1510 }, { "epoch": 0.7591459607941067, "grad_norm": 0.014404296875, "learning_rate": 8.327984749370227e-07, "logits/chosen": 0.2447456419467926, "logits/rejected": 1.344362497329712, "logps/chosen": -12.957304000854492, "logps/rejected": -1557.052978515625, "loss": 0.2142, "rewards/accuracies": 1.0, "rewards/chosen": 0.29349425435066223, "rewards/margins": 15.1736478805542, "rewards/rejected": -14.880154609680176, "step": 1520 }, { "epoch": 0.7641403421151205, "grad_norm": 0.0218505859375, "learning_rate": 8.005578786294782e-07, "logits/chosen": 0.1744759976863861, "logits/rejected": 1.3996227979660034, "logps/chosen": -13.042366027832031, "logps/rejected": -1864.2431640625, "loss": 0.2122, "rewards/accuracies": 1.0, "rewards/chosen": 0.30568575859069824, "rewards/margins": 18.201250076293945, "rewards/rejected": -17.89556312561035, "step": 1530 }, { "epoch": 0.7691347234361343, "grad_norm": 0.012939453125, "learning_rate": 7.688343741634702e-07, "logits/chosen": 0.22156497836112976, "logits/rejected": 1.3012608289718628, "logps/chosen": -12.66821575164795, "logps/rejected": -1628.963134765625, "loss": 0.2137, "rewards/accuracies": 1.0, "rewards/chosen": 0.2943740785121918, "rewards/margins": 15.8633394241333, "rewards/rejected": -15.568964958190918, "step": 1540 }, { "epoch": 0.7741291047571482, "grad_norm": 0.03662109375, "learning_rate": 7.376376140980001e-07, "logits/chosen": 0.1970866173505783, "logits/rejected": 1.2925320863723755, "logps/chosen": -12.667881965637207, "logps/rejected": -1566.008056640625, "loss": 0.2128, "rewards/accuracies": 1.0, "rewards/chosen": 0.3171369433403015, "rewards/margins": 15.188334465026855, "rewards/rejected": -14.871198654174805, "step": 1550 }, { "epoch": 0.7791234860781621, "grad_norm": 0.0625, "learning_rate": 7.069770907187465e-07, "logits/chosen": 0.20419040322303772, "logits/rejected": 1.306980013847351, "logps/chosen": -13.543539047241211, "logps/rejected": -1506.096923828125, "loss": 0.2137, "rewards/accuracies": 1.0, "rewards/chosen": 0.3025529384613037, "rewards/margins": 14.58825397491455, "rewards/rejected": -14.285697937011719, "step": 1560 }, { "epoch": 0.7841178673991759, "grad_norm": 0.01483154296875, "learning_rate": 6.768621331498371e-07, "logits/chosen": 0.22659845650196075, "logits/rejected": 1.3488702774047852, "logps/chosen": -12.837379455566406, "logps/rejected": -1573.134033203125, "loss": 0.2131, "rewards/accuracies": 1.0, "rewards/chosen": 0.29855865240097046, "rewards/margins": 15.335705757141113, "rewards/rejected": -15.037145614624023, "step": 1570 }, { "epoch": 0.7891122487201898, "grad_norm": 0.049560546875, "learning_rate": 6.473019045152593e-07, "logits/chosen": 0.22067594528198242, "logits/rejected": 1.39100980758667, "logps/chosen": -12.964533805847168, "logps/rejected": -1819.2757568359375, "loss": 0.2124, "rewards/accuracies": 1.0, "rewards/chosen": 0.3059755563735962, "rewards/margins": 17.78072738647461, "rewards/rejected": -17.474750518798828, "step": 1580 }, { "epoch": 0.7941066300412036, "grad_norm": 0.0159912109375, "learning_rate": 6.183053991507818e-07, "logits/chosen": 0.18515101075172424, "logits/rejected": 1.2657784223556519, "logps/chosen": -13.701101303100586, "logps/rejected": -1632.0228271484375, "loss": 0.2135, "rewards/accuracies": 1.0, "rewards/chosen": 0.30029112100601196, "rewards/margins": 15.897903442382812, "rewards/rejected": -15.597612380981445, "step": 1590 }, { "epoch": 0.7991010113622175, "grad_norm": 0.035400390625, "learning_rate": 5.898814398672376e-07, "logits/chosen": 0.2673841416835785, "logits/rejected": 1.404524326324463, "logps/chosen": -12.89787769317627, "logps/rejected": -1488.13818359375, "loss": 0.2137, "rewards/accuracies": 1.0, "rewards/chosen": 0.2999451160430908, "rewards/margins": 14.503445625305176, "rewards/rejected": -14.203500747680664, "step": 1600 }, { "epoch": 0.8040953926832314, "grad_norm": 0.028564453125, "learning_rate": 5.620386752659912e-07, "logits/chosen": 0.20275497436523438, "logits/rejected": 1.2962656021118164, "logps/chosen": -14.060361862182617, "logps/rejected": -1563.873046875, "loss": 0.2123, "rewards/accuracies": 1.0, "rewards/chosen": 0.3067697286605835, "rewards/margins": 15.22687816619873, "rewards/rejected": -14.920109748840332, "step": 1610 }, { "epoch": 0.8090897740042452, "grad_norm": 0.031494140625, "learning_rate": 5.347855771074157e-07, "logits/chosen": 0.22789278626441956, "logits/rejected": 1.4333293437957764, "logps/chosen": -12.939372062683105, "logps/rejected": -1751.370361328125, "loss": 0.2136, "rewards/accuracies": 1.0, "rewards/chosen": 0.2999444007873535, "rewards/margins": 17.115280151367188, "rewards/rejected": -16.81533432006836, "step": 1620 }, { "epoch": 0.8140841553252591, "grad_norm": 0.031005859375, "learning_rate": 5.081304377331786e-07, "logits/chosen": 0.27506810426712036, "logits/rejected": 1.2999309301376343, "logps/chosen": -13.067828178405762, "logps/rejected": -1445.669677734375, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 0.30137357115745544, "rewards/margins": 14.080915451049805, "rewards/rejected": -13.779541015625, "step": 1630 }, { "epoch": 0.819078536646273, "grad_norm": 0.03369140625, "learning_rate": 4.820813675431186e-07, "logits/chosen": 0.15463611483573914, "logits/rejected": 1.3889650106430054, "logps/chosen": -13.349421501159668, "logps/rejected": -1812.4769287109375, "loss": 0.2136, "rewards/accuracies": 1.0, "rewards/chosen": 0.3092433512210846, "rewards/margins": 17.648366928100586, "rewards/rejected": -17.33912467956543, "step": 1640 }, { "epoch": 0.8240729179672868, "grad_norm": 0.0242919921875, "learning_rate": 4.5664629252747865e-07, "logits/chosen": 0.21123354136943817, "logits/rejected": 1.3822557926177979, "logps/chosen": -12.89905834197998, "logps/rejected": -1771.5474853515625, "loss": 0.213, "rewards/accuracies": 1.0, "rewards/chosen": 0.29551658034324646, "rewards/margins": 17.291709899902344, "rewards/rejected": -16.99619483947754, "step": 1650 }, { "epoch": 0.8290672992883007, "grad_norm": 0.021484375, "learning_rate": 4.3183295185525746e-07, "logits/chosen": 0.17760224640369415, "logits/rejected": 1.320516586303711, "logps/chosen": -12.863115310668945, "logps/rejected": -1791.6168212890625, "loss": 0.2129, "rewards/accuracies": 1.0, "rewards/chosen": 0.3116030693054199, "rewards/margins": 17.404626846313477, "rewards/rejected": -17.09302520751953, "step": 1660 }, { "epoch": 0.8340616806093145, "grad_norm": 0.0274658203125, "learning_rate": 4.0764889551939773e-07, "logits/chosen": 0.19689543545246124, "logits/rejected": 1.3633372783660889, "logps/chosen": -12.999624252319336, "logps/rejected": -1783.937255859375, "loss": 0.2131, "rewards/accuracies": 1.0, "rewards/chosen": 0.30732038617134094, "rewards/margins": 17.336694717407227, "rewards/rejected": -17.029375076293945, "step": 1670 }, { "epoch": 0.8390560619303283, "grad_norm": 0.02392578125, "learning_rate": 3.8410148203953916e-07, "logits/chosen": 0.20565366744995117, "logits/rejected": 1.2732315063476562, "logps/chosen": -13.205337524414062, "logps/rejected": -1707.686279296875, "loss": 0.2141, "rewards/accuracies": 1.0, "rewards/chosen": 0.29469722509384155, "rewards/margins": 16.513946533203125, "rewards/rejected": -16.219249725341797, "step": 1680 }, { "epoch": 0.8440504432513423, "grad_norm": 0.01904296875, "learning_rate": 3.611978762230306e-07, "logits/chosen": 0.2300119698047638, "logits/rejected": 1.3706471920013428, "logps/chosen": -12.709399223327637, "logps/rejected": -1650.5601806640625, "loss": 0.2126, "rewards/accuracies": 1.0, "rewards/chosen": 0.30553561449050903, "rewards/margins": 16.097576141357422, "rewards/rejected": -15.79203987121582, "step": 1690 }, { "epoch": 0.8490448245723561, "grad_norm": 0.031982421875, "learning_rate": 3.389450469848821e-07, "logits/chosen": 0.26923322677612305, "logits/rejected": 1.3886299133300781, "logps/chosen": -12.681255340576172, "logps/rejected": -1633.20361328125, "loss": 0.2138, "rewards/accuracies": 1.0, "rewards/chosen": 0.3012009263038635, "rewards/margins": 15.944944381713867, "rewards/rejected": -15.643745422363281, "step": 1700 }, { "epoch": 0.85403920589337, "grad_norm": 0.015869140625, "learning_rate": 3.173497652273241e-07, "logits/chosen": 0.22611021995544434, "logits/rejected": 1.4598513841629028, "logps/chosen": -13.163076400756836, "logps/rejected": -1702.979736328125, "loss": 0.2126, "rewards/accuracies": 1.0, "rewards/chosen": 0.3090182840824127, "rewards/margins": 16.64162826538086, "rewards/rejected": -16.332609176635742, "step": 1710 }, { "epoch": 0.8590335872143838, "grad_norm": 0.0257568359375, "learning_rate": 2.964186017796153e-07, "logits/chosen": 0.23432429134845734, "logits/rejected": 1.3954203128814697, "logps/chosen": -12.975980758666992, "logps/rejected": -1669.3160400390625, "loss": 0.2133, "rewards/accuracies": 1.0, "rewards/chosen": 0.29741281270980835, "rewards/margins": 16.287899017333984, "rewards/rejected": -15.990484237670898, "step": 1720 }, { "epoch": 0.8640279685353977, "grad_norm": 0.020263671875, "learning_rate": 2.761579253987226e-07, "logits/chosen": 0.24720034003257751, "logits/rejected": 1.284588098526001, "logps/chosen": -13.461567878723145, "logps/rejected": -1476.7037353515625, "loss": 0.2131, "rewards/accuracies": 1.0, "rewards/chosen": 0.3023452162742615, "rewards/margins": 14.363668441772461, "rewards/rejected": -14.061323165893555, "step": 1730 }, { "epoch": 0.8690223498564116, "grad_norm": 0.04541015625, "learning_rate": 2.565739008314944e-07, "logits/chosen": 0.25941091775894165, "logits/rejected": 1.3293951749801636, "logps/chosen": -12.768040657043457, "logps/rejected": -1509.4613037109375, "loss": 0.2144, "rewards/accuracies": 1.0, "rewards/chosen": 0.2932388186454773, "rewards/margins": 14.711868286132812, "rewards/rejected": -14.418627738952637, "step": 1740 }, { "epoch": 0.8740167311774254, "grad_norm": 0.0240478515625, "learning_rate": 2.3767248693890106e-07, "logits/chosen": 0.2348676472902298, "logits/rejected": 1.3176116943359375, "logps/chosen": -13.70958137512207, "logps/rejected": -1628.524658203125, "loss": 0.2137, "rewards/accuracies": 1.0, "rewards/chosen": 0.3072062134742737, "rewards/margins": 15.790201187133789, "rewards/rejected": -15.482992172241211, "step": 1750 }, { "epoch": 0.8790111124984392, "grad_norm": 0.0849609375, "learning_rate": 2.1945943488292265e-07, "logits/chosen": 0.14879265427589417, "logits/rejected": 1.2918832302093506, "logps/chosen": -14.206727981567383, "logps/rejected": -1858.029052734375, "loss": 0.2125, "rewards/accuracies": 1.0, "rewards/chosen": 0.3139779269695282, "rewards/margins": 18.043270111083984, "rewards/rejected": -17.729291915893555, "step": 1760 }, { "epoch": 0.8840054938194531, "grad_norm": 0.019775390625, "learning_rate": 2.0194028637663733e-07, "logits/chosen": 0.2688780426979065, "logits/rejected": 1.2982268333435059, "logps/chosen": -13.01366901397705, "logps/rejected": -1392.9710693359375, "loss": 0.213, "rewards/accuracies": 1.0, "rewards/chosen": 0.3066830039024353, "rewards/margins": 13.560384750366211, "rewards/rejected": -13.253702163696289, "step": 1770 }, { "epoch": 0.888999875140467, "grad_norm": 0.0380859375, "learning_rate": 1.851203719980324e-07, "logits/chosen": 0.10393796861171722, "logits/rejected": 1.3042396306991577, "logps/chosen": -13.014623641967773, "logps/rejected": -1894.3724365234375, "loss": 0.2117, "rewards/accuracies": 1.0, "rewards/chosen": 0.3259292542934418, "rewards/margins": 18.341753005981445, "rewards/rejected": -18.015825271606445, "step": 1780 }, { "epoch": 0.8939942564614808, "grad_norm": 0.0250244140625, "learning_rate": 1.6900480956806214e-07, "logits/chosen": 0.14930710196495056, "logits/rejected": 1.24697744846344, "logps/chosen": -13.011159896850586, "logps/rejected": -1752.00390625, "loss": 0.2127, "rewards/accuracies": 1.0, "rewards/chosen": 0.30487823486328125, "rewards/margins": 17.018848419189453, "rewards/rejected": -16.713970184326172, "step": 1790 }, { "epoch": 0.8989886377824947, "grad_norm": 0.0159912109375, "learning_rate": 1.5359850259344223e-07, "logits/chosen": 0.19253353774547577, "logits/rejected": 1.2840015888214111, "logps/chosen": -13.470372200012207, "logps/rejected": -1626.4345703125, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 0.3020302951335907, "rewards/margins": 15.777534484863281, "rewards/rejected": -15.475504875183105, "step": 1800 }, { "epoch": 0.9039830191035085, "grad_norm": 0.0238037109375, "learning_rate": 1.3890613877465127e-07, "logits/chosen": 0.236587792634964, "logits/rejected": 1.3434031009674072, "logps/chosen": -13.017538070678711, "logps/rejected": -1619.8177490234375, "loss": 0.2148, "rewards/accuracies": 1.0, "rewards/chosen": 0.29720932245254517, "rewards/margins": 15.795916557312012, "rewards/rejected": -15.498708724975586, "step": 1810 }, { "epoch": 0.9089774004245225, "grad_norm": 0.064453125, "learning_rate": 1.249321885795954e-07, "logits/chosen": 0.23312029242515564, "logits/rejected": 1.218126893043518, "logps/chosen": -13.364558219909668, "logps/rejected": -1396.6041259765625, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 0.30306655168533325, "rewards/margins": 13.558195114135742, "rewards/rejected": -13.255128860473633, "step": 1820 }, { "epoch": 0.9139717817455363, "grad_norm": 0.02001953125, "learning_rate": 1.1168090388337577e-07, "logits/chosen": 0.289134681224823, "logits/rejected": 1.3270288705825806, "logps/chosen": -12.729695320129395, "logps/rejected": -1422.671142578125, "loss": 0.2137, "rewards/accuracies": 1.0, "rewards/chosen": 0.3039693832397461, "rewards/margins": 13.834306716918945, "rewards/rejected": -13.5303373336792, "step": 1830 }, { "epoch": 0.9189661630665501, "grad_norm": 0.0257568359375, "learning_rate": 9.915631667455989e-08, "logits/chosen": 0.23302344977855682, "logits/rejected": 1.3944863080978394, "logps/chosen": -13.118586540222168, "logps/rejected": -1715.78515625, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 0.29517239332199097, "rewards/margins": 16.75180435180664, "rewards/rejected": -16.45663070678711, "step": 1840 }, { "epoch": 0.923960544387564, "grad_norm": 0.025634765625, "learning_rate": 8.736223782836589e-08, "logits/chosen": 0.1992538869380951, "logits/rejected": 1.3024797439575195, "logps/chosen": -12.67003059387207, "logps/rejected": -1692.8782958984375, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 0.3054552972316742, "rewards/margins": 16.512258529663086, "rewards/rejected": -16.206802368164062, "step": 1850 }, { "epoch": 0.9289549257085778, "grad_norm": 0.01226806640625, "learning_rate": 7.63022559471202e-08, "logits/chosen": 0.23122599720954895, "logits/rejected": 1.37257981300354, "logps/chosen": -12.650789260864258, "logps/rejected": -1592.7677001953125, "loss": 0.2138, "rewards/accuracies": 1.0, "rewards/chosen": 0.2996312975883484, "rewards/margins": 15.537762641906738, "rewards/rejected": -15.238128662109375, "step": 1860 }, { "epoch": 0.9339493070295917, "grad_norm": 0.0159912109375, "learning_rate": 6.597973626834759e-08, "logits/chosen": 0.21128106117248535, "logits/rejected": 1.4774492979049683, "logps/chosen": -13.167986869812012, "logps/rejected": -1897.197998046875, "loss": 0.2133, "rewards/accuracies": 1.0, "rewards/chosen": 0.30018025636672974, "rewards/margins": 18.570720672607422, "rewards/rejected": -18.27054214477539, "step": 1870 }, { "epoch": 0.9389436883506056, "grad_norm": 0.0264892578125, "learning_rate": 5.639781964082547e-08, "logits/chosen": 0.27233806252479553, "logits/rejected": 1.4588285684585571, "logps/chosen": -13.405789375305176, "logps/rejected": -1711.1881103515625, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 0.29316192865371704, "rewards/margins": 16.696842193603516, "rewards/rejected": -16.40367889404297, "step": 1880 }, { "epoch": 0.9439380696716194, "grad_norm": 0.033203125, "learning_rate": 4.755942156891458e-08, "logits/chosen": 0.23750165104866028, "logits/rejected": 1.4071403741836548, "logps/chosen": -12.829435348510742, "logps/rejected": -1591.2479248046875, "loss": 0.2133, "rewards/accuracies": 1.0, "rewards/chosen": 0.3001479208469391, "rewards/margins": 15.529951095581055, "rewards/rejected": -15.229803085327148, "step": 1890 }, { "epoch": 0.9489324509926332, "grad_norm": 0.0203857421875, "learning_rate": 3.946723132545155e-08, "logits/chosen": 0.18308812379837036, "logits/rejected": 1.3526315689086914, "logps/chosen": -13.234288215637207, "logps/rejected": -1674.6246337890625, "loss": 0.2124, "rewards/accuracies": 1.0, "rewards/chosen": 0.3094561696052551, "rewards/margins": 16.279918670654297, "rewards/rejected": -15.970464706420898, "step": 1900 }, { "epoch": 0.9539268323136472, "grad_norm": 0.02197265625, "learning_rate": 3.212371113348156e-08, "logits/chosen": 0.2626166343688965, "logits/rejected": 1.3081789016723633, "logps/chosen": -12.9055757522583, "logps/rejected": -1487.3057861328125, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 0.29553088545799255, "rewards/margins": 14.46813678741455, "rewards/rejected": -14.172607421875, "step": 1910 }, { "epoch": 0.958921213634661, "grad_norm": 0.033935546875, "learning_rate": 2.5531095417073437e-08, "logits/chosen": 0.2499731481075287, "logits/rejected": 1.3821344375610352, "logps/chosen": -13.182432174682617, "logps/rejected": -1497.8023681640625, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 0.29157382249832153, "rewards/margins": 14.601869583129883, "rewards/rejected": -14.310295104980469, "step": 1920 }, { "epoch": 0.9639155949556749, "grad_norm": 0.0218505859375, "learning_rate": 1.969139012144822e-08, "logits/chosen": 0.27068477869033813, "logits/rejected": 1.3521279096603394, "logps/chosen": -13.608545303344727, "logps/rejected": -1491.3599853515625, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 0.3026901185512543, "rewards/margins": 14.520078659057617, "rewards/rejected": -14.217389106750488, "step": 1930 }, { "epoch": 0.9689099762766887, "grad_norm": 0.049072265625, "learning_rate": 1.4606372102626277e-08, "logits/chosen": 0.19163861870765686, "logits/rejected": 1.3109387159347534, "logps/chosen": -13.46105670928955, "logps/rejected": -1731.31640625, "loss": 0.2127, "rewards/accuracies": 1.0, "rewards/chosen": 0.3083009421825409, "rewards/margins": 16.883403778076172, "rewards/rejected": -16.575105667114258, "step": 1940 }, { "epoch": 0.9739043575977026, "grad_norm": 0.0322265625, "learning_rate": 1.0277588586781463e-08, "logits/chosen": 0.19613580405712128, "logits/rejected": 1.228542685508728, "logps/chosen": -13.699869155883789, "logps/rejected": -1520.361572265625, "loss": 0.2129, "rewards/accuracies": 1.0, "rewards/chosen": 0.3030748963356018, "rewards/margins": 14.732551574707031, "rewards/rejected": -14.429475784301758, "step": 1950 }, { "epoch": 0.9788987389187165, "grad_norm": 0.025146484375, "learning_rate": 6.7063566994651775e-09, "logits/chosen": 0.21389658749103546, "logits/rejected": 1.3907114267349243, "logps/chosen": -13.105180740356445, "logps/rejected": -1654.1494140625, "loss": 0.2134, "rewards/accuracies": 1.0, "rewards/chosen": 0.30368101596832275, "rewards/margins": 16.11886978149414, "rewards/rejected": -15.81518840789795, "step": 1960 }, { "epoch": 0.9838931202397303, "grad_norm": 0.02490234375, "learning_rate": 3.893763064840295e-09, "logits/chosen": 0.18713845312595367, "logits/rejected": 1.294327974319458, "logps/chosen": -12.946528434753418, "logps/rejected": -1592.420654296875, "loss": 0.2118, "rewards/accuracies": 1.0, "rewards/chosen": 0.31735336780548096, "rewards/margins": 15.468029975891113, "rewards/rejected": -15.150675773620605, "step": 1970 }, { "epoch": 0.9888875015607441, "grad_norm": 0.0172119140625, "learning_rate": 1.840663475053961e-09, "logits/chosen": 0.23091156780719757, "logits/rejected": 1.4303152561187744, "logps/chosen": -13.524391174316406, "logps/rejected": -1685.2825927734375, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 0.30492401123046875, "rewards/margins": 16.39162826538086, "rewards/rejected": -16.08670425415039, "step": 1980 }, { "epoch": 0.993881882881758, "grad_norm": 0.01446533203125, "learning_rate": 5.476826298439486e-10, "logits/chosen": 0.18041366338729858, "logits/rejected": 1.3766809701919556, "logps/chosen": -12.73656940460205, "logps/rejected": -1975.449462890625, "loss": 0.2138, "rewards/accuracies": 1.0, "rewards/chosen": 0.2982867360115051, "rewards/margins": 19.314851760864258, "rewards/rejected": -19.016565322875977, "step": 1990 }, { "epoch": 0.9988762642027719, "grad_norm": 0.029052734375, "learning_rate": 1.521394646070151e-11, "logits/chosen": 0.212058424949646, "logits/rejected": 1.3283421993255615, "logps/chosen": -12.978428840637207, "logps/rejected": -1706.1982421875, "loss": 0.2118, "rewards/accuracies": 1.0, "rewards/chosen": 0.3162993788719177, "rewards/margins": 16.5406551361084, "rewards/rejected": -16.224355697631836, "step": 2000 }, { "epoch": 0.9988762642027719, "eval_logits/chosen": 0.19315297901630402, "eval_logits/rejected": 1.0988490581512451, "eval_logps/chosen": -12.085772514343262, "eval_logps/rejected": -1018.5144653320312, "eval_loss": 0.2121797353029251, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 0.32569777965545654, "eval_rewards/margins": 9.880459785461426, "eval_rewards/rejected": -9.554760932922363, "eval_runtime": 0.4236, "eval_samples_per_second": 11.805, "eval_steps_per_second": 7.083, "step": 2000 }, { "epoch": 0.9998751404669747, "step": 2002, "total_flos": 0.0, "train_loss": 0.2449444185841929, "train_runtime": 3711.3289, "train_samples_per_second": 4.316, "train_steps_per_second": 0.539 } ], "logging_steps": 10, "max_steps": 2002, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }