{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99582225598177, "eval_steps": 800, "global_step": 1479, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002025572857323712, "grad_norm": 31.013870239257812, "learning_rate": 6.756756756756757e-10, "logits/chosen": -2.5177597999572754, "logits/rejected": -2.4276583194732666, "logps/chosen": -79.6932373046875, "logps/rejected": -86.58649444580078, "loss": 0.6929, "rewards/accuracies": 0.03125, "rewards/chosen": -0.0008372783777303994, "rewards/margins": 0.00045527220936492085, "rewards/rejected": -0.0012925505870953202, "step": 1 }, { "epoch": 0.020255728573237118, "grad_norm": 28.71278190612793, "learning_rate": 6.756756756756757e-09, "logits/chosen": -2.587923526763916, "logits/rejected": -2.421647787094116, "logps/chosen": -72.02790069580078, "logps/rejected": -68.7666015625, "loss": 0.6928, "rewards/accuracies": 0.4479166567325592, "rewards/chosen": -0.000452535372460261, "rewards/margins": 0.0008923111017793417, "rewards/rejected": -0.001344846561551094, "step": 10 }, { "epoch": 0.040511457146474236, "grad_norm": 24.486814498901367, "learning_rate": 1.3513513513513514e-08, "logits/chosen": -2.5588934421539307, "logits/rejected": -2.3707621097564697, "logps/chosen": -77.4730453491211, "logps/rejected": -71.17650604248047, "loss": 0.693, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 0.0015422820579260588, "rewards/margins": 0.00048240157775580883, "rewards/rejected": 0.0010598807130008936, "step": 20 }, { "epoch": 0.060767185719711354, "grad_norm": 26.952857971191406, "learning_rate": 2.027027027027027e-08, "logits/chosen": -2.5552210807800293, "logits/rejected": -2.3964200019836426, "logps/chosen": -75.58769226074219, "logps/rejected": -74.38423156738281, "loss": 0.6941, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.00154799222946167, "rewards/margins": -0.0016709610354155302, "rewards/rejected": 0.0001229687622981146, "step": 30 }, { "epoch": 0.08102291429294847, "grad_norm": 30.491893768310547, "learning_rate": 2.7027027027027028e-08, "logits/chosen": -2.538985013961792, "logits/rejected": -2.3956074714660645, "logps/chosen": -84.64269256591797, "logps/rejected": -82.15937042236328, "loss": 0.6927, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0005608886131085455, "rewards/margins": 0.0012397856917232275, "rewards/rejected": -0.000678897020407021, "step": 40 }, { "epoch": 0.1012786428661856, "grad_norm": 27.152774810791016, "learning_rate": 3.378378378378378e-08, "logits/chosen": -2.515413522720337, "logits/rejected": -2.358457565307617, "logps/chosen": -81.1507568359375, "logps/rejected": -78.68826293945312, "loss": 0.6921, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0013986782869324088, "rewards/margins": 0.002325823763385415, "rewards/rejected": -0.0037245028652250767, "step": 50 }, { "epoch": 0.12153437143942271, "grad_norm": 27.534177780151367, "learning_rate": 4.054054054054054e-08, "logits/chosen": -2.520850419998169, "logits/rejected": -2.3658010959625244, "logps/chosen": -78.13814544677734, "logps/rejected": -75.04551696777344, "loss": 0.6938, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0025924122892320156, "rewards/margins": -0.0010297519620507956, "rewards/rejected": -0.00156266032718122, "step": 60 }, { "epoch": 0.14179010001265982, "grad_norm": 30.60668182373047, "learning_rate": 4.72972972972973e-08, "logits/chosen": -2.5382590293884277, "logits/rejected": -2.37661075592041, "logps/chosen": -83.97273254394531, "logps/rejected": -80.8182373046875, "loss": 0.6926, "rewards/accuracies": 0.515625, "rewards/chosen": -0.003294873284175992, "rewards/margins": 0.0013795426348224282, "rewards/rejected": -0.004674416035413742, "step": 70 }, { "epoch": 0.16204582858589694, "grad_norm": 27.501964569091797, "learning_rate": 5.4054054054054056e-08, "logits/chosen": -2.4654035568237305, "logits/rejected": -2.3297314643859863, "logps/chosen": -75.83648681640625, "logps/rejected": -76.66287994384766, "loss": 0.6902, "rewards/accuracies": 0.578125, "rewards/chosen": -0.0013893753057345748, "rewards/margins": 0.006102095358073711, "rewards/rejected": -0.007491470314562321, "step": 80 }, { "epoch": 0.18230155715913407, "grad_norm": 24.944175720214844, "learning_rate": 6.081081081081081e-08, "logits/chosen": -2.5287299156188965, "logits/rejected": -2.379664659500122, "logps/chosen": -86.45475769042969, "logps/rejected": -79.61102294921875, "loss": 0.6904, "rewards/accuracies": 0.578125, "rewards/chosen": -0.0014329934492707253, "rewards/margins": 0.005873243790119886, "rewards/rejected": -0.007306237705051899, "step": 90 }, { "epoch": 0.2025572857323712, "grad_norm": 29.129093170166016, "learning_rate": 6.756756756756756e-08, "logits/chosen": -2.5153121948242188, "logits/rejected": -2.361551523208618, "logps/chosen": -84.0345230102539, "logps/rejected": -78.61013793945312, "loss": 0.6881, "rewards/accuracies": 0.625, "rewards/chosen": -0.0028929836116731167, "rewards/margins": 0.010393200442194939, "rewards/rejected": -0.013286183588206768, "step": 100 }, { "epoch": 0.2228130143056083, "grad_norm": 28.691940307617188, "learning_rate": 7.432432432432432e-08, "logits/chosen": -2.546154260635376, "logits/rejected": -2.389882802963257, "logps/chosen": -74.24641418457031, "logps/rejected": -72.99244689941406, "loss": 0.6872, "rewards/accuracies": 0.609375, "rewards/chosen": -0.003837780561298132, "rewards/margins": 0.012213540263473988, "rewards/rejected": -0.016051320359110832, "step": 110 }, { "epoch": 0.24306874287884542, "grad_norm": 26.047407150268555, "learning_rate": 8.108108108108108e-08, "logits/chosen": -2.530447006225586, "logits/rejected": -2.3604226112365723, "logps/chosen": -79.45042419433594, "logps/rejected": -75.46896362304688, "loss": 0.6834, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.004467087332159281, "rewards/margins": 0.020118705928325653, "rewards/rejected": -0.024585790932178497, "step": 120 }, { "epoch": 0.26332447145208254, "grad_norm": 30.345191955566406, "learning_rate": 8.783783783783784e-08, "logits/chosen": -2.4959208965301514, "logits/rejected": -2.344454526901245, "logps/chosen": -86.02290344238281, "logps/rejected": -81.23602294921875, "loss": 0.6824, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.006714115384966135, "rewards/margins": 0.02207200787961483, "rewards/rejected": -0.028786126524209976, "step": 130 }, { "epoch": 0.28358020002531964, "grad_norm": 23.314868927001953, "learning_rate": 9.45945945945946e-08, "logits/chosen": -2.5607352256774902, "logits/rejected": -2.394366502761841, "logps/chosen": -72.60206604003906, "logps/rejected": -67.85148620605469, "loss": 0.6799, "rewards/accuracies": 0.6875, "rewards/chosen": -0.008559314534068108, "rewards/margins": 0.027249369770288467, "rewards/rejected": -0.035808682441711426, "step": 140 }, { "epoch": 0.3038359285985568, "grad_norm": 27.51814079284668, "learning_rate": 9.999944288759615e-08, "logits/chosen": -2.5597286224365234, "logits/rejected": -2.4156954288482666, "logps/chosen": -74.42972564697266, "logps/rejected": -70.92676544189453, "loss": 0.6768, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.00905610155314207, "rewards/margins": 0.03394917771220207, "rewards/rejected": -0.043005283921957016, "step": 150 }, { "epoch": 0.3240916571717939, "grad_norm": 30.097389221191406, "learning_rate": 9.99799452570021e-08, "logits/chosen": -2.508636236190796, "logits/rejected": -2.3848562240600586, "logps/chosen": -77.739013671875, "logps/rejected": -73.92839813232422, "loss": 0.6753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.017720907926559448, "rewards/margins": 0.037171002477407455, "rewards/rejected": -0.054891906678676605, "step": 160 }, { "epoch": 0.34434738574503104, "grad_norm": 32.0848503112793, "learning_rate": 9.993260441994115e-08, "logits/chosen": -2.5097594261169434, "logits/rejected": -2.3447771072387695, "logps/chosen": -81.94526672363281, "logps/rejected": -78.39651489257812, "loss": 0.6634, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.013811466284096241, "rewards/margins": 0.06231771036982536, "rewards/rejected": -0.07612917572259903, "step": 170 }, { "epoch": 0.36460311431826814, "grad_norm": 29.288728713989258, "learning_rate": 9.985744674940535e-08, "logits/chosen": -2.5364279747009277, "logits/rejected": -2.354965925216675, "logps/chosen": -80.46150207519531, "logps/rejected": -75.10428619384766, "loss": 0.6577, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.0209208894520998, "rewards/margins": 0.07542888820171356, "rewards/rejected": -0.09634977579116821, "step": 180 }, { "epoch": 0.38485884289150524, "grad_norm": 27.00816535949707, "learning_rate": 9.975451411479911e-08, "logits/chosen": -2.499474048614502, "logits/rejected": -2.337350606918335, "logps/chosen": -78.61238098144531, "logps/rejected": -78.64549255371094, "loss": 0.657, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.02487110160291195, "rewards/margins": 0.07822562754154205, "rewards/rejected": -0.10309673845767975, "step": 190 }, { "epoch": 0.4051145714647424, "grad_norm": 27.34921646118164, "learning_rate": 9.962386385861412e-08, "logits/chosen": -2.50087308883667, "logits/rejected": -2.360152006149292, "logps/chosen": -76.67208862304688, "logps/rejected": -78.57847595214844, "loss": 0.652, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.038105227053165436, "rewards/margins": 0.08875634521245956, "rewards/rejected": -0.126861572265625, "step": 200 }, { "epoch": 0.4253703000379795, "grad_norm": 25.707185745239258, "learning_rate": 9.946556876448468e-08, "logits/chosen": -2.4654879570007324, "logits/rejected": -2.312530994415283, "logps/chosen": -78.15449523925781, "logps/rejected": -77.42134857177734, "loss": 0.6439, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.03906805440783501, "rewards/margins": 0.10793592780828476, "rewards/rejected": -0.14700399339199066, "step": 210 }, { "epoch": 0.4456260286112166, "grad_norm": 27.729816436767578, "learning_rate": 9.927971701664084e-08, "logits/chosen": -2.4674429893493652, "logits/rejected": -2.3009190559387207, "logps/chosen": -75.07694244384766, "logps/rejected": -75.41253662109375, "loss": 0.6417, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -0.0388757549226284, "rewards/margins": 0.1117323786020279, "rewards/rejected": -0.1506081372499466, "step": 220 }, { "epoch": 0.46588175718445374, "grad_norm": 24.861696243286133, "learning_rate": 9.906641215078196e-08, "logits/chosen": -2.462665557861328, "logits/rejected": -2.309985876083374, "logps/chosen": -77.72923278808594, "logps/rejected": -75.81291961669922, "loss": 0.6384, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.052964676171541214, "rewards/margins": 0.12125100940465927, "rewards/rejected": -0.1742156744003296, "step": 230 }, { "epoch": 0.48613748575769083, "grad_norm": 30.613037109375, "learning_rate": 9.882577299639835e-08, "logits/chosen": -2.4711391925811768, "logits/rejected": -2.329216957092285, "logps/chosen": -80.07206726074219, "logps/rejected": -80.30625915527344, "loss": 0.6333, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.0626656636595726, "rewards/margins": 0.13491353392601013, "rewards/rejected": -0.19757920503616333, "step": 240 }, { "epoch": 0.506393214330928, "grad_norm": 26.08587074279785, "learning_rate": 9.85579336105728e-08, "logits/chosen": -2.443732738494873, "logits/rejected": -2.2919225692749023, "logps/chosen": -81.61358642578125, "logps/rejected": -77.46446990966797, "loss": 0.6342, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.05705754831433296, "rewards/margins": 0.13192656636238098, "rewards/rejected": -0.18898411095142365, "step": 250 }, { "epoch": 0.5266489429041651, "grad_norm": 24.169845581054688, "learning_rate": 9.826304320329907e-08, "logits/chosen": -2.478874683380127, "logits/rejected": -2.297999382019043, "logps/chosen": -83.48451232910156, "logps/rejected": -78.53952026367188, "loss": 0.6226, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.07330447435379028, "rewards/margins": 0.16096489131450653, "rewards/rejected": -0.23426935076713562, "step": 260 }, { "epoch": 0.5469046714774022, "grad_norm": 26.509349822998047, "learning_rate": 9.794126605435884e-08, "logits/chosen": -2.452291488647461, "logits/rejected": -2.2766990661621094, "logps/chosen": -83.7742691040039, "logps/rejected": -80.86663818359375, "loss": 0.6085, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.08826017379760742, "rewards/margins": 0.20091946423053741, "rewards/rejected": -0.28917962312698364, "step": 270 }, { "epoch": 0.5671604000506393, "grad_norm": 23.097434997558594, "learning_rate": 9.759278142180347e-08, "logits/chosen": -2.4537911415100098, "logits/rejected": -2.291194438934326, "logps/chosen": -77.87368774414062, "logps/rejected": -77.5306625366211, "loss": 0.6047, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.08215166628360748, "rewards/margins": 0.20681920647621155, "rewards/rejected": -0.28897085785865784, "step": 280 }, { "epoch": 0.5874161286238765, "grad_norm": 25.596263885498047, "learning_rate": 9.72177834420916e-08, "logits/chosen": -2.4155325889587402, "logits/rejected": -2.2689411640167236, "logps/chosen": -84.03662109375, "logps/rejected": -83.03952026367188, "loss": 0.6053, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12551462650299072, "rewards/margins": 0.20985326170921326, "rewards/rejected": -0.33536791801452637, "step": 290 }, { "epoch": 0.6076718571971136, "grad_norm": 32.68680191040039, "learning_rate": 9.68164810219381e-08, "logits/chosen": -2.4283127784729004, "logits/rejected": -2.3022093772888184, "logps/chosen": -74.84422302246094, "logps/rejected": -76.46062469482422, "loss": 0.6136, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.13987191021442413, "rewards/margins": 0.19149479269981384, "rewards/rejected": -0.33136671781539917, "step": 300 }, { "epoch": 0.6279275857703507, "grad_norm": 26.36361312866211, "learning_rate": 9.638909772193478e-08, "logits/chosen": -2.3842902183532715, "logits/rejected": -2.242034435272217, "logps/chosen": -85.19241333007812, "logps/rejected": -81.81166076660156, "loss": 0.6018, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.12717826664447784, "rewards/margins": 0.21827277541160583, "rewards/rejected": -0.3454510569572449, "step": 310 }, { "epoch": 0.6481833143435878, "grad_norm": 26.719867706298828, "learning_rate": 9.593587163200753e-08, "logits/chosen": -2.4053542613983154, "logits/rejected": -2.277993679046631, "logps/chosen": -82.21893310546875, "logps/rejected": -82.72879791259766, "loss": 0.5946, "rewards/accuracies": 0.765625, "rewards/chosen": -0.14859510958194733, "rewards/margins": 0.2397138774394989, "rewards/rejected": -0.38830894231796265, "step": 320 }, { "epoch": 0.6684390429168249, "grad_norm": 29.379661560058594, "learning_rate": 9.545705523877943e-08, "logits/chosen": -2.39337420463562, "logits/rejected": -2.243393659591675, "logps/chosen": -89.27748107910156, "logps/rejected": -88.36190795898438, "loss": 0.583, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -0.15845103561878204, "rewards/margins": 0.27531546354293823, "rewards/rejected": -0.43376651406288147, "step": 330 }, { "epoch": 0.6886947714900621, "grad_norm": 29.51100730895996, "learning_rate": 9.495291528491348e-08, "logits/chosen": -2.4061636924743652, "logits/rejected": -2.2737927436828613, "logps/chosen": -77.5340805053711, "logps/rejected": -78.92658996582031, "loss": 0.6076, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.18812718987464905, "rewards/margins": 0.22010421752929688, "rewards/rejected": -0.4082314372062683, "step": 340 }, { "epoch": 0.7089505000632992, "grad_norm": 28.85425567626953, "learning_rate": 9.442373262051371e-08, "logits/chosen": -2.3706448078155518, "logits/rejected": -2.207597017288208, "logps/chosen": -84.70035552978516, "logps/rejected": -79.90083312988281, "loss": 0.596, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.1977623850107193, "rewards/margins": 0.24996185302734375, "rewards/rejected": -0.44772419333457947, "step": 350 }, { "epoch": 0.7292062286365363, "grad_norm": 28.540573120117188, "learning_rate": 9.386980204666698e-08, "logits/chosen": -2.369175910949707, "logits/rejected": -2.214489459991455, "logps/chosen": -80.21549224853516, "logps/rejected": -79.48722839355469, "loss": 0.5941, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.21811044216156006, "rewards/margins": 0.2636975646018982, "rewards/rejected": -0.48180800676345825, "step": 360 }, { "epoch": 0.7494619572097734, "grad_norm": 32.249847412109375, "learning_rate": 9.3291432151213e-08, "logits/chosen": -2.3587095737457275, "logits/rejected": -2.218735456466675, "logps/chosen": -85.94621276855469, "logps/rejected": -85.6765365600586, "loss": 0.5842, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.21563701331615448, "rewards/margins": 0.27373427152633667, "rewards/rejected": -0.48937129974365234, "step": 370 }, { "epoch": 0.7697176857830105, "grad_norm": 26.788440704345703, "learning_rate": 9.26889451368339e-08, "logits/chosen": -2.368450164794922, "logits/rejected": -2.2382750511169434, "logps/chosen": -84.80735778808594, "logps/rejected": -83.58937072753906, "loss": 0.5836, "rewards/accuracies": 0.734375, "rewards/chosen": -0.22893838584423065, "rewards/margins": 0.27746590971946716, "rewards/rejected": -0.5064042806625366, "step": 380 }, { "epoch": 0.7899734143562476, "grad_norm": 26.557083129882812, "learning_rate": 9.206267664155906e-08, "logits/chosen": -2.389660120010376, "logits/rejected": -2.2198188304901123, "logps/chosen": -80.32451629638672, "logps/rejected": -81.00599670410156, "loss": 0.5576, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.22655515372753143, "rewards/margins": 0.3575701117515564, "rewards/rejected": -0.584125280380249, "step": 390 }, { "epoch": 0.8102291429294848, "grad_norm": 22.922771453857422, "learning_rate": 9.141297555178535e-08, "logits/chosen": -2.4124135971069336, "logits/rejected": -2.25138521194458, "logps/chosen": -73.63737487792969, "logps/rejected": -74.43919372558594, "loss": 0.5756, "rewards/accuracies": 0.734375, "rewards/chosen": -0.2471962869167328, "rewards/margins": 0.31536445021629333, "rewards/rejected": -0.5625607371330261, "step": 400 }, { "epoch": 0.8304848715027219, "grad_norm": 27.076038360595703, "learning_rate": 9.074020380791693e-08, "logits/chosen": -2.387418270111084, "logits/rejected": -2.233450412750244, "logps/chosen": -75.89783477783203, "logps/rejected": -77.44602966308594, "loss": 0.5727, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2607758641242981, "rewards/margins": 0.3421139121055603, "rewards/rejected": -0.6028897762298584, "step": 410 }, { "epoch": 0.850740600075959, "grad_norm": 28.83265495300293, "learning_rate": 9.004473620273263e-08, "logits/chosen": -2.3343796730041504, "logits/rejected": -2.207730293273926, "logps/chosen": -80.99537658691406, "logps/rejected": -83.44149017333984, "loss": 0.5767, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2705189883708954, "rewards/margins": 0.32222992181777954, "rewards/rejected": -0.5927489399909973, "step": 420 }, { "epoch": 0.8709963286491961, "grad_norm": 29.984909057617188, "learning_rate": 8.932696017259361e-08, "logits/chosen": -2.3199007511138916, "logits/rejected": -2.1576333045959473, "logps/chosen": -85.59019470214844, "logps/rejected": -84.45293426513672, "loss": 0.5712, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.25998255610466003, "rewards/margins": 0.3359551429748535, "rewards/rejected": -0.5959377288818359, "step": 430 }, { "epoch": 0.8912520572224332, "grad_norm": 25.058645248413086, "learning_rate": 8.858727558160743e-08, "logits/chosen": -2.3427436351776123, "logits/rejected": -2.1888678073883057, "logps/chosen": -82.66050720214844, "logps/rejected": -82.61741638183594, "loss": 0.5589, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.26411646604537964, "rewards/margins": 0.3582358956336975, "rewards/rejected": -0.6223524212837219, "step": 440 }, { "epoch": 0.9115077857956704, "grad_norm": 27.469467163085938, "learning_rate": 8.782609449886861e-08, "logits/chosen": -2.325899839401245, "logits/rejected": -2.1819796562194824, "logps/chosen": -82.68738555908203, "logps/rejected": -82.69558715820312, "loss": 0.5571, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.27801764011383057, "rewards/margins": 0.37557533383369446, "rewards/rejected": -0.6535929441452026, "step": 450 }, { "epoch": 0.9317635143689075, "grad_norm": 25.38697052001953, "learning_rate": 8.704384096890013e-08, "logits/chosen": -2.3276028633117676, "logits/rejected": -2.1806609630584717, "logps/chosen": -84.02021789550781, "logps/rejected": -83.25233459472656, "loss": 0.5403, "rewards/accuracies": 0.784375011920929, "rewards/chosen": -0.2819129526615143, "rewards/margins": 0.41642332077026367, "rewards/rejected": -0.6983363032341003, "step": 460 }, { "epoch": 0.9520192429421446, "grad_norm": 32.92365264892578, "learning_rate": 8.62409507754235e-08, "logits/chosen": -2.2575485706329346, "logits/rejected": -2.1456449031829834, "logps/chosen": -87.51969909667969, "logps/rejected": -87.74186706542969, "loss": 0.5594, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.3425014913082123, "rewards/margins": 0.37962430715560913, "rewards/rejected": -0.7221258878707886, "step": 470 }, { "epoch": 0.9722749715153817, "grad_norm": 29.529056549072266, "learning_rate": 8.541787119858902e-08, "logits/chosen": -2.302694082260132, "logits/rejected": -2.162090539932251, "logps/chosen": -79.02600860595703, "logps/rejected": -78.34095764160156, "loss": 0.5721, "rewards/accuracies": 0.734375, "rewards/chosen": -0.35388362407684326, "rewards/margins": 0.3509567975997925, "rewards/rejected": -0.7048403024673462, "step": 480 }, { "epoch": 0.9925307000886188, "grad_norm": 23.92174530029297, "learning_rate": 8.457506076580162e-08, "logits/chosen": -2.3030953407287598, "logits/rejected": -2.158973217010498, "logps/chosen": -81.10468292236328, "logps/rejected": -83.77845764160156, "loss": 0.547, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.3333788514137268, "rewards/margins": 0.4156631529331207, "rewards/rejected": -0.7490419745445251, "step": 490 }, { "epoch": 1.012786428661856, "grad_norm": 30.66814613342285, "learning_rate": 8.371298899628089e-08, "logits/chosen": -2.2549185752868652, "logits/rejected": -2.122537612915039, "logps/chosen": -83.01513671875, "logps/rejected": -86.776123046875, "loss": 0.5316, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.380868136882782, "rewards/margins": 0.471442848443985, "rewards/rejected": -0.8523109555244446, "step": 500 }, { "epoch": 1.033042157235093, "grad_norm": 26.109542846679688, "learning_rate": 8.28321361394978e-08, "logits/chosen": -2.2775070667266846, "logits/rejected": -2.1255202293395996, "logps/chosen": -81.81797790527344, "logps/rejected": -84.06095123291016, "loss": 0.5349, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.3651012182235718, "rewards/margins": 0.4572354853153229, "rewards/rejected": -0.8223366737365723, "step": 510 }, { "epoch": 1.0532978858083302, "grad_norm": 28.901084899902344, "learning_rate": 8.193299290763362e-08, "logits/chosen": -2.2764482498168945, "logits/rejected": -2.128359317779541, "logps/chosen": -83.87080383300781, "logps/rejected": -83.10397338867188, "loss": 0.5413, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.38391047716140747, "rewards/margins": 0.43836045265197754, "rewards/rejected": -0.8222709894180298, "step": 520 }, { "epoch": 1.0735536143815674, "grad_norm": 26.954652786254883, "learning_rate": 8.101606020221038e-08, "logits/chosen": -2.26556134223938, "logits/rejected": -2.12338924407959, "logps/chosen": -85.02649688720703, "logps/rejected": -85.88096618652344, "loss": 0.5519, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -0.4198201298713684, "rewards/margins": 0.4140090048313141, "rewards/rejected": -0.8338291049003601, "step": 530 }, { "epoch": 1.0938093429548044, "grad_norm": 27.471698760986328, "learning_rate": 8.008184883504472e-08, "logits/chosen": -2.285780668258667, "logits/rejected": -2.15956449508667, "logps/chosen": -91.95730590820312, "logps/rejected": -89.56153869628906, "loss": 0.5388, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.37540143728256226, "rewards/margins": 0.4454117715358734, "rewards/rejected": -0.8208131790161133, "step": 540 }, { "epoch": 1.1140650715280416, "grad_norm": 27.839784622192383, "learning_rate": 7.913087924368102e-08, "logits/chosen": -2.272618055343628, "logits/rejected": -2.146136522293091, "logps/chosen": -82.60453033447266, "logps/rejected": -84.6351547241211, "loss": 0.5466, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.3854660391807556, "rewards/margins": 0.4508994221687317, "rewards/rejected": -0.8363655209541321, "step": 550 }, { "epoch": 1.1343208001012786, "grad_norm": 29.220504760742188, "learning_rate": 7.816368120146224e-08, "logits/chosen": -2.2264904975891113, "logits/rejected": -2.1163620948791504, "logps/chosen": -80.90209197998047, "logps/rejected": -84.96456909179688, "loss": 0.5371, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.3850155472755432, "rewards/margins": 0.47849899530410767, "rewards/rejected": -0.8635146021842957, "step": 560 }, { "epoch": 1.1545765286745158, "grad_norm": 24.049087524414062, "learning_rate": 7.718079352239955e-08, "logits/chosen": -2.2715275287628174, "logits/rejected": -2.095773220062256, "logps/chosen": -83.15594482421875, "logps/rejected": -83.11366271972656, "loss": 0.5244, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.38344329595565796, "rewards/margins": 0.4991793632507324, "rewards/rejected": -0.8826227188110352, "step": 570 }, { "epoch": 1.174832257247753, "grad_norm": 32.831993103027344, "learning_rate": 7.618276376100587e-08, "logits/chosen": -2.2670161724090576, "logits/rejected": -2.106199264526367, "logps/chosen": -81.29884338378906, "logps/rejected": -82.74469757080078, "loss": 0.5303, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4227599501609802, "rewards/margins": 0.4853205680847168, "rewards/rejected": -0.9080804586410522, "step": 580 }, { "epoch": 1.19508798582099, "grad_norm": 26.217998504638672, "learning_rate": 7.517014790726011e-08, "logits/chosen": -2.2205467224121094, "logits/rejected": -2.0953052043914795, "logps/chosen": -83.64337921142578, "logps/rejected": -86.88542938232422, "loss": 0.5513, "rewards/accuracies": 0.734375, "rewards/chosen": -0.43212181329727173, "rewards/margins": 0.44605493545532227, "rewards/rejected": -0.8781768083572388, "step": 590 }, { "epoch": 1.2153437143942272, "grad_norm": 22.538532257080078, "learning_rate": 7.414351007687187e-08, "logits/chosen": -2.205540418624878, "logits/rejected": -2.0615344047546387, "logps/chosen": -88.0082015991211, "logps/rejected": -85.7721176147461, "loss": 0.5173, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.4331473708152771, "rewards/margins": 0.5372229814529419, "rewards/rejected": -0.9703702926635742, "step": 600 }, { "epoch": 1.2355994429674642, "grad_norm": 29.271059036254883, "learning_rate": 7.310342219701981e-08, "logits/chosen": -2.2468390464782715, "logits/rejected": -2.107861280441284, "logps/chosen": -82.70673370361328, "logps/rejected": -83.1991958618164, "loss": 0.5279, "rewards/accuracies": 0.75, "rewards/chosen": -0.4486440122127533, "rewards/margins": 0.5201537013053894, "rewards/rejected": -0.9687976837158203, "step": 610 }, { "epoch": 1.2558551715407014, "grad_norm": 26.889476776123047, "learning_rate": 7.205046368773794e-08, "logits/chosen": -2.1630682945251465, "logits/rejected": -2.018644332885742, "logps/chosen": -86.0914535522461, "logps/rejected": -87.1644515991211, "loss": 0.5364, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.48391470313072205, "rewards/margins": 0.474750280380249, "rewards/rejected": -0.9586650133132935, "step": 620 }, { "epoch": 1.2761109001139386, "grad_norm": 32.43236541748047, "learning_rate": 7.098522113912808e-08, "logits/chosen": -2.2398154735565186, "logits/rejected": -2.0971333980560303, "logps/chosen": -81.19099426269531, "logps/rejected": -82.07807922363281, "loss": 0.5372, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.441434383392334, "rewards/margins": 0.5117905735969543, "rewards/rejected": -0.9532249569892883, "step": 630 }, { "epoch": 1.2963666286871756, "grad_norm": 26.83734130859375, "learning_rate": 6.990828798457764e-08, "logits/chosen": -2.2109534740448, "logits/rejected": -2.083139657974243, "logps/chosen": -80.3914566040039, "logps/rejected": -87.99849700927734, "loss": 0.5314, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4658758044242859, "rewards/margins": 0.5169892311096191, "rewards/rejected": -0.9828651547431946, "step": 640 }, { "epoch": 1.3166223572604128, "grad_norm": 25.39097785949707, "learning_rate": 6.882026417016541e-08, "logits/chosen": -2.230027437210083, "logits/rejected": -2.100419759750366, "logps/chosen": -82.01457214355469, "logps/rejected": -83.42109680175781, "loss": 0.5346, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.4833486080169678, "rewards/margins": 0.4825879633426666, "rewards/rejected": -0.9659366607666016, "step": 650 }, { "epoch": 1.3368780858336498, "grad_norm": 30.31522560119629, "learning_rate": 6.772175582043889e-08, "logits/chosen": -2.1776843070983887, "logits/rejected": -2.039802074432373, "logps/chosen": -83.42335510253906, "logps/rejected": -88.11833953857422, "loss": 0.4909, "rewards/accuracies": 0.828125, "rewards/chosen": -0.45507392287254333, "rewards/margins": 0.6273630857467651, "rewards/rejected": -1.0824369192123413, "step": 660 }, { "epoch": 1.357133814406887, "grad_norm": 34.97041702270508, "learning_rate": 6.661337490075003e-08, "logits/chosen": -2.2355475425720215, "logits/rejected": -2.095804452896118, "logps/chosen": -83.29669952392578, "logps/rejected": -84.55702209472656, "loss": 0.5307, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.5031043291091919, "rewards/margins": 0.5249863862991333, "rewards/rejected": -1.0280907154083252, "step": 670 }, { "epoch": 1.377389542980124, "grad_norm": 26.33791160583496, "learning_rate": 6.549573887633676e-08, "logits/chosen": -2.1734795570373535, "logits/rejected": -2.028352737426758, "logps/chosen": -83.61205291748047, "logps/rejected": -84.9587173461914, "loss": 0.4813, "rewards/accuracies": 0.8125, "rewards/chosen": -0.46062904596328735, "rewards/margins": 0.6703583002090454, "rewards/rejected": -1.1309874057769775, "step": 680 }, { "epoch": 1.3976452715533612, "grad_norm": 35.68913269042969, "learning_rate": 6.436947036834086e-08, "logits/chosen": -2.1777005195617676, "logits/rejected": -2.054405689239502, "logps/chosen": -83.69397735595703, "logps/rejected": -86.34680938720703, "loss": 0.5313, "rewards/accuracies": 0.765625, "rewards/chosen": -0.518213152885437, "rewards/margins": 0.5348538756370544, "rewards/rejected": -1.0530669689178467, "step": 690 }, { "epoch": 1.4179010001265984, "grad_norm": 30.295581817626953, "learning_rate": 6.323519680695349e-08, "logits/chosen": -2.1419105529785156, "logits/rejected": -1.9936020374298096, "logps/chosen": -90.16146087646484, "logps/rejected": -89.37017822265625, "loss": 0.522, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5005131363868713, "rewards/margins": 0.5668459534645081, "rewards/rejected": -1.067359209060669, "step": 700 }, { "epoch": 1.4381567286998354, "grad_norm": 26.36173439025879, "learning_rate": 6.209355008188152e-08, "logits/chosen": -2.1437783241271973, "logits/rejected": -2.0539603233337402, "logps/chosen": -89.0562973022461, "logps/rejected": -92.04231262207031, "loss": 0.5409, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5396249294281006, "rewards/margins": 0.5132231116294861, "rewards/rejected": -1.0528481006622314, "step": 710 }, { "epoch": 1.4584124572730726, "grad_norm": 27.820674896240234, "learning_rate": 6.094516619032975e-08, "logits/chosen": -2.1499810218811035, "logits/rejected": -2.025269031524658, "logps/chosen": -83.47040557861328, "logps/rejected": -86.60690307617188, "loss": 0.5196, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.49484682083129883, "rewards/margins": 0.5879716277122498, "rewards/rejected": -1.0828183889389038, "step": 720 }, { "epoch": 1.4786681858463098, "grad_norm": 24.657302856445312, "learning_rate": 5.979068488269468e-08, "logits/chosen": -2.1996073722839355, "logits/rejected": -2.0537047386169434, "logps/chosen": -86.91001892089844, "logps/rejected": -89.12824249267578, "loss": 0.4914, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.542976975440979, "rewards/margins": 0.6267004013061523, "rewards/rejected": -1.169677495956421, "step": 730 }, { "epoch": 1.4989239144195468, "grad_norm": 28.313621520996094, "learning_rate": 5.8630749306167556e-08, "logits/chosen": -2.1813175678253174, "logits/rejected": -2.0757999420166016, "logps/chosen": -83.51826477050781, "logps/rejected": -89.63159942626953, "loss": 0.5115, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.515524685382843, "rewards/margins": 0.594115674495697, "rewards/rejected": -1.1096404790878296, "step": 740 }, { "epoch": 1.5191796429927837, "grad_norm": 29.172090530395508, "learning_rate": 5.7466005646445095e-08, "logits/chosen": -2.1559250354766846, "logits/rejected": -2.0057528018951416, "logps/chosen": -83.4648208618164, "logps/rejected": -85.3985366821289, "loss": 0.4947, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5010935068130493, "rewards/margins": 0.6447556018829346, "rewards/rejected": -1.1458488702774048, "step": 750 }, { "epoch": 1.5394353715660212, "grad_norm": 30.766334533691406, "learning_rate": 5.6297102767747325e-08, "logits/chosen": -2.1724162101745605, "logits/rejected": -2.0574355125427246, "logps/chosen": -90.57199096679688, "logps/rejected": -92.65487670898438, "loss": 0.5309, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5361162424087524, "rewards/margins": 0.5525388121604919, "rewards/rejected": -1.0886551141738892, "step": 760 }, { "epoch": 1.5596911001392582, "grad_norm": 32.11360549926758, "learning_rate": 5.512469185134354e-08, "logits/chosen": -2.1918747425079346, "logits/rejected": -2.054835796356201, "logps/chosen": -84.68449401855469, "logps/rejected": -87.59661865234375, "loss": 0.4972, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.5611705780029297, "rewards/margins": 0.6253485083580017, "rewards/rejected": -1.1865190267562866, "step": 770 }, { "epoch": 1.5799468287124951, "grad_norm": 28.335988998413086, "learning_rate": 5.394942603278726e-08, "logits/chosen": -2.1388983726501465, "logits/rejected": -2.0282931327819824, "logps/chosen": -89.77628326416016, "logps/rejected": -90.62747955322266, "loss": 0.5258, "rewards/accuracies": 0.75, "rewards/chosen": -0.524773895740509, "rewards/margins": 0.5710369944572449, "rewards/rejected": -1.095810890197754, "step": 780 }, { "epoch": 1.6002025572857324, "grad_norm": 25.966876983642578, "learning_rate": 5.277196003806249e-08, "logits/chosen": -2.2010576725006104, "logits/rejected": -2.0596330165863037, "logps/chosen": -79.80914306640625, "logps/rejected": -82.98551940917969, "loss": 0.4784, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.4947393536567688, "rewards/margins": 0.6838704347610474, "rewards/rejected": -1.1786099672317505, "step": 790 }, { "epoch": 1.6204582858589696, "grad_norm": 29.524734497070312, "learning_rate": 5.1592949818844046e-08, "logits/chosen": -2.1284611225128174, "logits/rejected": -2.0030832290649414, "logps/chosen": -86.19153594970703, "logps/rejected": -87.28998565673828, "loss": 0.5126, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.6002845764160156, "rewards/margins": 0.6240141987800598, "rewards/rejected": -1.2242988348007202, "step": 800 }, { "epoch": 1.6407140144322065, "grad_norm": 25.266782760620117, "learning_rate": 5.0413052187075054e-08, "logits/chosen": -2.168487787246704, "logits/rejected": -2.0182714462280273, "logps/chosen": -79.49079132080078, "logps/rejected": -81.5199203491211, "loss": 0.5011, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.48622050881385803, "rewards/margins": 0.6199517846107483, "rewards/rejected": -1.1061723232269287, "step": 810 }, { "epoch": 1.6609697430054438, "grad_norm": 26.315034866333008, "learning_rate": 4.9232924449065095e-08, "logits/chosen": -2.1593496799468994, "logits/rejected": -2.030149459838867, "logps/chosen": -81.82089233398438, "logps/rejected": -89.67662811279297, "loss": 0.5042, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.5738162994384766, "rewards/margins": 0.6879658102989197, "rewards/rejected": -1.261782169342041, "step": 820 }, { "epoch": 1.681225471578681, "grad_norm": 33.470664978027344, "learning_rate": 4.8053224039313114e-08, "logits/chosen": -2.1270673274993896, "logits/rejected": -2.012338638305664, "logps/chosen": -86.01063537597656, "logps/rejected": -85.80992889404297, "loss": 0.5459, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.535088837146759, "rewards/margins": 0.49604400992393494, "rewards/rejected": -1.0311328172683716, "step": 830 }, { "epoch": 1.701481200151918, "grad_norm": 20.869911193847656, "learning_rate": 4.687460815425878e-08, "logits/chosen": -2.157341480255127, "logits/rejected": -2.007072925567627, "logps/chosen": -80.98677825927734, "logps/rejected": -83.40042114257812, "loss": 0.4965, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.568179190158844, "rewards/margins": 0.660231351852417, "rewards/rejected": -1.2284104824066162, "step": 840 }, { "epoch": 1.721736928725155, "grad_norm": 23.50938606262207, "learning_rate": 4.5697733386166524e-08, "logits/chosen": -2.1210384368896484, "logits/rejected": -1.9905798435211182, "logps/chosen": -87.40711975097656, "logps/rejected": -86.93902587890625, "loss": 0.5181, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5617082715034485, "rewards/margins": 0.592042088508606, "rewards/rejected": -1.1537501811981201, "step": 850 }, { "epoch": 1.7419926572983924, "grad_norm": 27.28333282470703, "learning_rate": 4.4523255357346187e-08, "logits/chosen": -2.1478943824768066, "logits/rejected": -2.024747371673584, "logps/chosen": -84.65662384033203, "logps/rejected": -88.84449005126953, "loss": 0.4993, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5495315790176392, "rewards/margins": 0.6314437985420227, "rewards/rejected": -1.1809751987457275, "step": 860 }, { "epoch": 1.7622483858716294, "grad_norm": 25.31683349609375, "learning_rate": 4.335182835491387e-08, "logits/chosen": -2.1592297554016113, "logits/rejected": -2.031510829925537, "logps/chosen": -83.72755432128906, "logps/rejected": -90.34858703613281, "loss": 0.5023, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -0.6199240684509277, "rewards/margins": 0.634971022605896, "rewards/rejected": -1.2548949718475342, "step": 870 }, { "epoch": 1.7825041144448663, "grad_norm": 29.31256103515625, "learning_rate": 4.218410496629684e-08, "logits/chosen": -2.1241517066955566, "logits/rejected": -1.9871305227279663, "logps/chosen": -76.77335357666016, "logps/rejected": -80.37085723876953, "loss": 0.4854, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5658852458000183, "rewards/margins": 0.6777737736701965, "rewards/rejected": -1.2436590194702148, "step": 880 }, { "epoch": 1.8027598430181035, "grad_norm": 29.485347747802734, "learning_rate": 4.102073571568516e-08, "logits/chosen": -2.1224985122680664, "logits/rejected": -1.993857741355896, "logps/chosen": -86.9496078491211, "logps/rejected": -88.30699157714844, "loss": 0.506, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -0.5687106847763062, "rewards/margins": 0.6147977709770203, "rewards/rejected": -1.1835086345672607, "step": 890 }, { "epoch": 1.8230155715913408, "grad_norm": 27.672901153564453, "learning_rate": 3.986236870163262e-08, "logits/chosen": -2.1112308502197266, "logits/rejected": -1.99734628200531, "logps/chosen": -85.15098571777344, "logps/rejected": -90.8910140991211, "loss": 0.5078, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5743888020515442, "rewards/margins": 0.6254476308822632, "rewards/rejected": -1.1998364925384521, "step": 900 }, { "epoch": 1.8432713001645777, "grad_norm": 28.670635223388672, "learning_rate": 3.870964923600923e-08, "logits/chosen": -2.088013172149658, "logits/rejected": -1.9703779220581055, "logps/chosen": -83.16795349121094, "logps/rejected": -86.55892181396484, "loss": 0.496, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5139526724815369, "rewards/margins": 0.6616466045379639, "rewards/rejected": -1.1755993366241455, "step": 910 }, { "epoch": 1.863527028737815, "grad_norm": 30.791370391845703, "learning_rate": 3.756321948450599e-08, "logits/chosen": -2.120954751968384, "logits/rejected": -1.9975354671478271, "logps/chosen": -84.26751708984375, "logps/rejected": -86.91252899169922, "loss": 0.5453, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6508383750915527, "rewards/margins": 0.5812313556671143, "rewards/rejected": -1.232069730758667, "step": 920 }, { "epoch": 1.8837827573110522, "grad_norm": 33.708187103271484, "learning_rate": 3.642371810889222e-08, "logits/chosen": -2.092048168182373, "logits/rejected": -1.9911048412322998, "logps/chosen": -84.46561431884766, "logps/rejected": -88.94342041015625, "loss": 0.5404, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5762837529182434, "rewards/margins": 0.5255244970321655, "rewards/rejected": -1.1018081903457642, "step": 930 }, { "epoch": 1.9040384858842891, "grad_norm": 25.534833908081055, "learning_rate": 3.529177991122518e-08, "logits/chosen": -2.066344738006592, "logits/rejected": -1.9486182928085327, "logps/chosen": -91.85676574707031, "logps/rejected": -94.47185516357422, "loss": 0.493, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5671579837799072, "rewards/margins": 0.6500786542892456, "rewards/rejected": -1.2172366380691528, "step": 940 }, { "epoch": 1.9242942144575261, "grad_norm": 26.457944869995117, "learning_rate": 3.416803548020969e-08, "logits/chosen": -2.115591049194336, "logits/rejected": -1.9885094165802002, "logps/chosen": -89.46985626220703, "logps/rejected": -93.4120864868164, "loss": 0.5111, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.5443202257156372, "rewards/margins": 0.6258162260055542, "rewards/rejected": -1.1701364517211914, "step": 950 }, { "epoch": 1.9445499430307633, "grad_norm": 27.294607162475586, "learning_rate": 3.305311083990496e-08, "logits/chosen": -2.1644487380981445, "logits/rejected": -2.040801525115967, "logps/chosen": -76.92610931396484, "logps/rejected": -82.45623016357422, "loss": 0.5196, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6151694655418396, "rewards/margins": 0.5893052220344543, "rewards/rejected": -1.204474687576294, "step": 960 }, { "epoch": 1.9648056716040005, "grad_norm": 27.27229881286621, "learning_rate": 3.194762710097436e-08, "logits/chosen": -2.1350479125976562, "logits/rejected": -2.030987501144409, "logps/chosen": -80.64533996582031, "logps/rejected": -85.28868103027344, "loss": 0.5371, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.5596300363540649, "rewards/margins": 0.5202323794364929, "rewards/rejected": -1.0798624753952026, "step": 970 }, { "epoch": 1.9850614001772375, "grad_norm": 24.55060386657715, "learning_rate": 3.0852200114672453e-08, "logits/chosen": -2.127375602722168, "logits/rejected": -1.991199254989624, "logps/chosen": -85.77812957763672, "logps/rejected": -89.46642303466797, "loss": 0.5081, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5709745287895203, "rewards/margins": 0.6586155891418457, "rewards/rejected": -1.2295901775360107, "step": 980 }, { "epoch": 2.005317128750475, "grad_norm": 25.645376205444336, "learning_rate": 2.976744012976189e-08, "logits/chosen": -2.1159276962280273, "logits/rejected": -1.9866254329681396, "logps/chosen": -87.69465637207031, "logps/rejected": -88.93052673339844, "loss": 0.4894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5695109367370605, "rewards/margins": 0.6490300893783569, "rewards/rejected": -1.218540906906128, "step": 990 }, { "epoch": 2.025572857323712, "grad_norm": 27.878236770629883, "learning_rate": 2.8693951452551307e-08, "logits/chosen": -2.0782949924468994, "logits/rejected": -1.9823192358016968, "logps/chosen": -79.88198852539062, "logps/rejected": -86.08316802978516, "loss": 0.5315, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5980249643325806, "rewards/margins": 0.5769001245498657, "rewards/rejected": -1.1749250888824463, "step": 1000 }, { "epoch": 2.045828585896949, "grad_norm": 28.799379348754883, "learning_rate": 2.7632332110243967e-08, "logits/chosen": -2.0895416736602783, "logits/rejected": -1.975619912147522, "logps/chosen": -86.46625518798828, "logps/rejected": -90.41603088378906, "loss": 0.4972, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.5257114171981812, "rewards/margins": 0.6659296751022339, "rewards/rejected": -1.1916412115097046, "step": 1010 }, { "epoch": 2.066084314470186, "grad_norm": 48.88608169555664, "learning_rate": 2.658317351778412e-08, "logits/chosen": -2.099612236022949, "logits/rejected": -1.9862468242645264, "logps/chosen": -86.78905487060547, "logps/rejected": -92.29573059082031, "loss": 0.4904, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.564150869846344, "rewards/margins": 0.7153151631355286, "rewards/rejected": -1.279465913772583, "step": 1020 }, { "epoch": 2.0863400430434234, "grad_norm": 24.239578247070312, "learning_rate": 2.554706014838705e-08, "logits/chosen": -2.1574556827545166, "logits/rejected": -2.014895439147949, "logps/chosen": -84.7563247680664, "logps/rejected": -85.77194213867188, "loss": 0.4855, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5780550837516785, "rewards/margins": 0.6501516103744507, "rewards/rejected": -1.2282066345214844, "step": 1030 }, { "epoch": 2.1065957716166603, "grad_norm": 28.262638092041016, "learning_rate": 2.4524569207936445e-08, "logits/chosen": -2.0934982299804688, "logits/rejected": -1.962937355041504, "logps/chosen": -86.09654235839844, "logps/rejected": -91.18133544921875, "loss": 0.4598, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -0.567767322063446, "rewards/margins": 0.8064430356025696, "rewards/rejected": -1.374210238456726, "step": 1040 }, { "epoch": 2.1268515001898973, "grad_norm": 30.161561965942383, "learning_rate": 2.351627031343008e-08, "logits/chosen": -2.134225368499756, "logits/rejected": -1.9962198734283447, "logps/chosen": -87.08121490478516, "logps/rejected": -90.91963958740234, "loss": 0.5068, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -0.5947022438049316, "rewards/margins": 0.6418746113777161, "rewards/rejected": -1.236576795578003, "step": 1050 }, { "epoch": 2.1471072287631348, "grad_norm": 29.3469295501709, "learning_rate": 2.2522725175653233e-08, "logits/chosen": -2.0764639377593994, "logits/rejected": -1.9766466617584229, "logps/chosen": -87.24481964111328, "logps/rejected": -93.39856719970703, "loss": 0.533, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.6166614294052124, "rewards/margins": 0.5650585293769836, "rewards/rejected": -1.1817197799682617, "step": 1060 }, { "epoch": 2.1673629573363717, "grad_norm": 23.277862548828125, "learning_rate": 2.154448728625668e-08, "logits/chosen": -2.1141998767852783, "logits/rejected": -1.9909133911132812, "logps/chosen": -84.21327209472656, "logps/rejected": -86.62323760986328, "loss": 0.4652, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5331219434738159, "rewards/margins": 0.7241859436035156, "rewards/rejected": -1.257307767868042, "step": 1070 }, { "epoch": 2.1876186859096087, "grad_norm": 26.303924560546875, "learning_rate": 2.0582101609413333e-08, "logits/chosen": -2.0846378803253174, "logits/rejected": -1.9430017471313477, "logps/chosen": -90.30846405029297, "logps/rejected": -91.7738265991211, "loss": 0.483, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5784596800804138, "rewards/margins": 0.7019392251968384, "rewards/rejected": -1.2803988456726074, "step": 1080 }, { "epoch": 2.2078744144828457, "grad_norm": 29.66044044494629, "learning_rate": 1.9636104278225413e-08, "logits/chosen": -2.113520860671997, "logits/rejected": -2.006913661956787, "logps/chosen": -87.39739990234375, "logps/rejected": -91.94351959228516, "loss": 0.4885, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6303068995475769, "rewards/margins": 0.7132034301757812, "rewards/rejected": -1.343510389328003, "step": 1090 }, { "epoch": 2.228130143056083, "grad_norm": 31.82991600036621, "learning_rate": 1.8707022296051462e-08, "logits/chosen": -2.1318724155426025, "logits/rejected": -1.997719407081604, "logps/chosen": -83.96778869628906, "logps/rejected": -91.7020492553711, "loss": 0.4837, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5827969312667847, "rewards/margins": 0.7294033765792847, "rewards/rejected": -1.3122001886367798, "step": 1100 }, { "epoch": 2.24838587162932, "grad_norm": 30.810279846191406, "learning_rate": 1.779537324291926e-08, "logits/chosen": -2.087120771408081, "logits/rejected": -1.974585771560669, "logps/chosen": -85.29585266113281, "logps/rejected": -90.92556762695312, "loss": 0.5036, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6058118939399719, "rewards/margins": 0.6656503081321716, "rewards/rejected": -1.2714622020721436, "step": 1110 }, { "epoch": 2.268641600202557, "grad_norm": 24.00498390197754, "learning_rate": 1.6901664987188425e-08, "logits/chosen": -2.0903851985931396, "logits/rejected": -1.9751678705215454, "logps/chosen": -83.34523010253906, "logps/rejected": -86.21971130371094, "loss": 0.5044, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5954999923706055, "rewards/margins": 0.6783249378204346, "rewards/rejected": -1.27382493019104, "step": 1120 }, { "epoch": 2.2888973287757945, "grad_norm": 39.50699234008789, "learning_rate": 1.6026395402623272e-08, "logits/chosen": -2.0663511753082275, "logits/rejected": -1.9365609884262085, "logps/chosen": -94.30004119873047, "logps/rejected": -97.46401977539062, "loss": 0.4973, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.6117586493492126, "rewards/margins": 0.6866195797920227, "rewards/rejected": -1.2983782291412354, "step": 1130 }, { "epoch": 2.3091530573490315, "grad_norm": 26.588275909423828, "learning_rate": 1.5170052091033552e-08, "logits/chosen": -2.1126387119293213, "logits/rejected": -1.959896445274353, "logps/chosen": -80.80674743652344, "logps/rejected": -83.63319396972656, "loss": 0.4726, "rewards/accuracies": 0.8218749761581421, "rewards/chosen": -0.616263747215271, "rewards/margins": 0.7580442428588867, "rewards/rejected": -1.3743079900741577, "step": 1140 }, { "epoch": 2.3294087859222685, "grad_norm": 28.325511932373047, "learning_rate": 1.4333112110637453e-08, "logits/chosen": -2.064669609069824, "logits/rejected": -1.9410665035247803, "logps/chosen": -84.78388977050781, "logps/rejected": -87.63703155517578, "loss": 0.493, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5784262418746948, "rewards/margins": 0.691449761390686, "rewards/rejected": -1.2698760032653809, "step": 1150 }, { "epoch": 2.349664514495506, "grad_norm": 27.597017288208008, "learning_rate": 1.3516041710298498e-08, "logits/chosen": -2.1402578353881836, "logits/rejected": -2.004826068878174, "logps/chosen": -87.74010467529297, "logps/rejected": -89.4506607055664, "loss": 0.5047, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5920398831367493, "rewards/margins": 0.6377407908439636, "rewards/rejected": -1.2297805547714233, "step": 1160 }, { "epoch": 2.369920243068743, "grad_norm": 33.10106658935547, "learning_rate": 1.2719296069784063e-08, "logits/chosen": -2.062407970428467, "logits/rejected": -1.9447336196899414, "logps/chosen": -89.19010925292969, "logps/rejected": -95.0318374633789, "loss": 0.4953, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.5857541561126709, "rewards/margins": 0.7076044082641602, "rewards/rejected": -1.293358564376831, "step": 1170 }, { "epoch": 2.39017597164198, "grad_norm": 33.07532501220703, "learning_rate": 1.1943319046190332e-08, "logits/chosen": -2.074035167694092, "logits/rejected": -1.965685486793518, "logps/chosen": -80.5416030883789, "logps/rejected": -84.71125030517578, "loss": 0.4871, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.5754114389419556, "rewards/margins": 0.7324446439743042, "rewards/rejected": -1.3078559637069702, "step": 1180 }, { "epoch": 2.4104317002152174, "grad_norm": 26.195051193237305, "learning_rate": 1.1188542926675104e-08, "logits/chosen": -2.117806911468506, "logits/rejected": -1.9781955480575562, "logps/chosen": -86.0428466796875, "logps/rejected": -90.29086303710938, "loss": 0.4505, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.5670899152755737, "rewards/margins": 0.8041001558303833, "rewards/rejected": -1.371190071105957, "step": 1190 }, { "epoch": 2.4306874287884543, "grad_norm": 23.817218780517578, "learning_rate": 1.0455388187635933e-08, "logits/chosen": -2.1228573322296143, "logits/rejected": -1.9943969249725342, "logps/chosen": -81.49883270263672, "logps/rejected": -83.27964782714844, "loss": 0.4844, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -0.6024131774902344, "rewards/margins": 0.6800934076309204, "rewards/rejected": -1.2825065851211548, "step": 1200 }, { "epoch": 2.4509431573616913, "grad_norm": 30.316747665405273, "learning_rate": 9.744263260468005e-09, "logits/chosen": -2.059378147125244, "logits/rejected": -1.9458458423614502, "logps/chosen": -92.07587432861328, "logps/rejected": -95.30807495117188, "loss": 0.4914, "rewards/accuracies": 0.7906249761581421, "rewards/chosen": -0.5844647884368896, "rewards/margins": 0.7100616097450256, "rewards/rejected": -1.2945263385772705, "step": 1210 }, { "epoch": 2.4711988859349283, "grad_norm": 28.162200927734375, "learning_rate": 9.055564304031981e-09, "logits/chosen": -2.082139730453491, "logits/rejected": -1.9701576232910156, "logps/chosen": -86.1491928100586, "logps/rejected": -92.10936737060547, "loss": 0.4954, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -0.5959383845329285, "rewards/margins": 0.7093779444694519, "rewards/rejected": -1.3053163290023804, "step": 1220 }, { "epoch": 2.4914546145081657, "grad_norm": 29.766904830932617, "learning_rate": 8.38967498395895e-09, "logits/chosen": -2.1094155311584473, "logits/rejected": -1.9789727926254272, "logps/chosen": -80.73603820800781, "logps/rejected": -84.64231872558594, "loss": 0.4904, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5383921265602112, "rewards/margins": 0.6760575175285339, "rewards/rejected": -1.2144496440887451, "step": 1230 }, { "epoch": 2.5117103430814027, "grad_norm": 35.819358825683594, "learning_rate": 7.746966258914988e-09, "logits/chosen": -2.1154112815856934, "logits/rejected": -1.9815971851348877, "logps/chosen": -86.95155334472656, "logps/rejected": -85.64454650878906, "loss": 0.5197, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6145626306533813, "rewards/margins": 0.631356418132782, "rewards/rejected": -1.2459189891815186, "step": 1240 }, { "epoch": 2.5319660716546397, "grad_norm": 29.06343650817871, "learning_rate": 7.127796173944695e-09, "logits/chosen": -2.085669994354248, "logits/rejected": -1.9675817489624023, "logps/chosen": -89.59977722167969, "logps/rejected": -91.9139404296875, "loss": 0.4879, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5644815564155579, "rewards/margins": 0.7162417769432068, "rewards/rejected": -1.2807233333587646, "step": 1250 }, { "epoch": 2.552221800227877, "grad_norm": 32.731590270996094, "learning_rate": 6.532509661008789e-09, "logits/chosen": -2.1157116889953613, "logits/rejected": -1.9942877292633057, "logps/chosen": -81.25712585449219, "logps/rejected": -85.86751556396484, "loss": 0.4855, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5965838432312012, "rewards/margins": 0.7261613011360168, "rewards/rejected": -1.3227452039718628, "step": 1260 }, { "epoch": 2.572477528801114, "grad_norm": 21.976213455200195, "learning_rate": 5.9614383468267916e-09, "logits/chosen": -2.064387321472168, "logits/rejected": -1.939223289489746, "logps/chosen": -87.50946807861328, "logps/rejected": -90.81806945800781, "loss": 0.4677, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5381378531455994, "rewards/margins": 0.7613744735717773, "rewards/rejected": -1.2995123863220215, "step": 1270 }, { "epoch": 2.592733257374351, "grad_norm": 26.645793914794922, "learning_rate": 5.4149003681318525e-09, "logits/chosen": -2.0983309745788574, "logits/rejected": -1.9681360721588135, "logps/chosen": -86.5027847290039, "logps/rejected": -88.28022003173828, "loss": 0.4846, "rewards/accuracies": 0.784375011920929, "rewards/chosen": -0.6178566217422485, "rewards/margins": 0.7156898975372314, "rewards/rejected": -1.3335466384887695, "step": 1280 }, { "epoch": 2.612988985947588, "grad_norm": 34.658424377441406, "learning_rate": 4.8932001944408e-09, "logits/chosen": -2.114567995071411, "logits/rejected": -2.003492832183838, "logps/chosen": -86.31463623046875, "logps/rejected": -87.3377456665039, "loss": 0.5289, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.5726319551467896, "rewards/margins": 0.5831801891326904, "rewards/rejected": -1.1558120250701904, "step": 1290 }, { "epoch": 2.6332447145208255, "grad_norm": 29.894866943359375, "learning_rate": 4.396628458437912e-09, "logits/chosen": -2.057438373565674, "logits/rejected": -1.9271215200424194, "logps/chosen": -86.70679473876953, "logps/rejected": -89.4132308959961, "loss": 0.4833, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5987198948860168, "rewards/margins": 0.7157739996910095, "rewards/rejected": -1.3144938945770264, "step": 1300 }, { "epoch": 2.6535004430940625, "grad_norm": 28.439125061035156, "learning_rate": 3.9254617940670474e-09, "logits/chosen": -2.0954787731170654, "logits/rejected": -1.9630225896835327, "logps/chosen": -84.08492279052734, "logps/rejected": -87.75090026855469, "loss": 0.4635, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.523126482963562, "rewards/margins": 0.7484750151634216, "rewards/rejected": -1.2716015577316284, "step": 1310 }, { "epoch": 2.6737561716672995, "grad_norm": 25.333951950073242, "learning_rate": 3.479962682422366e-09, "logits/chosen": -2.1200668811798096, "logits/rejected": -1.9594342708587646, "logps/chosen": -83.06964111328125, "logps/rejected": -84.25973510742188, "loss": 0.4752, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.6142527461051941, "rewards/margins": 0.7399193048477173, "rewards/rejected": -1.3541719913482666, "step": 1320 }, { "epoch": 2.694011900240537, "grad_norm": 38.79462814331055, "learning_rate": 3.0603793055233194e-09, "logits/chosen": -2.078015089035034, "logits/rejected": -1.9544031620025635, "logps/chosen": -86.95954895019531, "logps/rejected": -87.41011810302734, "loss": 0.5153, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.6789627075195312, "rewards/margins": 0.6111471652984619, "rewards/rejected": -1.2901098728179932, "step": 1330 }, { "epoch": 2.714267628813774, "grad_norm": 25.715852737426758, "learning_rate": 2.6669454080555707e-09, "logits/chosen": -2.081672191619873, "logits/rejected": -1.9599504470825195, "logps/chosen": -81.85166931152344, "logps/rejected": -84.5459213256836, "loss": 0.4941, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5518099069595337, "rewards/margins": 0.6579364538192749, "rewards/rejected": -1.2097463607788086, "step": 1340 }, { "epoch": 2.734523357387011, "grad_norm": 31.674190521240234, "learning_rate": 2.299880167154694e-09, "logits/chosen": -2.0664610862731934, "logits/rejected": -1.9412147998809814, "logps/chosen": -86.22771453857422, "logps/rejected": -90.61781311035156, "loss": 0.4909, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5787885785102844, "rewards/margins": 0.6425621509552002, "rewards/rejected": -1.2213506698608398, "step": 1350 }, { "epoch": 2.754779085960248, "grad_norm": 21.180307388305664, "learning_rate": 1.959388070305368e-09, "logits/chosen": -2.1191658973693848, "logits/rejected": -1.9780559539794922, "logps/chosen": -84.12895202636719, "logps/rejected": -87.06913757324219, "loss": 0.4683, "rewards/accuracies": 0.784375011920929, "rewards/chosen": -0.598002016544342, "rewards/margins": 0.7502027750015259, "rewards/rejected": -1.3482048511505127, "step": 1360 }, { "epoch": 2.7750348145334853, "grad_norm": 31.467397689819336, "learning_rate": 1.6456588014238826e-09, "logits/chosen": -2.0679941177368164, "logits/rejected": -1.984758973121643, "logps/chosen": -82.20966339111328, "logps/rejected": -89.56126403808594, "loss": 0.5158, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6504064798355103, "rewards/margins": 0.6637855768203735, "rewards/rejected": -1.3141919374465942, "step": 1370 }, { "epoch": 2.7952905431067223, "grad_norm": 25.813554763793945, "learning_rate": 1.3588671351876358e-09, "logits/chosen": -2.088512897491455, "logits/rejected": -1.9785858392715454, "logps/chosen": -86.8198013305664, "logps/rejected": -90.5335922241211, "loss": 0.4835, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6225306987762451, "rewards/margins": 0.7479228973388672, "rewards/rejected": -1.3704535961151123, "step": 1380 }, { "epoch": 2.8155462716799597, "grad_norm": 27.169641494750977, "learning_rate": 1.099172839670298e-09, "logits/chosen": -2.0676891803741455, "logits/rejected": -1.9737918376922607, "logps/chosen": -78.63626861572266, "logps/rejected": -83.41529083251953, "loss": 0.5417, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.6757029294967651, "rewards/margins": 0.6025967597961426, "rewards/rejected": -1.2782996892929077, "step": 1390 }, { "epoch": 2.8358020002531967, "grad_norm": 27.115802764892578, "learning_rate": 8.66720587337011e-10, "logits/chosen": -2.065960645675659, "logits/rejected": -1.9609451293945312, "logps/chosen": -87.2630615234375, "logps/rejected": -90.57234191894531, "loss": 0.5015, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.619225263595581, "rewards/margins": 0.6657966375350952, "rewards/rejected": -1.2850219011306763, "step": 1400 }, { "epoch": 2.8560577288264337, "grad_norm": 24.163036346435547, "learning_rate": 6.616398744491825e-10, "logits/chosen": -2.079883337020874, "logits/rejected": -1.9338324069976807, "logps/chosen": -88.01893615722656, "logps/rejected": -89.63729095458984, "loss": 0.4572, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5343765020370483, "rewards/margins": 0.7622194886207581, "rewards/rejected": -1.2965959310531616, "step": 1410 }, { "epoch": 2.8763134573996707, "grad_norm": 33.94268798828125, "learning_rate": 4.840449489236786e-10, "logits/chosen": -2.0775258541107178, "logits/rejected": -1.9470727443695068, "logps/chosen": -83.77029418945312, "logps/rejected": -87.10340881347656, "loss": 0.478, "rewards/accuracies": 0.809374988079071, "rewards/chosen": -0.578041672706604, "rewards/margins": 0.695804238319397, "rewards/rejected": -1.273845911026001, "step": 1420 }, { "epoch": 2.8965691859729077, "grad_norm": 31.864702224731445, "learning_rate": 3.3403474668677324e-10, "logits/chosen": -2.063178062438965, "logits/rejected": -1.9387401342391968, "logps/chosen": -88.2574234008789, "logps/rejected": -90.79651641845703, "loss": 0.5111, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6772388219833374, "rewards/margins": 0.6761503219604492, "rewards/rejected": -1.3533891439437866, "step": 1430 }, { "epoch": 2.916824914546145, "grad_norm": 23.90968894958496, "learning_rate": 2.1169283655815274e-10, "logits/chosen": -2.082099199295044, "logits/rejected": -1.949532151222229, "logps/chosen": -87.7041244506836, "logps/rejected": -91.3929214477539, "loss": 0.4748, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6174831390380859, "rewards/margins": 0.7841922044754028, "rewards/rejected": -1.4016753435134888, "step": 1440 }, { "epoch": 2.937080643119382, "grad_norm": 25.46677589416504, "learning_rate": 1.1708737369576228e-10, "logits/chosen": -2.081848621368408, "logits/rejected": -1.9594366550445557, "logps/chosen": -78.64682006835938, "logps/rejected": -87.83316040039062, "loss": 0.4826, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5947951078414917, "rewards/margins": 0.7158970832824707, "rewards/rejected": -1.3106920719146729, "step": 1450 }, { "epoch": 2.9573363716926195, "grad_norm": 34.160301208496094, "learning_rate": 5.0271061627427115e-11, "logits/chosen": -2.1119987964630127, "logits/rejected": -1.9852664470672607, "logps/chosen": -80.1048355102539, "logps/rejected": -88.52069091796875, "loss": 0.5029, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.6696670651435852, "rewards/margins": 0.718749463558197, "rewards/rejected": -1.3884165287017822, "step": 1460 }, { "epoch": 2.9775921002658565, "grad_norm": 26.002410888671875, "learning_rate": 1.1281122890355322e-11, "logits/chosen": -2.076430082321167, "logits/rejected": -1.9372243881225586, "logps/chosen": -83.05271911621094, "logps/rejected": -85.09577941894531, "loss": 0.5069, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.542018711566925, "rewards/margins": 0.6729723215103149, "rewards/rejected": -1.2149909734725952, "step": 1470 }, { "epoch": 2.99582225598177, "step": 1479, "total_flos": 0.0, "train_loss": 0.546259533964032, "train_runtime": 14357.5178, "train_samples_per_second": 3.301, "train_steps_per_second": 0.103 } ], "logging_steps": 10, "max_steps": 1479, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }