{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5095, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019627085377821394, "grad_norm": 55.42576454689225, "learning_rate": 9.803921568627451e-10, "logits/chosen": -2.9195547103881836, "logits/rejected": -2.4565553665161133, "logps/chosen": -421.782470703125, "logps/rejected": -89.33955383300781, "loss": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.001962708537782139, "grad_norm": 52.06135973086457, "learning_rate": 9.803921568627451e-09, "logits/chosen": -2.558222770690918, "logits/rejected": -2.5535826683044434, "logps/chosen": -328.5440673828125, "logps/rejected": -224.7199249267578, "loss": 0.0003, "rewards/accuracies": 0.3333333134651184, "rewards/chosen": -0.03320746868848801, "rewards/margins": -0.14528942108154297, "rewards/rejected": 0.11208193749189377, "step": 10 }, { "epoch": 0.003925417075564278, "grad_norm": 56.89658523828712, "learning_rate": 1.9607843137254902e-08, "logits/chosen": -2.7485036849975586, "logits/rejected": -2.6489720344543457, "logps/chosen": -241.45883178710938, "logps/rejected": -228.8603515625, "loss": -0.0004, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1173335462808609, "rewards/margins": 0.1355145424604416, "rewards/rejected": -0.018181007355451584, "step": 20 }, { "epoch": 0.005888125613346418, "grad_norm": 45.255056406635454, "learning_rate": 2.941176470588235e-08, "logits/chosen": -2.8057525157928467, "logits/rejected": -2.749558687210083, "logps/chosen": -271.7970886230469, "logps/rejected": -277.11260986328125, "loss": 0.0009, "rewards/accuracies": 0.36666667461395264, "rewards/chosen": -0.06314592063426971, "rewards/margins": -0.1372196227312088, "rewards/rejected": 0.0740736722946167, "step": 30 }, { "epoch": 0.007850834151128557, "grad_norm": 55.405585195341, "learning_rate": 3.9215686274509804e-08, "logits/chosen": -2.5288026332855225, "logits/rejected": -2.613417148590088, "logps/chosen": -236.00308227539062, "logps/rejected": -199.1337890625, "loss": -0.0002, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.15636861324310303, "rewards/margins": 0.1928229033946991, "rewards/rejected": -0.036454297602176666, "step": 40 }, { "epoch": 0.009813542688910697, "grad_norm": 49.01526447310567, "learning_rate": 4.901960784313725e-08, "logits/chosen": -2.7653937339782715, "logits/rejected": -2.7310125827789307, "logps/chosen": -265.25726318359375, "logps/rejected": -283.57275390625, "loss": -0.0002, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 0.16415968537330627, "rewards/margins": 0.13651719689369202, "rewards/rejected": 0.027642499655485153, "step": 50 }, { "epoch": 0.011776251226692836, "grad_norm": 46.62264667378852, "learning_rate": 5.88235294117647e-08, "logits/chosen": -2.7559211254119873, "logits/rejected": -2.671520233154297, "logps/chosen": -254.8046112060547, "logps/rejected": -236.09078979492188, "loss": -0.0004, "rewards/accuracies": 0.5, "rewards/chosen": 0.08886684477329254, "rewards/margins": 0.049677345901727676, "rewards/rejected": 0.039189498871564865, "step": 60 }, { "epoch": 0.013738959764474975, "grad_norm": 48.26772678837694, "learning_rate": 6.862745098039216e-08, "logits/chosen": -2.837052345275879, "logits/rejected": -2.7726972103118896, "logps/chosen": -301.92010498046875, "logps/rejected": -237.0230255126953, "loss": -0.0006, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 0.18614128232002258, "rewards/margins": -0.07164065539836884, "rewards/rejected": 0.2577819228172302, "step": 70 }, { "epoch": 0.015701668302257114, "grad_norm": 56.72082827271601, "learning_rate": 7.843137254901961e-08, "logits/chosen": -2.7584192752838135, "logits/rejected": -2.5587003231048584, "logps/chosen": -324.06365966796875, "logps/rejected": -213.2407684326172, "loss": -0.0006, "rewards/accuracies": 0.5, "rewards/chosen": 0.2116052806377411, "rewards/margins": 0.003895642701536417, "rewards/rejected": 0.2077096402645111, "step": 80 }, { "epoch": 0.017664376840039256, "grad_norm": 56.49868797565438, "learning_rate": 8.823529411764706e-08, "logits/chosen": -2.821058750152588, "logits/rejected": -2.7977123260498047, "logps/chosen": -269.60736083984375, "logps/rejected": -272.5506896972656, "loss": -0.0022, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 0.5445754528045654, "rewards/margins": 0.1933358609676361, "rewards/rejected": 0.35123956203460693, "step": 90 }, { "epoch": 0.019627085377821395, "grad_norm": 47.073945639503265, "learning_rate": 9.80392156862745e-08, "logits/chosen": -2.792196750640869, "logits/rejected": -2.6972861289978027, "logps/chosen": -295.4858093261719, "logps/rejected": -259.53302001953125, "loss": -0.002, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": 0.536602258682251, "rewards/margins": 0.15680420398712158, "rewards/rejected": 0.379798024892807, "step": 100 }, { "epoch": 0.021589793915603533, "grad_norm": 55.31346254119081, "learning_rate": 1.0784313725490195e-07, "logits/chosen": -2.7970364093780518, "logits/rejected": -2.7083146572113037, "logps/chosen": -331.5827331542969, "logps/rejected": -292.47857666015625, "loss": -0.0036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.154811143875122, "rewards/margins": 0.6904684901237488, "rewards/rejected": 0.4643428325653076, "step": 110 }, { "epoch": 0.023552502453385672, "grad_norm": 42.62283781546687, "learning_rate": 1.176470588235294e-07, "logits/chosen": -2.647566080093384, "logits/rejected": -2.5511529445648193, "logps/chosen": -202.06785583496094, "logps/rejected": -188.60385131835938, "loss": -0.0076, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 0.8458110690116882, "rewards/margins": 1.0164639949798584, "rewards/rejected": -0.17065294086933136, "step": 120 }, { "epoch": 0.02551521099116781, "grad_norm": 49.90985311951694, "learning_rate": 1.2745098039215685e-07, "logits/chosen": -2.591831922531128, "logits/rejected": -2.6268253326416016, "logps/chosen": -359.3841552734375, "logps/rejected": -300.1329650878906, "loss": -0.0079, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 1.7609771490097046, "rewards/margins": 1.052795171737671, "rewards/rejected": 0.7081820368766785, "step": 130 }, { "epoch": 0.02747791952894995, "grad_norm": 69.08265173603021, "learning_rate": 1.3725490196078432e-07, "logits/chosen": -2.686288833618164, "logits/rejected": -2.7173526287078857, "logps/chosen": -192.25914001464844, "logps/rejected": -193.92724609375, "loss": -0.0115, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9333054423332214, "rewards/margins": 1.2931725978851318, "rewards/rejected": -0.3598671555519104, "step": 140 }, { "epoch": 0.029440628066732092, "grad_norm": 54.562941227728246, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -2.8407418727874756, "logits/rejected": -2.7342238426208496, "logps/chosen": -233.2815399169922, "logps/rejected": -225.36019897460938, "loss": -0.0131, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 0.8612493276596069, "rewards/margins": 0.8497702479362488, "rewards/rejected": 0.011479055508971214, "step": 150 }, { "epoch": 0.03140333660451423, "grad_norm": 63.47402541398964, "learning_rate": 1.5686274509803921e-07, "logits/chosen": -2.7791740894317627, "logits/rejected": -2.6811375617980957, "logps/chosen": -277.4623107910156, "logps/rejected": -219.72213745117188, "loss": -0.0285, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 1.0771534442901611, "rewards/margins": 2.586575984954834, "rewards/rejected": -1.5094225406646729, "step": 160 }, { "epoch": 0.033366045142296366, "grad_norm": 56.67134098750124, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -2.669949769973755, "logits/rejected": -2.6452701091766357, "logps/chosen": -248.5420684814453, "logps/rejected": -208.90786743164062, "loss": -0.0263, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.44815006852149963, "rewards/margins": 1.9631726741790771, "rewards/rejected": -2.411322832107544, "step": 170 }, { "epoch": 0.03532875368007851, "grad_norm": 39.40690681772726, "learning_rate": 1.764705882352941e-07, "logits/chosen": -2.751427173614502, "logits/rejected": -2.6481759548187256, "logps/chosen": -259.8966369628906, "logps/rejected": -251.5900421142578, "loss": -0.0233, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -1.0558886528015137, "rewards/margins": 3.4843649864196777, "rewards/rejected": -4.540253639221191, "step": 180 }, { "epoch": 0.03729146221786065, "grad_norm": 62.47158031773993, "learning_rate": 1.8627450980392158e-07, "logits/chosen": -2.6491169929504395, "logits/rejected": -2.590907335281372, "logps/chosen": -318.29046630859375, "logps/rejected": -260.68206787109375, "loss": -0.0243, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -0.11146416515111923, "rewards/margins": 3.8212521076202393, "rewards/rejected": -3.9327163696289062, "step": 190 }, { "epoch": 0.03925417075564279, "grad_norm": 67.22803692873129, "learning_rate": 1.96078431372549e-07, "logits/chosen": -2.7688140869140625, "logits/rejected": -2.574352741241455, "logps/chosen": -285.4090881347656, "logps/rejected": -201.56417846679688, "loss": -0.0417, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4215115308761597, "rewards/margins": 6.695929050445557, "rewards/rejected": -5.274416923522949, "step": 200 }, { "epoch": 0.04121687929342493, "grad_norm": 55.69330259375479, "learning_rate": 2.0588235294117645e-07, "logits/chosen": -2.5630764961242676, "logits/rejected": -2.483159303665161, "logps/chosen": -244.29983520507812, "logps/rejected": -241.3026885986328, "loss": -0.0648, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9128227233886719, "rewards/margins": 3.919031858444214, "rewards/rejected": -5.831854343414307, "step": 210 }, { "epoch": 0.04317958783120707, "grad_norm": 57.469004520548765, "learning_rate": 2.156862745098039e-07, "logits/chosen": -2.7338674068450928, "logits/rejected": -2.6867470741271973, "logps/chosen": -291.15673828125, "logps/rejected": -295.14581298828125, "loss": -0.0637, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -2.957396984100342, "rewards/margins": 6.729005336761475, "rewards/rejected": -9.6864013671875, "step": 220 }, { "epoch": 0.045142296368989206, "grad_norm": 52.20088600279648, "learning_rate": 2.2549019607843137e-07, "logits/chosen": -2.765036106109619, "logits/rejected": -2.6319448947906494, "logps/chosen": -271.0700378417969, "logps/rejected": -213.5701904296875, "loss": -0.0735, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -4.162587642669678, "rewards/margins": 6.882012367248535, "rewards/rejected": -11.044599533081055, "step": 230 }, { "epoch": 0.047105004906771344, "grad_norm": 61.77295677969488, "learning_rate": 2.352941176470588e-07, "logits/chosen": -2.6707522869110107, "logits/rejected": -2.6531193256378174, "logps/chosen": -251.86428833007812, "logps/rejected": -264.7287292480469, "loss": -0.0829, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -1.9166555404663086, "rewards/margins": 15.396039962768555, "rewards/rejected": -17.312694549560547, "step": 240 }, { "epoch": 0.04906771344455348, "grad_norm": 73.89409979808553, "learning_rate": 2.4509803921568627e-07, "logits/chosen": -2.629805564880371, "logits/rejected": -2.540875196456909, "logps/chosen": -247.9143829345703, "logps/rejected": -218.74600219726562, "loss": -0.0971, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.48637866973877, "rewards/margins": 7.239996433258057, "rewards/rejected": -15.726374626159668, "step": 250 }, { "epoch": 0.05103042198233562, "grad_norm": 66.29074269764887, "learning_rate": 2.549019607843137e-07, "logits/chosen": -2.72763991355896, "logits/rejected": -2.7005534172058105, "logps/chosen": -320.3201904296875, "logps/rejected": -263.1239318847656, "loss": -0.0952, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.5693891048431396, "rewards/margins": 11.3147554397583, "rewards/rejected": -13.88414478302002, "step": 260 }, { "epoch": 0.05299313052011776, "grad_norm": 69.40161302997811, "learning_rate": 2.6470588235294114e-07, "logits/chosen": -2.6740689277648926, "logits/rejected": -2.636819362640381, "logps/chosen": -230.25936889648438, "logps/rejected": -232.55703735351562, "loss": -0.0559, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -2.6644139289855957, "rewards/margins": 6.941485404968262, "rewards/rejected": -9.605899810791016, "step": 270 }, { "epoch": 0.0549558390578999, "grad_norm": 65.05492968329817, "learning_rate": 2.7450980392156863e-07, "logits/chosen": -2.69507098197937, "logits/rejected": -2.600647449493408, "logps/chosen": -255.4451446533203, "logps/rejected": -230.79037475585938, "loss": -0.1374, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -10.016531944274902, "rewards/margins": 18.90683937072754, "rewards/rejected": -28.923370361328125, "step": 280 }, { "epoch": 0.05691854759568204, "grad_norm": 80.60280571552023, "learning_rate": 2.8431372549019607e-07, "logits/chosen": -2.8089046478271484, "logits/rejected": -2.670240879058838, "logps/chosen": -299.865234375, "logps/rejected": -243.9931640625, "loss": -0.1361, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 4.855011463165283, "rewards/margins": 17.643901824951172, "rewards/rejected": -12.788888931274414, "step": 290 }, { "epoch": 0.058881256133464184, "grad_norm": 114.26931921522811, "learning_rate": 2.941176470588235e-07, "logits/chosen": -2.6850318908691406, "logits/rejected": -2.6374764442443848, "logps/chosen": -279.58856201171875, "logps/rejected": -308.7133483886719, "loss": -0.1432, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7034823894500732, "rewards/margins": 12.2323637008667, "rewards/rejected": -8.528882026672363, "step": 300 }, { "epoch": 0.06084396467124632, "grad_norm": 98.35876940848675, "learning_rate": 3.0392156862745094e-07, "logits/chosen": -2.6629366874694824, "logits/rejected": -2.5560824871063232, "logps/chosen": -322.3955993652344, "logps/rejected": -268.99041748046875, "loss": -0.1189, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.34133529663086, "rewards/margins": 12.180948257446289, "rewards/rejected": -32.52228546142578, "step": 310 }, { "epoch": 0.06280667320902845, "grad_norm": 142.72415154736706, "learning_rate": 3.1372549019607843e-07, "logits/chosen": -2.648869514465332, "logits/rejected": -2.6653428077697754, "logps/chosen": -231.400390625, "logps/rejected": -255.35140991210938, "loss": -0.0871, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -27.71343421936035, "rewards/margins": 3.059386730194092, "rewards/rejected": -30.772823333740234, "step": 320 }, { "epoch": 0.0647693817468106, "grad_norm": 122.94034076448233, "learning_rate": 3.2352941176470586e-07, "logits/chosen": -2.735877513885498, "logits/rejected": -2.6852054595947266, "logps/chosen": -296.5848083496094, "logps/rejected": -266.68878173828125, "loss": -0.1574, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -20.550838470458984, "rewards/margins": 32.51291275024414, "rewards/rejected": -53.063751220703125, "step": 330 }, { "epoch": 0.06673209028459273, "grad_norm": 90.76559205067144, "learning_rate": 3.333333333333333e-07, "logits/chosen": -2.83514404296875, "logits/rejected": -2.64210844039917, "logps/chosen": -364.6646423339844, "logps/rejected": -285.5331115722656, "loss": -0.1582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.436984062194824, "rewards/margins": 30.28008460998535, "rewards/rejected": -44.717071533203125, "step": 340 }, { "epoch": 0.06869479882237488, "grad_norm": 105.41764293594291, "learning_rate": 3.431372549019608e-07, "logits/chosen": -2.7746009826660156, "logits/rejected": -2.6759238243103027, "logps/chosen": -202.76531982421875, "logps/rejected": -191.0487060546875, "loss": -0.1096, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -12.143823623657227, "rewards/margins": 1.596701741218567, "rewards/rejected": -13.74052906036377, "step": 350 }, { "epoch": 0.07065750736015702, "grad_norm": 86.16876458236432, "learning_rate": 3.529411764705882e-07, "logits/chosen": -2.8188443183898926, "logits/rejected": -2.593196392059326, "logps/chosen": -353.32891845703125, "logps/rejected": -291.8623352050781, "loss": -0.2637, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -2.967794895172119, "rewards/margins": 32.80992889404297, "rewards/rejected": -35.77772521972656, "step": 360 }, { "epoch": 0.07262021589793916, "grad_norm": 125.37414457272092, "learning_rate": 3.6274509803921566e-07, "logits/chosen": -2.741210460662842, "logits/rejected": -2.6491539478302, "logps/chosen": -282.52386474609375, "logps/rejected": -295.5267333984375, "loss": -0.1618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.858935356140137, "rewards/margins": 23.869136810302734, "rewards/rejected": -32.72806930541992, "step": 370 }, { "epoch": 0.0745829244357213, "grad_norm": 107.59446022811072, "learning_rate": 3.7254901960784315e-07, "logits/chosen": -2.559868335723877, "logits/rejected": -2.710339069366455, "logps/chosen": -239.18466186523438, "logps/rejected": -320.7379150390625, "loss": -0.2261, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.9056112170219421, "rewards/margins": 28.805679321289062, "rewards/rejected": -27.900070190429688, "step": 380 }, { "epoch": 0.07654563297350343, "grad_norm": 104.1567948683286, "learning_rate": 3.8235294117647053e-07, "logits/chosen": -2.5698726177215576, "logits/rejected": -2.3517141342163086, "logps/chosen": -283.9027404785156, "logps/rejected": -334.44110107421875, "loss": -0.2289, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -17.196475982666016, "rewards/margins": 38.552696228027344, "rewards/rejected": -55.749176025390625, "step": 390 }, { "epoch": 0.07850834151128558, "grad_norm": 72.72058445266771, "learning_rate": 3.92156862745098e-07, "logits/chosen": -2.6728971004486084, "logits/rejected": -2.540029287338257, "logps/chosen": -277.1605529785156, "logps/rejected": -303.30194091796875, "loss": -0.1376, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -12.780940055847168, "rewards/margins": 14.76891803741455, "rewards/rejected": -27.54986000061035, "step": 400 }, { "epoch": 0.08047105004906771, "grad_norm": 80.71430874425238, "learning_rate": 4.019607843137255e-07, "logits/chosen": -2.6524100303649902, "logits/rejected": -2.6495046615600586, "logps/chosen": -322.28900146484375, "logps/rejected": -308.7735290527344, "loss": -0.2358, "rewards/accuracies": 0.7333332300186157, "rewards/chosen": -20.927623748779297, "rewards/margins": 34.26749038696289, "rewards/rejected": -55.19511795043945, "step": 410 }, { "epoch": 0.08243375858684986, "grad_norm": 217.0300462879297, "learning_rate": 4.117647058823529e-07, "logits/chosen": -2.6198325157165527, "logits/rejected": -2.5762195587158203, "logps/chosen": -291.5865783691406, "logps/rejected": -358.6512451171875, "loss": -0.303, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.95696449279785, "rewards/margins": 51.48836135864258, "rewards/rejected": -82.44532775878906, "step": 420 }, { "epoch": 0.08439646712463199, "grad_norm": 130.86544986831066, "learning_rate": 4.215686274509804e-07, "logits/chosen": -2.739365577697754, "logits/rejected": -2.461707353591919, "logps/chosen": -391.75482177734375, "logps/rejected": -300.4351806640625, "loss": -0.2969, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -27.522998809814453, "rewards/margins": 32.6249885559082, "rewards/rejected": -60.14799118041992, "step": 430 }, { "epoch": 0.08635917566241413, "grad_norm": 171.9338418158325, "learning_rate": 4.313725490196078e-07, "logits/chosen": -2.3868844509124756, "logits/rejected": -2.1997270584106445, "logps/chosen": -308.11358642578125, "logps/rejected": -293.33380126953125, "loss": -0.3469, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -8.658950805664062, "rewards/margins": 41.66270446777344, "rewards/rejected": -50.32164764404297, "step": 440 }, { "epoch": 0.08832188420019627, "grad_norm": 174.1066758269077, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -2.186063289642334, "logits/rejected": -2.3623898029327393, "logps/chosen": -346.97161865234375, "logps/rejected": -400.64044189453125, "loss": -0.19, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -66.03451538085938, "rewards/margins": 45.21254348754883, "rewards/rejected": -111.2470703125, "step": 450 }, { "epoch": 0.09028459273797841, "grad_norm": 85.4775852556709, "learning_rate": 4.5098039215686274e-07, "logits/chosen": -2.2321205139160156, "logits/rejected": -1.9455502033233643, "logps/chosen": -277.85137939453125, "logps/rejected": -344.2044677734375, "loss": -0.2591, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -28.514850616455078, "rewards/margins": 57.593421936035156, "rewards/rejected": -86.1082763671875, "step": 460 }, { "epoch": 0.09224730127576054, "grad_norm": 153.81064718064303, "learning_rate": 4.6078431372549013e-07, "logits/chosen": -2.497727632522583, "logits/rejected": -2.2833471298217773, "logps/chosen": -276.8794250488281, "logps/rejected": -389.337646484375, "loss": -0.4183, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -13.273488998413086, "rewards/margins": 39.83915328979492, "rewards/rejected": -53.112640380859375, "step": 470 }, { "epoch": 0.09421000981354269, "grad_norm": 445.8429087813216, "learning_rate": 4.705882352941176e-07, "logits/chosen": -2.6095690727233887, "logits/rejected": -2.1548361778259277, "logps/chosen": -361.65338134765625, "logps/rejected": -426.72601318359375, "loss": -0.3216, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -41.51061248779297, "rewards/margins": 62.375877380371094, "rewards/rejected": -103.8864974975586, "step": 480 }, { "epoch": 0.09617271835132483, "grad_norm": 89.04254958231228, "learning_rate": 4.803921568627451e-07, "logits/chosen": -2.368504047393799, "logits/rejected": -2.2150702476501465, "logps/chosen": -319.50006103515625, "logps/rejected": -393.1334533691406, "loss": -0.319, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -24.828651428222656, "rewards/margins": 46.397682189941406, "rewards/rejected": -71.22633361816406, "step": 490 }, { "epoch": 0.09813542688910697, "grad_norm": 241.76895654119772, "learning_rate": 4.901960784313725e-07, "logits/chosen": -2.57795786857605, "logits/rejected": -2.4584553241729736, "logps/chosen": -326.4345703125, "logps/rejected": -306.2259826660156, "loss": -0.2756, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -26.8358154296875, "rewards/margins": 28.74239730834961, "rewards/rejected": -55.578216552734375, "step": 500 }, { "epoch": 0.10009813542688911, "grad_norm": 264.5165149997937, "learning_rate": 5e-07, "logits/chosen": -2.301535129547119, "logits/rejected": -1.850098967552185, "logps/chosen": -335.3379821777344, "logps/rejected": -351.9546813964844, "loss": -0.2603, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -47.27622604370117, "rewards/margins": 31.92726707458496, "rewards/rejected": -79.20349884033203, "step": 510 }, { "epoch": 0.10206084396467124, "grad_norm": 256.12236152073547, "learning_rate": 4.999941314693213e-07, "logits/chosen": -2.387305974960327, "logits/rejected": -2.093822479248047, "logps/chosen": -279.55377197265625, "logps/rejected": -288.3088073730469, "loss": -0.2526, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -29.910446166992188, "rewards/margins": 51.61469268798828, "rewards/rejected": -81.52513122558594, "step": 520 }, { "epoch": 0.10402355250245339, "grad_norm": 84.11574617516239, "learning_rate": 4.999765261528027e-07, "logits/chosen": -2.428924560546875, "logits/rejected": -2.2418971061706543, "logps/chosen": -308.13690185546875, "logps/rejected": -425.5289611816406, "loss": -0.2481, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -18.78133773803711, "rewards/margins": 90.52960968017578, "rewards/rejected": -109.31095886230469, "step": 530 }, { "epoch": 0.10598626104023552, "grad_norm": 138.96263435109205, "learning_rate": 4.999471848769828e-07, "logits/chosen": -2.2968649864196777, "logits/rejected": -2.2856202125549316, "logps/chosen": -350.42437744140625, "logps/rejected": -404.6186828613281, "loss": -0.2945, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -60.3192253112793, "rewards/margins": 30.69902992248535, "rewards/rejected": -91.01825714111328, "step": 540 }, { "epoch": 0.10794896957801767, "grad_norm": 195.9722588481125, "learning_rate": 4.999061090193831e-07, "logits/chosen": -2.743603229522705, "logits/rejected": -2.556018352508545, "logps/chosen": -370.62225341796875, "logps/rejected": -375.95867919921875, "loss": -0.2633, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -47.81446838378906, "rewards/margins": 26.41641616821289, "rewards/rejected": -74.23088073730469, "step": 550 }, { "epoch": 0.1099116781157998, "grad_norm": 106.73375585216431, "learning_rate": 4.998533005084428e-07, "logits/chosen": -2.743039846420288, "logits/rejected": -2.6808793544769287, "logps/chosen": -302.07318115234375, "logps/rejected": -326.42840576171875, "loss": -0.1725, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -20.548267364501953, "rewards/margins": 54.09697341918945, "rewards/rejected": -74.64524841308594, "step": 560 }, { "epoch": 0.11187438665358194, "grad_norm": 129.19431986296829, "learning_rate": 4.997887618234292e-07, "logits/chosen": -2.690355062484741, "logits/rejected": -2.696227550506592, "logps/chosen": -289.22882080078125, "logps/rejected": -368.32159423828125, "loss": -0.5496, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -7.5803728103637695, "rewards/margins": 51.98181915283203, "rewards/rejected": -59.56218719482422, "step": 570 }, { "epoch": 0.11383709519136408, "grad_norm": 366.7196058070477, "learning_rate": 4.997124959943201e-07, "logits/chosen": -2.523827314376831, "logits/rejected": -1.6506551504135132, "logps/chosen": -284.63677978515625, "logps/rejected": -301.85748291015625, "loss": -0.0144, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -40.72862243652344, "rewards/margins": 47.8609733581543, "rewards/rejected": -88.58959197998047, "step": 580 }, { "epoch": 0.11579980372914622, "grad_norm": 178.59574489981918, "learning_rate": 4.996245066016623e-07, "logits/chosen": -2.2600531578063965, "logits/rejected": -1.7368602752685547, "logps/chosen": -277.12286376953125, "logps/rejected": -323.1368408203125, "loss": -0.394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.99538803100586, "rewards/margins": 69.44538116455078, "rewards/rejected": -91.4407730102539, "step": 590 }, { "epoch": 0.11776251226692837, "grad_norm": 265.57856442599115, "learning_rate": 4.995247977764035e-07, "logits/chosen": -2.4274303913116455, "logits/rejected": -2.0409669876098633, "logps/chosen": -247.5201416015625, "logps/rejected": -297.291748046875, "loss": -0.3023, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -12.959405899047852, "rewards/margins": 74.5136947631836, "rewards/rejected": -87.47310638427734, "step": 600 }, { "epoch": 0.1197252208047105, "grad_norm": 128.9123408742007, "learning_rate": 4.994133741996982e-07, "logits/chosen": -2.3767576217651367, "logits/rejected": -2.219869375228882, "logps/chosen": -315.9458312988281, "logps/rejected": -308.843994140625, "loss": -0.3328, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -34.130062103271484, "rewards/margins": 35.57612228393555, "rewards/rejected": -69.70618438720703, "step": 610 }, { "epoch": 0.12168792934249265, "grad_norm": 276.5185615656701, "learning_rate": 4.992902411026877e-07, "logits/chosen": -2.181988477706909, "logits/rejected": -1.965958595275879, "logps/chosen": -312.49957275390625, "logps/rejected": -435.63128662109375, "loss": -0.3255, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -46.6572380065918, "rewards/margins": 46.17048645019531, "rewards/rejected": -92.82772064208984, "step": 620 }, { "epoch": 0.12365063788027478, "grad_norm": 287.629427646687, "learning_rate": 4.991554042662548e-07, "logits/chosen": -2.2768828868865967, "logits/rejected": -2.15800404548645, "logps/chosen": -305.6785888671875, "logps/rejected": -325.3777770996094, "loss": -0.1643, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -56.725929260253906, "rewards/margins": 28.1070499420166, "rewards/rejected": -84.83297729492188, "step": 630 }, { "epoch": 0.1256133464180569, "grad_norm": 93.51813893030197, "learning_rate": 4.990088700207525e-07, "logits/chosen": -2.442497491836548, "logits/rejected": -2.4978153705596924, "logps/chosen": -260.5741882324219, "logps/rejected": -327.4527893066406, "loss": -0.3092, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -50.749755859375, "rewards/margins": 36.45991134643555, "rewards/rejected": -87.20967102050781, "step": 640 }, { "epoch": 0.12757605495583907, "grad_norm": 291.7759176839187, "learning_rate": 4.988506452457066e-07, "logits/chosen": -2.610625743865967, "logits/rejected": -2.3595685958862305, "logps/chosen": -312.53509521484375, "logps/rejected": -378.19525146484375, "loss": -0.3342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.899950981140137, "rewards/margins": 66.04251861572266, "rewards/rejected": -81.94246673583984, "step": 650 }, { "epoch": 0.1295387634936212, "grad_norm": 144.9833191500008, "learning_rate": 4.986807373694925e-07, "logits/chosen": -2.5556507110595703, "logits/rejected": -2.404313087463379, "logps/chosen": -280.6217346191406, "logps/rejected": -315.28131103515625, "loss": -0.2526, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -16.632827758789062, "rewards/margins": 30.876800537109375, "rewards/rejected": -47.50963592529297, "step": 660 }, { "epoch": 0.13150147203140333, "grad_norm": 125.84079358691508, "learning_rate": 4.984991543689869e-07, "logits/chosen": -2.311565637588501, "logits/rejected": -2.0786004066467285, "logps/chosen": -326.7909851074219, "logps/rejected": -359.30078125, "loss": -0.2437, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -43.393157958984375, "rewards/margins": 35.29859924316406, "rewards/rejected": -78.69175720214844, "step": 670 }, { "epoch": 0.13346418056918546, "grad_norm": 191.02038814443432, "learning_rate": 4.983059047691931e-07, "logits/chosen": -2.4630486965179443, "logits/rejected": -2.2847580909729004, "logps/chosen": -284.15216064453125, "logps/rejected": -270.96478271484375, "loss": -0.3101, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -25.678781509399414, "rewards/margins": 31.067684173583984, "rewards/rejected": -56.74646759033203, "step": 680 }, { "epoch": 0.13542688910696762, "grad_norm": 284.2254776791221, "learning_rate": 4.981009976428408e-07, "logits/chosen": -2.2247262001037598, "logits/rejected": -1.9255733489990234, "logps/chosen": -341.8725280761719, "logps/rejected": -359.5609436035156, "loss": -0.264, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -30.570653915405273, "rewards/margins": 62.6126594543457, "rewards/rejected": -93.18330383300781, "step": 690 }, { "epoch": 0.13738959764474976, "grad_norm": 86.57370030845577, "learning_rate": 4.9788444260996e-07, "logits/chosen": -2.268033504486084, "logits/rejected": -2.1153342723846436, "logps/chosen": -277.9055480957031, "logps/rejected": -311.05645751953125, "loss": -0.3819, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -13.68879222869873, "rewards/margins": 39.499717712402344, "rewards/rejected": -53.188514709472656, "step": 700 }, { "epoch": 0.1393523061825319, "grad_norm": 338.05295305391684, "learning_rate": 4.976562498374295e-07, "logits/chosen": -2.362950325012207, "logits/rejected": -1.6634302139282227, "logps/chosen": -310.96356201171875, "logps/rejected": -385.9927673339844, "loss": -0.2462, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -21.215097427368164, "rewards/margins": 105.8913345336914, "rewards/rejected": -127.10643005371094, "step": 710 }, { "epoch": 0.14131501472031405, "grad_norm": 94.651325061684, "learning_rate": 4.974164300384997e-07, "logits/chosen": -2.023195743560791, "logits/rejected": -2.026101589202881, "logps/chosen": -250.32742309570312, "logps/rejected": -377.23101806640625, "loss": -0.4751, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -19.123937606811523, "rewards/margins": 64.68328094482422, "rewards/rejected": -83.80722045898438, "step": 720 }, { "epoch": 0.14327772325809618, "grad_norm": 188.47080569588115, "learning_rate": 4.971649944722893e-07, "logits/chosen": -1.4261937141418457, "logits/rejected": -1.3943569660186768, "logps/chosen": -304.92669677734375, "logps/rejected": -393.43450927734375, "loss": -0.3547, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -52.48748016357422, "rewards/margins": 57.24298858642578, "rewards/rejected": -109.73048400878906, "step": 730 }, { "epoch": 0.1452404317958783, "grad_norm": 119.65538293437197, "learning_rate": 4.96901954943257e-07, "logits/chosen": -1.623649001121521, "logits/rejected": -0.6015797853469849, "logps/chosen": -316.39544677734375, "logps/rejected": -308.2103271484375, "loss": -0.3321, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -50.778038024902344, "rewards/margins": 90.8272705078125, "rewards/rejected": -141.60531616210938, "step": 740 }, { "epoch": 0.14720314033366044, "grad_norm": 86.48810999349877, "learning_rate": 4.96627323800647e-07, "logits/chosen": -2.2582385540008545, "logits/rejected": -1.9543392658233643, "logps/chosen": -261.2178039550781, "logps/rejected": -318.0521240234375, "loss": -0.4172, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -23.52887725830078, "rewards/margins": 50.16834259033203, "rewards/rejected": -73.69721221923828, "step": 750 }, { "epoch": 0.1491658488714426, "grad_norm": 113.97308557258582, "learning_rate": 4.963411139379099e-07, "logits/chosen": -2.4743828773498535, "logits/rejected": -1.6739752292633057, "logps/chosen": -298.59271240234375, "logps/rejected": -342.79595947265625, "loss": -0.4882, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -7.983345031738281, "rewards/margins": 74.26821899414062, "rewards/rejected": -82.25157165527344, "step": 760 }, { "epoch": 0.15112855740922473, "grad_norm": 392.30494311916834, "learning_rate": 4.960433387920964e-07, "logits/chosen": -1.2702064514160156, "logits/rejected": -1.4976896047592163, "logps/chosen": -225.3599853515625, "logps/rejected": -356.6004333496094, "loss": -0.4134, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": -55.614784240722656, "rewards/margins": 8.209315299987793, "rewards/rejected": -63.82408905029297, "step": 770 }, { "epoch": 0.15309126594700687, "grad_norm": 127.10227967472944, "learning_rate": 4.957340123432271e-07, "logits/chosen": -1.3343271017074585, "logits/rejected": -0.10657083988189697, "logps/chosen": -352.56072998046875, "logps/rejected": -341.1171875, "loss": -0.3394, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -19.992746353149414, "rewards/margins": 89.02325439453125, "rewards/rejected": -109.01600646972656, "step": 780 }, { "epoch": 0.155053974484789, "grad_norm": 420.58536554983976, "learning_rate": 4.954131491136361e-07, "logits/chosen": -1.0421171188354492, "logits/rejected": -0.5474944710731506, "logps/chosen": -353.1336364746094, "logps/rejected": -364.11773681640625, "loss": -0.269, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.73956298828125, "rewards/margins": 47.741920471191406, "rewards/rejected": -75.48148345947266, "step": 790 }, { "epoch": 0.15701668302257116, "grad_norm": 195.75815539361142, "learning_rate": 4.95080764167289e-07, "logits/chosen": -1.1912492513656616, "logits/rejected": -0.9618164300918579, "logps/chosen": -253.3438262939453, "logps/rejected": -374.35675048828125, "loss": -0.597, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -3.0270016193389893, "rewards/margins": 97.99317932128906, "rewards/rejected": -101.02018737792969, "step": 800 }, { "epoch": 0.1589793915603533, "grad_norm": 139.25684009769012, "learning_rate": 4.94736873109076e-07, "logits/chosen": -1.2744401693344116, "logits/rejected": -0.7750533223152161, "logps/chosen": -281.9678039550781, "logps/rejected": -328.2683410644531, "loss": -0.3557, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -29.657827377319336, "rewards/margins": 66.78199768066406, "rewards/rejected": -96.4398193359375, "step": 810 }, { "epoch": 0.16094210009813542, "grad_norm": 178.30783895621693, "learning_rate": 4.943814920840787e-07, "logits/chosen": -0.9328937530517578, "logits/rejected": -0.6445346474647522, "logps/chosen": -305.2715148925781, "logps/rejected": -317.13470458984375, "loss": -0.2182, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -27.077402114868164, "rewards/margins": 24.533750534057617, "rewards/rejected": -51.61115646362305, "step": 820 }, { "epoch": 0.16290480863591755, "grad_norm": 171.86147694269948, "learning_rate": 4.940146377768126e-07, "logits/chosen": -1.30418860912323, "logits/rejected": -0.7124021053314209, "logps/chosen": -290.5502014160156, "logps/rejected": -321.49639892578125, "loss": -0.4875, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -3.149672031402588, "rewards/margins": 88.61860656738281, "rewards/rejected": -91.76827239990234, "step": 830 }, { "epoch": 0.1648675171736997, "grad_norm": 274.5543658349456, "learning_rate": 4.936363274104441e-07, "logits/chosen": -1.2539094686508179, "logits/rejected": -0.7504829168319702, "logps/chosen": -345.98040771484375, "logps/rejected": -350.09796142578125, "loss": -0.3229, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -59.561668395996094, "rewards/margins": 63.72674560546875, "rewards/rejected": -123.2884292602539, "step": 840 }, { "epoch": 0.16683022571148184, "grad_norm": 190.34962104283832, "learning_rate": 4.932465787459808e-07, "logits/chosen": -0.9200321435928345, "logits/rejected": -0.39313608407974243, "logps/chosen": -297.28704833984375, "logps/rejected": -332.4362487792969, "loss": -0.4113, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -31.210453033447266, "rewards/margins": 48.063751220703125, "rewards/rejected": -79.27420806884766, "step": 850 }, { "epoch": 0.16879293424926398, "grad_norm": 192.06158458050416, "learning_rate": 4.92845410081439e-07, "logits/chosen": 0.14147694408893585, "logits/rejected": 0.6419375538825989, "logps/chosen": -296.87158203125, "logps/rejected": -445.063232421875, "loss": -0.2896, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -37.98882293701172, "rewards/margins": 123.41569519042969, "rewards/rejected": -161.404541015625, "step": 860 }, { "epoch": 0.17075564278704614, "grad_norm": 122.74008106568301, "learning_rate": 4.924328402509833e-07, "logits/chosen": 0.125508114695549, "logits/rejected": 0.6455780267715454, "logps/chosen": -335.5021667480469, "logps/rejected": -378.15625, "loss": -0.4273, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -61.71384811401367, "rewards/margins": 79.3321533203125, "rewards/rejected": -141.04600524902344, "step": 870 }, { "epoch": 0.17271835132482827, "grad_norm": 685.9402691010895, "learning_rate": 4.920088886240434e-07, "logits/chosen": 0.07589732110500336, "logits/rejected": 1.1146031618118286, "logps/chosen": -292.1110534667969, "logps/rejected": -382.05706787109375, "loss": -0.5777, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -7.556271553039551, "rewards/margins": 114.8202133178711, "rewards/rejected": -122.3764877319336, "step": 880 }, { "epoch": 0.1746810598626104, "grad_norm": 271.1024175100246, "learning_rate": 4.915735751044045e-07, "logits/chosen": -0.5838578939437866, "logits/rejected": 0.1533234566450119, "logps/chosen": -347.8789367675781, "logps/rejected": -408.5445251464844, "loss": -0.3337, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -59.50645065307617, "rewards/margins": 102.5400161743164, "rewards/rejected": -162.0464630126953, "step": 890 }, { "epoch": 0.17664376840039253, "grad_norm": 235.51558694045585, "learning_rate": 4.911269201292724e-07, "logits/chosen": -0.6416040658950806, "logits/rejected": -0.19059182703495026, "logps/chosen": -354.4915466308594, "logps/rejected": -370.1239013671875, "loss": -0.1454, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -60.1268310546875, "rewards/margins": 51.82355499267578, "rewards/rejected": -111.95039367675781, "step": 900 }, { "epoch": 0.1786064769381747, "grad_norm": 145.00758891751065, "learning_rate": 4.906689446683146e-07, "logits/chosen": -1.3398195505142212, "logits/rejected": -1.218482255935669, "logps/chosen": -255.09414672851562, "logps/rejected": -401.0498046875, "loss": -0.3631, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -17.96017074584961, "rewards/margins": 65.85710144042969, "rewards/rejected": -83.81727600097656, "step": 910 }, { "epoch": 0.18056918547595682, "grad_norm": 167.58121540373645, "learning_rate": 4.901996702226755e-07, "logits/chosen": -1.3835618495941162, "logits/rejected": -1.1535968780517578, "logps/chosen": -317.39752197265625, "logps/rejected": -416.4386291503906, "loss": -0.5316, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -16.77207374572754, "rewards/margins": 61.89057159423828, "rewards/rejected": -78.66264343261719, "step": 920 }, { "epoch": 0.18253189401373895, "grad_norm": 231.2481969777746, "learning_rate": 4.897191188239667e-07, "logits/chosen": -1.480469822883606, "logits/rejected": -0.03306392580270767, "logps/chosen": -375.44720458984375, "logps/rejected": -326.720458984375, "loss": -0.3226, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -51.41065216064453, "rewards/margins": 64.13352966308594, "rewards/rejected": -115.54417419433594, "step": 930 }, { "epoch": 0.1844946025515211, "grad_norm": 95.89451364951198, "learning_rate": 4.892273130332334e-07, "logits/chosen": -1.559515357017517, "logits/rejected": -1.0601645708084106, "logps/chosen": -366.4732360839844, "logps/rejected": -480.1495666503906, "loss": -0.3171, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -38.44028854370117, "rewards/margins": 82.14427185058594, "rewards/rejected": -120.58455657958984, "step": 940 }, { "epoch": 0.18645731108930325, "grad_norm": 165.28594620525925, "learning_rate": 4.887242759398945e-07, "logits/chosen": -1.6824363470077515, "logits/rejected": -0.6552220582962036, "logps/chosen": -223.1175537109375, "logps/rejected": -305.62994384765625, "loss": -0.5034, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.53265380859375, "rewards/margins": 86.1128158569336, "rewards/rejected": -111.64546966552734, "step": 950 }, { "epoch": 0.18842001962708538, "grad_norm": 352.8798167060443, "learning_rate": 4.88210031160659e-07, "logits/chosen": -1.1741770505905151, "logits/rejected": -0.8634139895439148, "logps/chosen": -291.512939453125, "logps/rejected": -352.5907897949219, "loss": -0.4225, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -24.746952056884766, "rewards/margins": 75.15422821044922, "rewards/rejected": -99.90117645263672, "step": 960 }, { "epoch": 0.1903827281648675, "grad_norm": 560.8621196580634, "learning_rate": 4.876846028384169e-07, "logits/chosen": -1.093379259109497, "logits/rejected": -0.6906472444534302, "logps/chosen": -271.27606201171875, "logps/rejected": -361.41082763671875, "loss": -0.3826, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -67.39205169677734, "rewards/margins": 58.437278747558594, "rewards/rejected": -125.82933044433594, "step": 970 }, { "epoch": 0.19234543670264967, "grad_norm": 159.55606280882097, "learning_rate": 4.87148015641106e-07, "logits/chosen": -1.4224119186401367, "logits/rejected": -1.2690508365631104, "logps/chosen": -292.9847717285156, "logps/rejected": -399.9471740722656, "loss": -0.427, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -41.93357849121094, "rewards/margins": 83.13137817382812, "rewards/rejected": -125.06495666503906, "step": 980 }, { "epoch": 0.1943081452404318, "grad_norm": 641.5121807143399, "learning_rate": 4.866002947605539e-07, "logits/chosen": -1.591728925704956, "logits/rejected": -0.5198885202407837, "logps/chosen": -258.06512451171875, "logps/rejected": -357.8591613769531, "loss": -0.4375, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -21.172761917114258, "rewards/margins": 94.05410766601562, "rewards/rejected": -115.22686767578125, "step": 990 }, { "epoch": 0.19627085377821393, "grad_norm": 294.8038593268707, "learning_rate": 4.860414659112948e-07, "logits/chosen": -1.466378927230835, "logits/rejected": -0.14748302102088928, "logps/chosen": -287.17327880859375, "logps/rejected": -319.5938415527344, "loss": -0.2781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -49.524131774902344, "rewards/margins": 77.31150817871094, "rewards/rejected": -126.83563232421875, "step": 1000 }, { "epoch": 0.19823356231599606, "grad_norm": 156.36694012458494, "learning_rate": 4.854715553293627e-07, "logits/chosen": -2.3596818447113037, "logits/rejected": -1.8426835536956787, "logps/chosen": -309.9227600097656, "logps/rejected": -283.349609375, "loss": -0.3961, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 4.278321266174316, "rewards/margins": 85.08187103271484, "rewards/rejected": -80.80355834960938, "step": 1010 }, { "epoch": 0.20019627085377822, "grad_norm": 381.72564192876956, "learning_rate": 4.848905897710595e-07, "logits/chosen": -1.2412532567977905, "logits/rejected": -0.6532396078109741, "logps/chosen": -364.1349182128906, "logps/rejected": -307.4398498535156, "loss": -0.3027, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -53.071319580078125, "rewards/margins": 37.50835418701172, "rewards/rejected": -90.57967376708984, "step": 1020 }, { "epoch": 0.20215897939156036, "grad_norm": 78.98141096690985, "learning_rate": 4.842985965116987e-07, "logits/chosen": -1.6987800598144531, "logits/rejected": -1.1645739078521729, "logps/chosen": -369.7084655761719, "logps/rejected": -363.26177978515625, "loss": -0.2136, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -28.673961639404297, "rewards/margins": 52.8286018371582, "rewards/rejected": -81.50257110595703, "step": 1030 }, { "epoch": 0.2041216879293425, "grad_norm": 346.925274350411, "learning_rate": 4.836956033443253e-07, "logits/chosen": -1.106367826461792, "logits/rejected": -0.31603357195854187, "logps/chosen": -356.8329162597656, "logps/rejected": -444.78179931640625, "loss": -0.3064, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 10.349096298217773, "rewards/margins": 73.9643783569336, "rewards/rejected": -63.61528396606445, "step": 1040 }, { "epoch": 0.20608439646712462, "grad_norm": 438.4198715642591, "learning_rate": 4.830816385784104e-07, "logits/chosen": -0.49576035141944885, "logits/rejected": 0.03637387603521347, "logps/chosen": -348.09161376953125, "logps/rejected": -368.6332092285156, "loss": -0.3907, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -40.26350021362305, "rewards/margins": 49.70841598510742, "rewards/rejected": -89.97191619873047, "step": 1050 }, { "epoch": 0.20804710500490678, "grad_norm": 79.5420107330979, "learning_rate": 4.824567310385226e-07, "logits/chosen": -1.0224738121032715, "logits/rejected": -0.25595730543136597, "logps/chosen": -327.381103515625, "logps/rejected": -342.5819396972656, "loss": -0.5246, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -10.011220932006836, "rewards/margins": 70.51642608642578, "rewards/rejected": -80.52764892578125, "step": 1060 }, { "epoch": 0.2100098135426889, "grad_norm": 253.531339773475, "learning_rate": 4.818209100629744e-07, "logits/chosen": -0.9710432291030884, "logits/rejected": -0.8155088424682617, "logps/chosen": -245.9547119140625, "logps/rejected": -328.13055419921875, "loss": -0.5165, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 2.715376377105713, "rewards/margins": 59.916221618652344, "rewards/rejected": -57.20084762573242, "step": 1070 }, { "epoch": 0.21197252208047104, "grad_norm": 146.43974810880522, "learning_rate": 4.81174205502445e-07, "logits/chosen": -0.2731146216392517, "logits/rejected": -0.33066827058792114, "logps/chosen": -261.10858154296875, "logps/rejected": -319.43499755859375, "loss": -0.4185, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -39.90928268432617, "rewards/margins": 67.99629974365234, "rewards/rejected": -107.90559387207031, "step": 1080 }, { "epoch": 0.2139352306182532, "grad_norm": 165.87682456896025, "learning_rate": 4.80516647718579e-07, "logits/chosen": 0.37342050671577454, "logits/rejected": 0.8275140523910522, "logps/chosen": -284.4266357421875, "logps/rejected": -429.19451904296875, "loss": -0.4425, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -44.65375518798828, "rewards/margins": 130.34715270996094, "rewards/rejected": -175.00088500976562, "step": 1090 }, { "epoch": 0.21589793915603533, "grad_norm": 237.45769793544008, "learning_rate": 4.798482675825602e-07, "logits/chosen": -0.37835073471069336, "logits/rejected": -0.46622657775878906, "logps/chosen": -219.25900268554688, "logps/rejected": -347.93817138671875, "loss": -0.2896, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -11.062115669250488, "rewards/margins": 88.63566589355469, "rewards/rejected": -99.69776916503906, "step": 1100 }, { "epoch": 0.21786064769381747, "grad_norm": 91.18145998015851, "learning_rate": 4.791690964736636e-07, "logits/chosen": -0.28714293241500854, "logits/rejected": 0.12103681266307831, "logps/chosen": -320.0545959472656, "logps/rejected": -316.3763122558594, "loss": -0.2683, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -8.284218788146973, "rewards/margins": 93.08128356933594, "rewards/rejected": -101.3655014038086, "step": 1110 }, { "epoch": 0.2198233562315996, "grad_norm": 121.42603440193854, "learning_rate": 4.78479166277781e-07, "logits/chosen": -0.0407433919608593, "logits/rejected": 0.8031107187271118, "logps/chosen": -345.73785400390625, "logps/rejected": -389.3685302734375, "loss": -0.6513, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.530865669250488, "rewards/margins": 89.83076477050781, "rewards/rejected": -104.36163330078125, "step": 1120 }, { "epoch": 0.22178606476938176, "grad_norm": 178.85598494929042, "learning_rate": 4.777785093859247e-07, "logits/chosen": -0.24294237792491913, "logits/rejected": 1.3924249410629272, "logps/chosen": -319.64617919921875, "logps/rejected": -478.07318115234375, "loss": -0.4545, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -34.97998046875, "rewards/margins": 150.94078063964844, "rewards/rejected": -185.92076110839844, "step": 1130 }, { "epoch": 0.2237487733071639, "grad_norm": 203.87624153803125, "learning_rate": 4.770671586927063e-07, "logits/chosen": -1.1225866079330444, "logits/rejected": -0.2693057954311371, "logps/chosen": -378.0223693847656, "logps/rejected": -405.68939208984375, "loss": -0.4597, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -21.15289878845215, "rewards/margins": 55.96300506591797, "rewards/rejected": -77.11591339111328, "step": 1140 }, { "epoch": 0.22571148184494602, "grad_norm": 151.87784988687667, "learning_rate": 4.7634514759479275e-07, "logits/chosen": -0.641355574131012, "logits/rejected": 0.8348206281661987, "logps/chosen": -320.2981262207031, "logps/rejected": -383.2963562011719, "loss": -0.5063, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -34.637332916259766, "rewards/margins": 128.01498413085938, "rewards/rejected": -162.65231323242188, "step": 1150 }, { "epoch": 0.22767419038272815, "grad_norm": 163.72370258460307, "learning_rate": 4.7561250998933835e-07, "logits/chosen": -0.26535776257514954, "logits/rejected": 0.9920859336853027, "logps/chosen": -341.9827575683594, "logps/rejected": -310.3675231933594, "loss": -0.5729, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 2.338268756866455, "rewards/margins": 98.30659484863281, "rewards/rejected": -95.96833038330078, "step": 1160 }, { "epoch": 0.2296368989205103, "grad_norm": 238.124688408163, "learning_rate": 4.7486928027239304e-07, "logits/chosen": 0.680788516998291, "logits/rejected": 1.1130828857421875, "logps/chosen": -212.67501831054688, "logps/rejected": -307.6126403808594, "loss": -0.2993, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.026186466217041, "rewards/margins": 88.6893539428711, "rewards/rejected": -95.71553039550781, "step": 1170 }, { "epoch": 0.23159960745829244, "grad_norm": 805.0980357395986, "learning_rate": 4.7411549333728807e-07, "logits/chosen": 0.15952681005001068, "logits/rejected": -0.0020960806868970394, "logps/chosen": -371.7433166503906, "logps/rejected": -403.44757080078125, "loss": -0.2357, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -93.2693862915039, "rewards/margins": 25.92207908630371, "rewards/rejected": -119.19146728515625, "step": 1180 }, { "epoch": 0.23356231599607458, "grad_norm": 258.9523686769121, "learning_rate": 4.7335118457299756e-07, "logits/chosen": 0.36494073271751404, "logits/rejected": 0.283443421125412, "logps/chosen": -318.1351318359375, "logps/rejected": -366.8081359863281, "loss": -0.3282, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -30.163848876953125, "rewards/margins": 75.2159423828125, "rewards/rejected": -105.3797836303711, "step": 1190 }, { "epoch": 0.23552502453385674, "grad_norm": 133.20915104852426, "learning_rate": 4.7257638986247684e-07, "logits/chosen": -0.5547437071800232, "logits/rejected": -0.1863251030445099, "logps/chosen": -341.30316162109375, "logps/rejected": -451.8502502441406, "loss": -0.3821, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -44.10496520996094, "rewards/margins": 62.69295120239258, "rewards/rejected": -106.79791259765625, "step": 1200 }, { "epoch": 0.23748773307163887, "grad_norm": 106.00598993940797, "learning_rate": 4.7179114558097814e-07, "logits/chosen": -0.6369959115982056, "logits/rejected": 0.20513415336608887, "logps/chosen": -265.24688720703125, "logps/rejected": -304.51287841796875, "loss": -0.5352, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.943883419036865, "rewards/margins": 93.66683959960938, "rewards/rejected": -99.6107177734375, "step": 1210 }, { "epoch": 0.239450441609421, "grad_norm": 267.91703907799683, "learning_rate": 4.709954885943428e-07, "logits/chosen": -0.5717731714248657, "logits/rejected": -0.352356493473053, "logps/chosen": -311.2847900390625, "logps/rejected": -283.5633544921875, "loss": -0.166, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -38.53837203979492, "rewards/margins": 27.605804443359375, "rewards/rejected": -66.14418029785156, "step": 1220 }, { "epoch": 0.24141315014720313, "grad_norm": 134.52001393314518, "learning_rate": 4.7018945625727026e-07, "logits/chosen": -1.4019057750701904, "logits/rejected": -0.9080744981765747, "logps/chosen": -313.23150634765625, "logps/rejected": -345.84368896484375, "loss": -0.3121, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -35.53964614868164, "rewards/margins": 41.04530334472656, "rewards/rejected": -76.58494567871094, "step": 1230 }, { "epoch": 0.2433758586849853, "grad_norm": 114.79403816792266, "learning_rate": 4.6937308641156447e-07, "logits/chosen": -0.7664046883583069, "logits/rejected": -0.3152869939804077, "logps/chosen": -207.9265594482422, "logps/rejected": -280.22601318359375, "loss": -0.3627, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 2.171916961669922, "rewards/margins": 70.08987426757812, "rewards/rejected": -67.9179458618164, "step": 1240 }, { "epoch": 0.24533856722276742, "grad_norm": 650.163880963966, "learning_rate": 4.685464173843574e-07, "logits/chosen": -0.9291526079177856, "logits/rejected": -0.560367226600647, "logps/chosen": -236.35678100585938, "logps/rejected": -341.97314453125, "loss": -0.4531, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -2.365212917327881, "rewards/margins": 84.2957763671875, "rewards/rejected": -86.6609878540039, "step": 1250 }, { "epoch": 0.24730127576054955, "grad_norm": 141.00731130720393, "learning_rate": 4.677094879863093e-07, "logits/chosen": -0.3296302556991577, "logits/rejected": 0.6696518063545227, "logps/chosen": -274.39739990234375, "logps/rejected": -319.5539245605469, "loss": -0.2632, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -25.98756980895996, "rewards/margins": 76.83675384521484, "rewards/rejected": -102.82432556152344, "step": 1260 }, { "epoch": 0.2492639842983317, "grad_norm": 257.82707549351966, "learning_rate": 4.66862337509787e-07, "logits/chosen": -0.4417852461338043, "logits/rejected": 0.5099334716796875, "logps/chosen": -332.7542724609375, "logps/rejected": -348.40850830078125, "loss": -0.2051, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -11.930026054382324, "rewards/margins": 116.28517150878906, "rewards/rejected": -128.21517944335938, "step": 1270 }, { "epoch": 0.2512266928361138, "grad_norm": 187.1587822045199, "learning_rate": 4.660050057270191e-07, "logits/chosen": -0.007944846525788307, "logits/rejected": 1.442101240158081, "logps/chosen": -262.4355163574219, "logps/rejected": -349.79510498046875, "loss": -0.4732, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.347912788391113, "rewards/margins": 129.41015625, "rewards/rejected": -135.758056640625, "step": 1280 }, { "epoch": 0.25318940137389595, "grad_norm": 666.1245726262127, "learning_rate": 4.6513753288822833e-07, "logits/chosen": 0.024620437994599342, "logits/rejected": 0.3650778830051422, "logps/chosen": -207.1399383544922, "logps/rejected": -286.62640380859375, "loss": -0.337, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -47.83836364746094, "rewards/margins": 66.05262756347656, "rewards/rejected": -113.8909912109375, "step": 1290 }, { "epoch": 0.25515210991167814, "grad_norm": 99.35924285009612, "learning_rate": 4.6425995971974265e-07, "logits/chosen": 0.296586811542511, "logits/rejected": 0.6352599263191223, "logps/chosen": -343.1158447265625, "logps/rejected": -315.1025390625, "loss": 0.0791, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -59.8315315246582, "rewards/margins": 38.25086212158203, "rewards/rejected": -98.08238220214844, "step": 1300 }, { "epoch": 0.25711481844946027, "grad_norm": 92.30107684645584, "learning_rate": 4.633723274220824e-07, "logits/chosen": -0.5543586611747742, "logits/rejected": -0.5389699935913086, "logps/chosen": -332.3842468261719, "logps/rejected": -396.8943176269531, "loss": -0.329, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -35.449684143066406, "rewards/margins": 37.95604705810547, "rewards/rejected": -73.40573120117188, "step": 1310 }, { "epoch": 0.2590775269872424, "grad_norm": 361.873429330949, "learning_rate": 4.624746776680267e-07, "logits/chosen": -0.004599392414093018, "logits/rejected": 0.009301548823714256, "logps/chosen": -268.728515625, "logps/rejected": -327.2678527832031, "loss": -0.3963, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -20.148090362548828, "rewards/margins": 60.392845153808594, "rewards/rejected": -80.54093933105469, "step": 1320 }, { "epoch": 0.26104023552502453, "grad_norm": 109.14778662061352, "learning_rate": 4.6156705260065634e-07, "logits/chosen": -0.5415644645690918, "logits/rejected": -0.6133627891540527, "logps/chosen": -220.20889282226562, "logps/rejected": -311.1971130371094, "loss": -0.3628, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -11.035032272338867, "rewards/margins": 75.02830505371094, "rewards/rejected": -86.0633316040039, "step": 1330 }, { "epoch": 0.26300294406280667, "grad_norm": 97.3276437552577, "learning_rate": 4.606494948313758e-07, "logits/chosen": -0.067128024995327, "logits/rejected": 0.13759474456310272, "logps/chosen": -287.49169921875, "logps/rejected": -361.89947509765625, "loss": -0.417, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -45.38007736206055, "rewards/margins": 60.861473083496094, "rewards/rejected": -106.24156188964844, "step": 1340 }, { "epoch": 0.2649656526005888, "grad_norm": 214.23452407810288, "learning_rate": 4.597220474379125e-07, "logits/chosen": -0.6385024785995483, "logits/rejected": -0.8114584684371948, "logps/chosen": -374.2613830566406, "logps/rejected": -419.7872619628906, "loss": -0.1842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -46.98633575439453, "rewards/margins": 49.645164489746094, "rewards/rejected": -96.63150024414062, "step": 1350 }, { "epoch": 0.26692836113837093, "grad_norm": 161.01599172509583, "learning_rate": 4.587847539622942e-07, "logits/chosen": -0.8062373995780945, "logits/rejected": -0.5399857759475708, "logps/chosen": -404.7890625, "logps/rejected": -394.0738220214844, "loss": -0.5806, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -26.630985260009766, "rewards/margins": 44.17183303833008, "rewards/rejected": -70.80281829833984, "step": 1360 }, { "epoch": 0.2688910696761531, "grad_norm": 311.7019525663922, "learning_rate": 4.5783765840880505e-07, "logits/chosen": -0.8065996170043945, "logits/rejected": -0.20719997584819794, "logps/chosen": -352.0925598144531, "logps/rejected": -424.54632568359375, "loss": -0.3118, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -24.066116333007812, "rewards/margins": 93.7226333618164, "rewards/rejected": -117.78874206542969, "step": 1370 }, { "epoch": 0.27085377821393525, "grad_norm": 242.40925811797663, "learning_rate": 4.568808052419196e-07, "logits/chosen": -0.4659969210624695, "logits/rejected": -0.025981564074754715, "logps/chosen": -250.2906036376953, "logps/rejected": -320.0798645019531, "loss": -0.5674, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.68593406677246, "rewards/margins": 76.80311584472656, "rewards/rejected": -102.48905181884766, "step": 1380 }, { "epoch": 0.2728164867517174, "grad_norm": 150.67743335056548, "learning_rate": 4.5591423938421513e-07, "logits/chosen": -0.09468124061822891, "logits/rejected": 0.4379270672798157, "logps/chosen": -341.0621643066406, "logps/rejected": -393.36834716796875, "loss": -0.2812, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -57.19630813598633, "rewards/margins": 77.68519592285156, "rewards/rejected": -134.88150024414062, "step": 1390 }, { "epoch": 0.2747791952894995, "grad_norm": 189.2180154396537, "learning_rate": 4.549380062142627e-07, "logits/chosen": -0.7744480967521667, "logits/rejected": -0.47629514336586, "logps/chosen": -297.24383544921875, "logps/rejected": -367.6190490722656, "loss": -0.2708, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -36.73484420776367, "rewards/margins": 37.12657928466797, "rewards/rejected": -73.8614273071289, "step": 1400 }, { "epoch": 0.27674190382728164, "grad_norm": 398.1433179588958, "learning_rate": 4.5395215156449683e-07, "logits/chosen": -0.35732418298721313, "logits/rejected": -0.7713747024536133, "logps/chosen": -320.3706970214844, "logps/rejected": -427.70147705078125, "loss": -0.5049, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -17.494029998779297, "rewards/margins": 61.1119384765625, "rewards/rejected": -78.60597229003906, "step": 1410 }, { "epoch": 0.2787046123650638, "grad_norm": 176.93165495809393, "learning_rate": 4.5295672171906365e-07, "logits/chosen": -0.8126991391181946, "logits/rejected": -0.05773216485977173, "logps/chosen": -278.0550231933594, "logps/rejected": -299.2767639160156, "loss": -0.4139, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -12.590118408203125, "rewards/margins": 58.349082946777344, "rewards/rejected": -70.93920135498047, "step": 1420 }, { "epoch": 0.2806673209028459, "grad_norm": 143.0438988447445, "learning_rate": 4.5195176341164765e-07, "logits/chosen": -0.9170870780944824, "logits/rejected": -0.9549194574356079, "logps/chosen": -302.0723876953125, "logps/rejected": -421.36395263671875, "loss": -0.2934, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -33.58391571044922, "rewards/margins": 92.94100952148438, "rewards/rejected": -126.5249252319336, "step": 1430 }, { "epoch": 0.2826300294406281, "grad_norm": 173.87703703799082, "learning_rate": 4.509373238232782e-07, "logits/chosen": -0.795932412147522, "logits/rejected": -0.2218722403049469, "logps/chosen": -323.84259033203125, "logps/rejected": -302.75732421875, "loss": -0.4701, "rewards/accuracies": 0.533333420753479, "rewards/chosen": -25.0303955078125, "rewards/margins": 43.92810821533203, "rewards/rejected": -68.95849609375, "step": 1440 }, { "epoch": 0.2845927379784102, "grad_norm": 121.8127323301411, "learning_rate": 4.499134505801141e-07, "logits/chosen": -0.306710422039032, "logits/rejected": -0.17278121411800385, "logps/chosen": -237.3304443359375, "logps/rejected": -352.43231201171875, "loss": -0.4569, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -6.329686164855957, "rewards/margins": 93.90690612792969, "rewards/rejected": -100.23661041259766, "step": 1450 }, { "epoch": 0.28655544651619236, "grad_norm": 149.5512301219652, "learning_rate": 4.488801917512076e-07, "logits/chosen": -0.38846054673194885, "logits/rejected": -0.6771696209907532, "logps/chosen": -319.62164306640625, "logps/rejected": -426.58294677734375, "loss": -0.2105, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -30.19916343688965, "rewards/margins": 59.85419845581055, "rewards/rejected": -90.0533676147461, "step": 1460 }, { "epoch": 0.2885181550539745, "grad_norm": 254.60225906008407, "learning_rate": 4.478375958462479e-07, "logits/chosen": -0.31459730863571167, "logits/rejected": 0.6876497268676758, "logps/chosen": -352.12396240234375, "logps/rejected": -366.0809631347656, "loss": -0.2637, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -53.21332931518555, "rewards/margins": 87.39404296875, "rewards/rejected": -140.6073760986328, "step": 1470 }, { "epoch": 0.2904808635917566, "grad_norm": 220.4961511024023, "learning_rate": 4.467857118132833e-07, "logits/chosen": -0.6099443435668945, "logits/rejected": -0.48984870314598083, "logps/chosen": -280.97027587890625, "logps/rejected": -304.026611328125, "loss": -0.237, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": -34.057838439941406, "rewards/margins": 25.877832412719727, "rewards/rejected": -59.93566131591797, "step": 1480 }, { "epoch": 0.29244357212953875, "grad_norm": 271.888383810607, "learning_rate": 4.457245890364235e-07, "logits/chosen": -0.9527867436408997, "logits/rejected": 0.04681504890322685, "logps/chosen": -348.16107177734375, "logps/rejected": -335.77044677734375, "loss": -0.3423, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 1.4055097103118896, "rewards/margins": 73.13142395019531, "rewards/rejected": -71.72590637207031, "step": 1490 }, { "epoch": 0.2944062806673209, "grad_norm": 630.4514646091122, "learning_rate": 4.4465427733352124e-07, "logits/chosen": -0.7063072919845581, "logits/rejected": -0.4565967917442322, "logps/chosen": -295.3616943359375, "logps/rejected": -383.42950439453125, "loss": -0.4628, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -12.71705436706543, "rewards/margins": 98.17766571044922, "rewards/rejected": -110.89472961425781, "step": 1500 }, { "epoch": 0.296368989205103, "grad_norm": 190.15994178997343, "learning_rate": 4.43574826953833e-07, "logits/chosen": -0.5849586725234985, "logits/rejected": 0.1167064756155014, "logps/chosen": -318.72027587890625, "logps/rejected": -442.87738037109375, "loss": -0.3713, "rewards/accuracies": 0.8333331942558289, "rewards/chosen": -7.379508018493652, "rewards/margins": 109.5814437866211, "rewards/rejected": -116.96095275878906, "step": 1510 }, { "epoch": 0.2983316977428852, "grad_norm": 129.60364723768552, "learning_rate": 4.4248628857565997e-07, "logits/chosen": -0.01909918151795864, "logits/rejected": 0.701531171798706, "logps/chosen": -355.89434814453125, "logps/rejected": -316.97027587890625, "loss": -0.461, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -19.56877899169922, "rewards/margins": 84.98934173583984, "rewards/rejected": -104.55812072753906, "step": 1520 }, { "epoch": 0.30029440628066734, "grad_norm": 273.44359518179436, "learning_rate": 4.413887133039692e-07, "logits/chosen": -0.8855515718460083, "logits/rejected": 0.502372682094574, "logps/chosen": -429.783203125, "logps/rejected": -412.05401611328125, "loss": -0.4716, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.15018081665039, "rewards/margins": 117.2563247680664, "rewards/rejected": -137.40650939941406, "step": 1530 }, { "epoch": 0.30225711481844947, "grad_norm": 131.2149314160654, "learning_rate": 4.4028215266799395e-07, "logits/chosen": -0.3892694115638733, "logits/rejected": 0.39369386434555054, "logps/chosen": -282.8835754394531, "logps/rejected": -336.5223693847656, "loss": -0.4009, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -9.758750915527344, "rewards/margins": 105.216064453125, "rewards/rejected": -114.9748306274414, "step": 1540 }, { "epoch": 0.3042198233562316, "grad_norm": 577.338101015651, "learning_rate": 4.391666586188145e-07, "logits/chosen": 0.2062421292066574, "logits/rejected": 0.7659724950790405, "logps/chosen": -243.0502166748047, "logps/rejected": -323.6511535644531, "loss": -0.4572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.352008819580078, "rewards/margins": 67.0520248413086, "rewards/rejected": -97.4040298461914, "step": 1550 }, { "epoch": 0.30618253189401373, "grad_norm": 146.4717861452119, "learning_rate": 4.380422835269193e-07, "logits/chosen": -0.4887501299381256, "logits/rejected": 0.06172027066349983, "logps/chosen": -322.8172607421875, "logps/rejected": -423.1553649902344, "loss": -0.367, "rewards/accuracies": 0.7333332300186157, "rewards/chosen": -68.84202575683594, "rewards/margins": 88.07933044433594, "rewards/rejected": -156.92135620117188, "step": 1560 }, { "epoch": 0.30814524043179586, "grad_norm": 138.86670589752217, "learning_rate": 4.3690908017974596e-07, "logits/chosen": -0.2648061215877533, "logits/rejected": -0.24680499732494354, "logps/chosen": -245.3034210205078, "logps/rejected": -377.53033447265625, "loss": -0.3897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.56615447998047, "rewards/margins": 84.67030334472656, "rewards/rejected": -104.2364501953125, "step": 1570 }, { "epoch": 0.310107948969578, "grad_norm": 361.99759490114053, "learning_rate": 4.3576710177920356e-07, "logits/chosen": -0.7863548398017883, "logits/rejected": -0.24762578308582306, "logps/chosen": -265.63458251953125, "logps/rejected": -330.4261779785156, "loss": -0.4518, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -29.226421356201172, "rewards/margins": 70.5469970703125, "rewards/rejected": -99.77342224121094, "step": 1580 }, { "epoch": 0.3120706575073602, "grad_norm": 110.26161190271637, "learning_rate": 4.346164019391742e-07, "logits/chosen": -0.8181453943252563, "logits/rejected": -0.41510826349258423, "logps/chosen": -379.28326416015625, "logps/rejected": -444.27020263671875, "loss": -0.4288, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -17.949371337890625, "rewards/margins": 93.24482727050781, "rewards/rejected": -111.1942138671875, "step": 1590 }, { "epoch": 0.3140333660451423, "grad_norm": 418.14545445046167, "learning_rate": 4.3345703468299634e-07, "logits/chosen": -0.04152932018041611, "logits/rejected": -0.17715571820735931, "logps/chosen": -317.2388610839844, "logps/rejected": -370.34503173828125, "loss": -0.077, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -39.51045608520508, "rewards/margins": 59.95743942260742, "rewards/rejected": -99.4678955078125, "step": 1600 }, { "epoch": 0.31599607458292445, "grad_norm": 341.5697755314112, "learning_rate": 4.322890544409286e-07, "logits/chosen": -0.933850109577179, "logits/rejected": 0.7957839369773865, "logps/chosen": -325.163818359375, "logps/rejected": -417.4605407714844, "loss": -0.8099, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.714933395385742, "rewards/margins": 148.0409698486328, "rewards/rejected": -158.7559051513672, "step": 1610 }, { "epoch": 0.3179587831207066, "grad_norm": 1172.1117342934133, "learning_rate": 4.311125160475938e-07, "logits/chosen": 0.015446802601218224, "logits/rejected": 0.40264415740966797, "logps/chosen": -313.5318603515625, "logps/rejected": -489.9974670410156, "loss": -0.5845, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -41.430572509765625, "rewards/margins": 96.05662536621094, "rewards/rejected": -137.48721313476562, "step": 1620 }, { "epoch": 0.3199214916584887, "grad_norm": 292.0814140958923, "learning_rate": 4.299274747394055e-07, "logits/chosen": -0.01684349775314331, "logits/rejected": 0.5265442728996277, "logps/chosen": -288.3865966796875, "logps/rejected": -414.86944580078125, "loss": -0.6777, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -22.02135467529297, "rewards/margins": 150.4295654296875, "rewards/rejected": -172.45091247558594, "step": 1630 }, { "epoch": 0.32188420019627084, "grad_norm": 303.0218494139286, "learning_rate": 4.287339861519737e-07, "logits/chosen": -0.4546899199485779, "logits/rejected": 0.34309619665145874, "logps/chosen": -338.341064453125, "logps/rejected": -413.78448486328125, "loss": -0.3411, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -38.64856719970703, "rewards/margins": 92.2686996459961, "rewards/rejected": -130.91726684570312, "step": 1640 }, { "epoch": 0.323846908734053, "grad_norm": 406.3278604007249, "learning_rate": 4.275321063174936e-07, "logits/chosen": -1.1363162994384766, "logits/rejected": -0.6217517852783203, "logps/chosen": -352.1303405761719, "logps/rejected": -346.18896484375, "loss": -0.4748, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -15.482063293457031, "rewards/margins": 75.24620056152344, "rewards/rejected": -90.728271484375, "step": 1650 }, { "epoch": 0.3258096172718351, "grad_norm": 266.73343262711734, "learning_rate": 4.2632189166211454e-07, "logits/chosen": -0.22011294960975647, "logits/rejected": -0.6874805688858032, "logps/chosen": -276.23443603515625, "logps/rejected": -361.9880065917969, "loss": -0.514, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -38.76226806640625, "rewards/margins": 59.25056076049805, "rewards/rejected": -98.01282501220703, "step": 1660 }, { "epoch": 0.3277723258096173, "grad_norm": 186.8053048885168, "learning_rate": 4.251033990032912e-07, "logits/chosen": -0.20579977333545685, "logits/rejected": 0.3252968490123749, "logps/chosen": -328.60992431640625, "logps/rejected": -461.1065979003906, "loss": -0.4945, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -31.5130558013916, "rewards/margins": 125.23563385009766, "rewards/rejected": -156.74867248535156, "step": 1670 }, { "epoch": 0.3297350343473994, "grad_norm": 146.48868065371127, "learning_rate": 4.238766855471161e-07, "logits/chosen": -0.5992350578308105, "logits/rejected": 0.19298234581947327, "logps/chosen": -372.07769775390625, "logps/rejected": -344.19244384765625, "loss": -0.5042, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -25.195465087890625, "rewards/margins": 83.94126892089844, "rewards/rejected": -109.13673400878906, "step": 1680 }, { "epoch": 0.33169774288518156, "grad_norm": 174.31540598051274, "learning_rate": 4.226418088856335e-07, "logits/chosen": -0.520244836807251, "logits/rejected": -0.006069634575396776, "logps/chosen": -268.96282958984375, "logps/rejected": -441.2184143066406, "loss": -0.4037, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -7.5589494705200195, "rewards/margins": 101.68757629394531, "rewards/rejected": -109.24654388427734, "step": 1690 }, { "epoch": 0.3336604514229637, "grad_norm": 125.96035649492566, "learning_rate": 4.2139882699413613e-07, "logits/chosen": -0.971671462059021, "logits/rejected": 0.798663318157196, "logps/chosen": -299.2193603515625, "logps/rejected": -302.34381103515625, "loss": -0.5299, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -27.212772369384766, "rewards/margins": 84.49937438964844, "rewards/rejected": -111.712158203125, "step": 1700 }, { "epoch": 0.3356231599607458, "grad_norm": 309.9836127821377, "learning_rate": 4.2014779822844274e-07, "logits/chosen": -0.3972320556640625, "logits/rejected": 0.03052547574043274, "logps/chosen": -267.49505615234375, "logps/rejected": -406.4791259765625, "loss": -0.4363, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -37.45825958251953, "rewards/margins": 112.9147720336914, "rewards/rejected": -150.373046875, "step": 1710 }, { "epoch": 0.33758586849852795, "grad_norm": 246.7349155956216, "learning_rate": 4.18888781322159e-07, "logits/chosen": -0.15653717517852783, "logits/rejected": 0.8107942342758179, "logps/chosen": -289.45263671875, "logps/rejected": -403.25396728515625, "loss": -0.6618, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -42.41587829589844, "rewards/margins": 103.66729736328125, "rewards/rejected": -146.08316040039062, "step": 1720 }, { "epoch": 0.3395485770363101, "grad_norm": 144.78021131836027, "learning_rate": 4.176218353839195e-07, "logits/chosen": -0.9188981056213379, "logits/rejected": -0.6861734986305237, "logps/chosen": -301.84503173828125, "logps/rejected": -298.54827880859375, "loss": -0.2465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -39.52213668823242, "rewards/margins": 48.964603424072266, "rewards/rejected": -88.48674011230469, "step": 1730 }, { "epoch": 0.34151128557409227, "grad_norm": 1346.2376860191207, "learning_rate": 4.1634701989461325e-07, "logits/chosen": -0.4513590931892395, "logits/rejected": -0.4555346965789795, "logps/chosen": -302.19073486328125, "logps/rejected": -408.97137451171875, "loss": -0.5585, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -33.58020782470703, "rewards/margins": 97.23760986328125, "rewards/rejected": -130.81781005859375, "step": 1740 }, { "epoch": 0.3434739941118744, "grad_norm": 150.10453473273904, "learning_rate": 4.1506439470459056e-07, "logits/chosen": 0.05149317905306816, "logits/rejected": 0.23694534599781036, "logps/chosen": -286.8460388183594, "logps/rejected": -367.44732666015625, "loss": -0.4108, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -52.61028289794922, "rewards/margins": 93.46243286132812, "rewards/rejected": -146.07272338867188, "step": 1750 }, { "epoch": 0.34543670264965654, "grad_norm": 113.95944393481646, "learning_rate": 4.137740200308537e-07, "logits/chosen": -0.5160384178161621, "logits/rejected": -0.018883686512708664, "logps/chosen": -301.40863037109375, "logps/rejected": -341.28314208984375, "loss": -0.3418, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -11.017160415649414, "rewards/margins": 62.98115158081055, "rewards/rejected": -73.99830627441406, "step": 1760 }, { "epoch": 0.34739941118743867, "grad_norm": 285.0767935629754, "learning_rate": 4.124759564542295e-07, "logits/chosen": -0.6595760583877563, "logits/rejected": -0.058022283017635345, "logps/chosen": -338.2293701171875, "logps/rejected": -303.3883056640625, "loss": -0.3546, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -42.089073181152344, "rewards/margins": 48.59239959716797, "rewards/rejected": -90.68148040771484, "step": 1770 }, { "epoch": 0.3493621197252208, "grad_norm": 101.03236183328043, "learning_rate": 4.111702649165255e-07, "logits/chosen": -0.9161526560783386, "logits/rejected": 0.10425040870904922, "logps/chosen": -283.76446533203125, "logps/rejected": -302.06591796875, "loss": -0.2768, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -32.11731719970703, "rewards/margins": 54.53614044189453, "rewards/rejected": -86.65345764160156, "step": 1780 }, { "epoch": 0.35132482826300293, "grad_norm": 263.8868367231896, "learning_rate": 4.0985700671766834e-07, "logits/chosen": -0.9468552470207214, "logits/rejected": 0.9110676646232605, "logps/chosen": -364.1922302246094, "logps/rejected": -412.3247985839844, "loss": -0.605, "rewards/accuracies": 0.8333331942558289, "rewards/chosen": -9.130239486694336, "rewards/margins": 121.54182434082031, "rewards/rejected": -130.6720733642578, "step": 1790 }, { "epoch": 0.35328753680078506, "grad_norm": 130.85053749132823, "learning_rate": 4.085362435128262e-07, "logits/chosen": -0.405953586101532, "logits/rejected": 1.244800329208374, "logps/chosen": -306.9328308105469, "logps/rejected": -437.663818359375, "loss": -0.6196, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -19.635013580322266, "rewards/margins": 167.39654541015625, "rewards/rejected": -187.0315704345703, "step": 1800 }, { "epoch": 0.35525024533856725, "grad_norm": 223.96094505752646, "learning_rate": 4.0720803730951423e-07, "logits/chosen": -1.1470909118652344, "logits/rejected": 0.7912918925285339, "logps/chosen": -343.0729675292969, "logps/rejected": -324.56890869140625, "loss": -0.5042, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -32.91019058227539, "rewards/margins": 98.570068359375, "rewards/rejected": -131.48025512695312, "step": 1810 }, { "epoch": 0.3572129538763494, "grad_norm": 162.33950498911926, "learning_rate": 4.058724504646834e-07, "logits/chosen": -0.558147668838501, "logits/rejected": 0.019284352660179138, "logps/chosen": -248.22433471679688, "logps/rejected": -336.770751953125, "loss": -0.5787, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -19.4455623626709, "rewards/margins": 90.05865478515625, "rewards/rejected": -109.50422668457031, "step": 1820 }, { "epoch": 0.3591756624141315, "grad_norm": 278.1738166038469, "learning_rate": 4.045295456817924e-07, "logits/chosen": -0.25461921095848083, "logits/rejected": 0.1365271359682083, "logps/chosen": -326.7425231933594, "logps/rejected": -372.71990966796875, "loss": -0.4594, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -49.863792419433594, "rewards/margins": 45.01863479614258, "rewards/rejected": -94.88243103027344, "step": 1830 }, { "epoch": 0.36113837095191365, "grad_norm": 108.83183167859887, "learning_rate": 4.0317938600786484e-07, "logits/chosen": -0.7870214581489563, "logits/rejected": -0.34110045433044434, "logps/chosen": -341.3463134765625, "logps/rejected": -372.3836364746094, "loss": -0.3748, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -46.37664794921875, "rewards/margins": 50.43975830078125, "rewards/rejected": -96.81640625, "step": 1840 }, { "epoch": 0.3631010794896958, "grad_norm": 206.2312367532649, "learning_rate": 4.0182203483052825e-07, "logits/chosen": -0.487540066242218, "logits/rejected": 0.014774179086089134, "logps/chosen": -356.26446533203125, "logps/rejected": -345.57916259765625, "loss": -0.5075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -75.3117904663086, "rewards/margins": 64.49422454833984, "rewards/rejected": -139.80601501464844, "step": 1850 }, { "epoch": 0.3650637880274779, "grad_norm": 183.16497932938577, "learning_rate": 4.004575558750389e-07, "logits/chosen": -1.6905851364135742, "logits/rejected": -0.9367521405220032, "logps/chosen": -396.78436279296875, "logps/rejected": -417.2666015625, "loss": -0.4071, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -46.17219543457031, "rewards/margins": 69.63795471191406, "rewards/rejected": -115.81014251708984, "step": 1860 }, { "epoch": 0.36702649656526004, "grad_norm": 160.43816794717725, "learning_rate": 3.9908601320128976e-07, "logits/chosen": -0.1876428872346878, "logits/rejected": -0.25627821683883667, "logps/chosen": -285.857177734375, "logps/rejected": -381.7496337890625, "loss": -0.2739, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -48.417816162109375, "rewards/margins": 91.4716567993164, "rewards/rejected": -139.88946533203125, "step": 1870 }, { "epoch": 0.3689892051030422, "grad_norm": 86.10127515629074, "learning_rate": 3.9770747120080284e-07, "logits/chosen": -1.1608283519744873, "logits/rejected": -0.7629313468933105, "logps/chosen": -252.29898071289062, "logps/rejected": -324.89410400390625, "loss": -0.4254, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -26.761028289794922, "rewards/margins": 88.71576690673828, "rewards/rejected": -115.4767837524414, "step": 1880 }, { "epoch": 0.37095191364082436, "grad_norm": 128.40419421883988, "learning_rate": 3.963219945937063e-07, "logits/chosen": -1.359933614730835, "logits/rejected": -1.0561379194259644, "logps/chosen": -242.55032348632812, "logps/rejected": -280.73297119140625, "loss": -0.3566, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -9.378472328186035, "rewards/margins": 45.709903717041016, "rewards/rejected": -55.0883674621582, "step": 1890 }, { "epoch": 0.3729146221786065, "grad_norm": 263.02338259002346, "learning_rate": 3.949296484256959e-07, "logits/chosen": -1.4414275884628296, "logits/rejected": -0.8082484006881714, "logps/chosen": -282.81085205078125, "logps/rejected": -333.346435546875, "loss": -0.4909, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -27.606435775756836, "rewards/margins": 62.13264846801758, "rewards/rejected": -89.73908996582031, "step": 1900 }, { "epoch": 0.3748773307163886, "grad_norm": 172.0701384014494, "learning_rate": 3.935304980649813e-07, "logits/chosen": -1.2286367416381836, "logits/rejected": 0.15929050743579865, "logps/chosen": -325.7023620605469, "logps/rejected": -418.928466796875, "loss": -0.3225, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -25.623632431030273, "rewards/margins": 114.02363586425781, "rewards/rejected": -139.64724731445312, "step": 1910 }, { "epoch": 0.37684003925417076, "grad_norm": 157.22760578014413, "learning_rate": 3.92124609199217e-07, "logits/chosen": -1.2718513011932373, "logits/rejected": -0.8121629953384399, "logps/chosen": -223.34976196289062, "logps/rejected": -311.38446044921875, "loss": -0.6499, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -19.075206756591797, "rewards/margins": 75.05207824707031, "rewards/rejected": -94.1272964477539, "step": 1920 }, { "epoch": 0.3788027477919529, "grad_norm": 116.06395870716385, "learning_rate": 3.907120478324185e-07, "logits/chosen": -1.3779296875, "logits/rejected": -0.646757960319519, "logps/chosen": -333.4664611816406, "logps/rejected": -378.027099609375, "loss": -0.5177, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -43.617271423339844, "rewards/margins": 63.46489334106445, "rewards/rejected": -107.08216857910156, "step": 1930 }, { "epoch": 0.380765456329735, "grad_norm": 164.47337878624106, "learning_rate": 3.8929288028186364e-07, "logits/chosen": -1.277091383934021, "logits/rejected": 0.1221558004617691, "logps/chosen": -237.57803344726562, "logps/rejected": -303.90631103515625, "loss": -0.6118, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -18.589313507080078, "rewards/margins": 88.36052703857422, "rewards/rejected": -106.9498291015625, "step": 1940 }, { "epoch": 0.38272816486751715, "grad_norm": 255.51215973354533, "learning_rate": 3.8786717317497875e-07, "logits/chosen": -0.8515610694885254, "logits/rejected": -0.052010197192430496, "logps/chosen": -350.1195373535156, "logps/rejected": -392.09185791015625, "loss": -0.5892, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -36.454444885253906, "rewards/margins": 85.26441955566406, "rewards/rejected": -121.71885681152344, "step": 1950 }, { "epoch": 0.38469087340529934, "grad_norm": 264.2380100054303, "learning_rate": 3.864349934462111e-07, "logits/chosen": -1.441253662109375, "logits/rejected": -0.5022369623184204, "logps/chosen": -285.413818359375, "logps/rejected": -365.9031677246094, "loss": -0.6713, "rewards/accuracies": 0.7333332300186157, "rewards/chosen": -22.97023582458496, "rewards/margins": 97.99842834472656, "rewards/rejected": -120.96867370605469, "step": 1960 }, { "epoch": 0.38665358194308147, "grad_norm": 406.9420345826437, "learning_rate": 3.84996408333886e-07, "logits/chosen": -1.703598976135254, "logits/rejected": -0.07386224716901779, "logps/chosen": -329.97857666015625, "logps/rejected": -355.3179016113281, "loss": -0.6019, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.744609832763672, "rewards/margins": 110.19657135009766, "rewards/rejected": -133.941162109375, "step": 1970 }, { "epoch": 0.3886162904808636, "grad_norm": 213.0231162676536, "learning_rate": 3.8355148537705047e-07, "logits/chosen": -1.3146978616714478, "logits/rejected": -0.5318170785903931, "logps/chosen": -246.01046752929688, "logps/rejected": -320.2939147949219, "loss": -0.5452, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -15.361353874206543, "rewards/margins": 100.29632568359375, "rewards/rejected": -115.6576919555664, "step": 1980 }, { "epoch": 0.39057899901864573, "grad_norm": 363.43679686200045, "learning_rate": 3.8210029241230204e-07, "logits/chosen": -1.2556743621826172, "logits/rejected": -0.2555163502693176, "logps/chosen": -381.35833740234375, "logps/rejected": -415.9195251464844, "loss": -0.4048, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -33.422019958496094, "rewards/margins": 98.73605346679688, "rewards/rejected": -132.15806579589844, "step": 1990 }, { "epoch": 0.39254170755642787, "grad_norm": 496.3924980435426, "learning_rate": 3.806428975706042e-07, "logits/chosen": -0.2570883631706238, "logits/rejected": 0.7770005464553833, "logps/chosen": -236.70166015625, "logps/rejected": -321.8598937988281, "loss": -0.5504, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2519516944885254, "rewards/margins": 89.99324798583984, "rewards/rejected": -92.24520111083984, "step": 2000 }, { "epoch": 0.39450441609421, "grad_norm": 166.07660009760488, "learning_rate": 3.791793692740876e-07, "logits/chosen": -0.29146766662597656, "logits/rejected": 1.1259669065475464, "logps/chosen": -241.04556274414062, "logps/rejected": -278.5406188964844, "loss": -0.4738, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -8.584646224975586, "rewards/margins": 101.43263244628906, "rewards/rejected": -110.01727294921875, "step": 2010 }, { "epoch": 0.39646712463199213, "grad_norm": 151.7333757685711, "learning_rate": 3.777097762328381e-07, "logits/chosen": -0.20288319885730743, "logits/rejected": 1.1260716915130615, "logps/chosen": -303.77459716796875, "logps/rejected": -375.0235595703125, "loss": -0.4607, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -7.3858842849731445, "rewards/margins": 106.99356842041016, "rewards/rejected": -114.37945556640625, "step": 2020 }, { "epoch": 0.39842983316977426, "grad_norm": 202.86625448009454, "learning_rate": 3.762341874416702e-07, "logits/chosen": -0.1994774490594864, "logits/rejected": 1.9289014339447021, "logps/chosen": -249.23831176757812, "logps/rejected": -315.8790283203125, "loss": -0.6322, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -21.778350830078125, "rewards/margins": 130.3926544189453, "rewards/rejected": -152.1710205078125, "step": 2030 }, { "epoch": 0.40039254170755645, "grad_norm": 119.7268601709802, "learning_rate": 3.7475267217688896e-07, "logits/chosen": -0.04818441718816757, "logits/rejected": -0.041399337351322174, "logps/chosen": -212.044921875, "logps/rejected": -358.59783935546875, "loss": -0.5411, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -11.626317024230957, "rewards/margins": 94.14241790771484, "rewards/rejected": -105.76873779296875, "step": 2040 }, { "epoch": 0.4023552502453386, "grad_norm": 292.7655773162498, "learning_rate": 3.7326529999303633e-07, "logits/chosen": 0.3846183717250824, "logits/rejected": 0.9397007822990417, "logps/chosen": -231.75955200195312, "logps/rejected": -397.31243896484375, "loss": -0.421, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -16.065950393676758, "rewards/margins": 123.8441162109375, "rewards/rejected": -139.91006469726562, "step": 2050 }, { "epoch": 0.4043179587831207, "grad_norm": 1041.89610513327, "learning_rate": 3.7177214071962684e-07, "logits/chosen": 1.1130757331848145, "logits/rejected": 1.1956154108047485, "logps/chosen": -282.42291259765625, "logps/rejected": -441.60626220703125, "loss": -0.2097, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -30.17438316345215, "rewards/margins": 96.41376495361328, "rewards/rejected": -126.58815002441406, "step": 2060 }, { "epoch": 0.40628066732090284, "grad_norm": 399.61740628981335, "learning_rate": 3.7027326445786835e-07, "logits/chosen": 0.5155268907546997, "logits/rejected": 1.5318434238433838, "logps/chosen": -312.35400390625, "logps/rejected": -379.3057861328125, "loss": -0.3747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -55.69513702392578, "rewards/margins": 85.74750518798828, "rewards/rejected": -141.44264221191406, "step": 2070 }, { "epoch": 0.408243375858685, "grad_norm": 357.5283076916106, "learning_rate": 3.6876874157737167e-07, "logits/chosen": 0.27408498525619507, "logits/rejected": 0.6839532256126404, "logps/chosen": -341.238037109375, "logps/rejected": -381.3678283691406, "loss": -0.3677, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -80.13575744628906, "rewards/margins": 16.930858612060547, "rewards/rejected": -97.06661224365234, "step": 2080 }, { "epoch": 0.4102060843964671, "grad_norm": 445.980529199806, "learning_rate": 3.67258642712846e-07, "logits/chosen": -0.5547662973403931, "logits/rejected": 0.8245648145675659, "logps/chosen": -298.9428405761719, "logps/rejected": -269.8735046386719, "loss": -0.5025, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -45.47167205810547, "rewards/margins": 26.456161499023438, "rewards/rejected": -71.92784118652344, "step": 2090 }, { "epoch": 0.41216879293424924, "grad_norm": 481.9606172116737, "learning_rate": 3.6574303876078366e-07, "logits/chosen": -0.28301194310188293, "logits/rejected": 0.8084600567817688, "logps/chosen": -312.2306823730469, "logps/rejected": -391.3448486328125, "loss": -0.6373, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -29.412633895874023, "rewards/margins": 88.322998046875, "rewards/rejected": -117.73564147949219, "step": 2100 }, { "epoch": 0.4141315014720314, "grad_norm": 649.9205592074017, "learning_rate": 3.642220008761309e-07, "logits/chosen": -0.38110026717185974, "logits/rejected": 1.2338255643844604, "logps/chosen": -338.86151123046875, "logps/rejected": -433.78594970703125, "loss": -0.6796, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -11.297021865844727, "rewards/margins": 135.54347229003906, "rewards/rejected": -146.84048461914062, "step": 2110 }, { "epoch": 0.41609421000981356, "grad_norm": 1304.34383157871, "learning_rate": 3.626956004689476e-07, "logits/chosen": -0.011320591904222965, "logits/rejected": 1.5619409084320068, "logps/chosen": -403.05279541015625, "logps/rejected": -323.76910400390625, "loss": -0.3211, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -41.494529724121094, "rewards/margins": 67.07801818847656, "rewards/rejected": -108.57255554199219, "step": 2120 }, { "epoch": 0.4180569185475957, "grad_norm": 398.74765502857923, "learning_rate": 3.6116390920105474e-07, "logits/chosen": -0.46276599168777466, "logits/rejected": 0.021661901846528053, "logps/chosen": -336.35540771484375, "logps/rejected": -342.64715576171875, "loss": -0.3147, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -69.02780151367188, "rewards/margins": 38.62982940673828, "rewards/rejected": -107.65763092041016, "step": 2130 }, { "epoch": 0.4200196270853778, "grad_norm": 297.17454237795334, "learning_rate": 3.5962699898266983e-07, "logits/chosen": -1.0453838109970093, "logits/rejected": -1.0867758989334106, "logps/chosen": -272.9595947265625, "logps/rejected": -308.0858459472656, "loss": -0.3608, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -28.943002700805664, "rewards/margins": 62.91179656982422, "rewards/rejected": -91.85479736328125, "step": 2140 }, { "epoch": 0.42198233562315995, "grad_norm": 209.24545493520753, "learning_rate": 3.5808494196903117e-07, "logits/chosen": -0.6677152514457703, "logits/rejected": -0.7276838421821594, "logps/chosen": -350.9630432128906, "logps/rejected": -321.83221435546875, "loss": -0.4785, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -24.715778350830078, "rewards/margins": 82.55493927001953, "rewards/rejected": -107.27071380615234, "step": 2150 }, { "epoch": 0.4239450441609421, "grad_norm": 219.7334740840983, "learning_rate": 3.565378105570097e-07, "logits/chosen": -0.871170699596405, "logits/rejected": -0.5837348699569702, "logps/chosen": -325.93682861328125, "logps/rejected": -333.42138671875, "loss": -0.4111, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -41.86164093017578, "rewards/margins": 84.16949462890625, "rewards/rejected": -126.03114318847656, "step": 2160 }, { "epoch": 0.4259077526987242, "grad_norm": 115.94671281018573, "learning_rate": 3.549856773817107e-07, "logits/chosen": -0.40060538053512573, "logits/rejected": -0.1994091272354126, "logps/chosen": -260.1810607910156, "logps/rejected": -327.2917785644531, "loss": -0.5332, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -21.62350845336914, "rewards/margins": 89.61854553222656, "rewards/rejected": -111.2420425415039, "step": 2170 }, { "epoch": 0.4278704612365064, "grad_norm": 139.96643791845855, "learning_rate": 3.5342861531306344e-07, "logits/chosen": -0.9581565856933594, "logits/rejected": -0.5796898007392883, "logps/chosen": -255.9517822265625, "logps/rejected": -313.5719909667969, "loss": -0.4206, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7591590881347656, "rewards/margins": 95.80101776123047, "rewards/rejected": -97.5601806640625, "step": 2180 }, { "epoch": 0.42983316977428854, "grad_norm": 265.53739183803077, "learning_rate": 3.518666974524002e-07, "logits/chosen": -0.8339200019836426, "logits/rejected": -0.09470844268798828, "logps/chosen": -339.45281982421875, "logps/rejected": -399.4952392578125, "loss": -0.4712, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -23.41973876953125, "rewards/margins": 102.7240982055664, "rewards/rejected": -126.1438217163086, "step": 2190 }, { "epoch": 0.43179587831207067, "grad_norm": 146.12788059648364, "learning_rate": 3.5029999712902387e-07, "logits/chosen": -1.008504867553711, "logits/rejected": -0.9239922761917114, "logps/chosen": -337.4940490722656, "logps/rejected": -437.9051208496094, "loss": -0.5673, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -8.190189361572266, "rewards/margins": 75.09192657470703, "rewards/rejected": -83.28211975097656, "step": 2200 }, { "epoch": 0.4337585868498528, "grad_norm": 166.643715503973, "learning_rate": 3.4872858789676583e-07, "logits/chosen": 0.4045542776584625, "logits/rejected": 0.12168149650096893, "logps/chosen": -279.0066833496094, "logps/rejected": -310.8238525390625, "loss": -0.3537, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -46.079505920410156, "rewards/margins": 30.016101837158203, "rewards/rejected": -76.09561157226562, "step": 2210 }, { "epoch": 0.43572129538763493, "grad_norm": 149.6903066102506, "learning_rate": 3.4715254353053236e-07, "logits/chosen": 0.14150777459144592, "logits/rejected": 0.6271403431892395, "logps/chosen": -296.629638671875, "logps/rejected": -407.3511657714844, "loss": -0.5133, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -24.697423934936523, "rewards/margins": 101.6956787109375, "rewards/rejected": -126.39311218261719, "step": 2220 }, { "epoch": 0.43768400392541706, "grad_norm": 246.7681146350035, "learning_rate": 3.4557193802284123e-07, "logits/chosen": -0.16007724404335022, "logits/rejected": 0.5914433598518372, "logps/chosen": -307.1545104980469, "logps/rejected": -362.4603271484375, "loss": -0.1753, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -42.424102783203125, "rewards/margins": 61.89319610595703, "rewards/rejected": -104.31729888916016, "step": 2230 }, { "epoch": 0.4396467124631992, "grad_norm": 92.52457640772026, "learning_rate": 3.4398684558034763e-07, "logits/chosen": 0.8897304534912109, "logits/rejected": 0.8202276229858398, "logps/chosen": -281.07421875, "logps/rejected": -344.7154541015625, "loss": -0.4262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -12.59819507598877, "rewards/margins": 82.18891906738281, "rewards/rejected": -94.78712463378906, "step": 2240 }, { "epoch": 0.44160942100098133, "grad_norm": 235.72772785387758, "learning_rate": 3.4239734062036067e-07, "logits/chosen": 0.2785721719264984, "logits/rejected": 0.6844779849052429, "logps/chosen": -319.9175109863281, "logps/rejected": -397.5693664550781, "loss": -0.4579, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -35.869171142578125, "rewards/margins": 81.81893157958984, "rewards/rejected": -117.68809509277344, "step": 2250 }, { "epoch": 0.4435721295387635, "grad_norm": 123.2801996558972, "learning_rate": 3.4080349776734924e-07, "logits/chosen": 0.30322542786598206, "logits/rejected": 1.1189171075820923, "logps/chosen": -308.3179626464844, "logps/rejected": -398.7733459472656, "loss": -0.4235, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -9.656079292297363, "rewards/margins": 111.47917175292969, "rewards/rejected": -121.13525390625, "step": 2260 }, { "epoch": 0.44553483807654565, "grad_norm": 422.7951249805826, "learning_rate": 3.392053918494389e-07, "logits/chosen": 0.1557307094335556, "logits/rejected": 0.32888704538345337, "logps/chosen": -312.52972412109375, "logps/rejected": -353.87396240234375, "loss": -0.3878, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -6.256176948547363, "rewards/margins": 77.47406005859375, "rewards/rejected": -83.73023986816406, "step": 2270 }, { "epoch": 0.4474975466143278, "grad_norm": 381.7477138574789, "learning_rate": 3.376030978948983e-07, "logits/chosen": 0.04796195402741432, "logits/rejected": 1.2978018522262573, "logps/chosen": -382.04931640625, "logps/rejected": -459.36932373046875, "loss": -0.6204, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -59.7066535949707, "rewards/margins": 112.34959411621094, "rewards/rejected": -172.05625915527344, "step": 2280 }, { "epoch": 0.4494602551521099, "grad_norm": 182.29284763727256, "learning_rate": 3.3599669112861756e-07, "logits/chosen": 0.24235352873802185, "logits/rejected": 0.5556513071060181, "logps/chosen": -301.8162841796875, "logps/rejected": -431.4063415527344, "loss": -0.3233, "rewards/accuracies": 0.7000001072883606, "rewards/chosen": -32.53940963745117, "rewards/margins": 106.93785095214844, "rewards/rejected": -139.47727966308594, "step": 2290 }, { "epoch": 0.45142296368989204, "grad_norm": 145.6022579835461, "learning_rate": 3.343862469685755e-07, "logits/chosen": 0.16065433621406555, "logits/rejected": 0.9648358225822449, "logps/chosen": -261.49786376953125, "logps/rejected": -349.4292907714844, "loss": -0.5335, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -21.281448364257812, "rewards/margins": 100.9854965209961, "rewards/rejected": -122.2669448852539, "step": 2300 }, { "epoch": 0.4533856722276742, "grad_norm": 137.10951415660998, "learning_rate": 3.3277184102230004e-07, "logits/chosen": -0.3531091511249542, "logits/rejected": -0.30775076150894165, "logps/chosen": -281.5614929199219, "logps/rejected": -357.7174072265625, "loss": -0.3896, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -11.92430591583252, "rewards/margins": 71.40213775634766, "rewards/rejected": -83.32644653320312, "step": 2310 }, { "epoch": 0.4553483807654563, "grad_norm": 193.8212343916491, "learning_rate": 3.311535490833176e-07, "logits/chosen": 0.26778221130371094, "logits/rejected": 0.6380144953727722, "logps/chosen": -269.3844299316406, "logps/rejected": -392.78302001953125, "loss": -0.4826, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -20.432802200317383, "rewards/margins": 85.39807891845703, "rewards/rejected": -105.83088684082031, "step": 2320 }, { "epoch": 0.4573110893032385, "grad_norm": 167.41836148316835, "learning_rate": 3.2953144712759537e-07, "logits/chosen": -0.6841514110565186, "logits/rejected": 0.876046359539032, "logps/chosen": -359.23260498046875, "logps/rejected": -413.22320556640625, "loss": -0.6022, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -16.56559181213379, "rewards/margins": 147.17495727539062, "rewards/rejected": -163.74053955078125, "step": 2330 }, { "epoch": 0.4592737978410206, "grad_norm": 679.0340078732252, "learning_rate": 3.279056113099742e-07, "logits/chosen": -0.7158817052841187, "logits/rejected": 0.32771921157836914, "logps/chosen": -294.82647705078125, "logps/rejected": -476.7598571777344, "loss": -0.5756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.300946235656738, "rewards/margins": 140.09725952148438, "rewards/rejected": -147.39822387695312, "step": 2340 }, { "epoch": 0.46123650637880276, "grad_norm": 224.68757055105792, "learning_rate": 3.2627611796059283e-07, "logits/chosen": 0.038073696196079254, "logits/rejected": 0.6189560294151306, "logps/chosen": -309.27520751953125, "logps/rejected": -335.2940368652344, "loss": -0.4494, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -36.30601501464844, "rewards/margins": 60.565773010253906, "rewards/rejected": -96.87178039550781, "step": 2350 }, { "epoch": 0.4631992149165849, "grad_norm": 192.03484144768032, "learning_rate": 3.246430435813051e-07, "logits/chosen": -0.3521607518196106, "logits/rejected": 0.6606711745262146, "logps/chosen": -341.88897705078125, "logps/rejected": -338.5372314453125, "loss": -0.4172, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -68.22154235839844, "rewards/margins": 52.46270751953125, "rewards/rejected": -120.68424987792969, "step": 2360 }, { "epoch": 0.465161923454367, "grad_norm": 285.66527165372344, "learning_rate": 3.230064648420878e-07, "logits/chosen": -0.6408053636550903, "logits/rejected": 0.7174103260040283, "logps/chosen": -294.173583984375, "logps/rejected": -284.22052001953125, "loss": -0.4603, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -17.475337982177734, "rewards/margins": 73.92915344238281, "rewards/rejected": -91.40447998046875, "step": 2370 }, { "epoch": 0.46712463199214915, "grad_norm": 238.2655596420244, "learning_rate": 3.2136645857744114e-07, "logits/chosen": 0.41769298911094666, "logits/rejected": 0.5791391134262085, "logps/chosen": -255.065673828125, "logps/rejected": -362.6512451171875, "loss": -0.5383, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -22.547447204589844, "rewards/margins": 63.11621856689453, "rewards/rejected": -85.66365814208984, "step": 2380 }, { "epoch": 0.4690873405299313, "grad_norm": 157.67920365410467, "learning_rate": 3.197231017827818e-07, "logits/chosen": -0.4802599549293518, "logits/rejected": 0.2917357385158539, "logps/chosen": -271.93975830078125, "logps/rejected": -321.23828125, "loss": -0.5056, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 9.564969062805176, "rewards/margins": 69.64512634277344, "rewards/rejected": -60.08015823364258, "step": 2390 }, { "epoch": 0.47105004906771347, "grad_norm": 148.06614127642922, "learning_rate": 3.1807647161082797e-07, "logits/chosen": -0.18546701967716217, "logits/rejected": 0.9859294891357422, "logps/chosen": -277.1785888671875, "logps/rejected": -354.2449951171875, "loss": -0.5329, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.37544584274292, "rewards/margins": 94.2538070678711, "rewards/rejected": -96.62925720214844, "step": 2400 }, { "epoch": 0.4730127576054956, "grad_norm": 356.8778111878992, "learning_rate": 3.1642664536797693e-07, "logits/chosen": -0.04275861382484436, "logits/rejected": 1.0751268863677979, "logps/chosen": -305.48382568359375, "logps/rejected": -389.33551025390625, "loss": -0.4543, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -32.48809051513672, "rewards/margins": 82.50044250488281, "rewards/rejected": -114.988525390625, "step": 2410 }, { "epoch": 0.47497546614327774, "grad_norm": 310.96917376343254, "learning_rate": 3.147737005106762e-07, "logits/chosen": 0.6562548875808716, "logits/rejected": 0.7398786544799805, "logps/chosen": -342.5672607421875, "logps/rejected": -414.20123291015625, "loss": -0.4527, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -43.527381896972656, "rewards/margins": 96.02046966552734, "rewards/rejected": -139.54783630371094, "step": 2420 }, { "epoch": 0.47693817468105987, "grad_norm": 233.7335407647502, "learning_rate": 3.1311771464178655e-07, "logits/chosen": 0.7289456129074097, "logits/rejected": 1.8622195720672607, "logps/chosen": -312.67974853515625, "logps/rejected": -346.43865966796875, "loss": -0.6878, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -17.431365966796875, "rewards/margins": 112.78873443603516, "rewards/rejected": -130.2200927734375, "step": 2430 }, { "epoch": 0.478900883218842, "grad_norm": 423.8615337902106, "learning_rate": 3.1145876550693893e-07, "logits/chosen": -0.29180586338043213, "logits/rejected": 1.4814244508743286, "logps/chosen": -320.2853088378906, "logps/rejected": -407.13165283203125, "loss": -0.6833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -34.53456115722656, "rewards/margins": 138.62493896484375, "rewards/rejected": -173.1595001220703, "step": 2440 }, { "epoch": 0.48086359175662413, "grad_norm": 744.701122420697, "learning_rate": 3.097969309908847e-07, "logits/chosen": 0.9363061189651489, "logits/rejected": 1.8906930685043335, "logps/chosen": -251.7770538330078, "logps/rejected": -304.05963134765625, "loss": -0.6316, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -12.564455032348633, "rewards/margins": 84.75445556640625, "rewards/rejected": -97.31892395019531, "step": 2450 }, { "epoch": 0.48282630029440626, "grad_norm": 251.38223075018033, "learning_rate": 3.081322891138382e-07, "logits/chosen": 0.20053064823150635, "logits/rejected": 0.4371492266654968, "logps/chosen": -320.5032958984375, "logps/rejected": -346.4121398925781, "loss": -0.5638, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -15.324429512023926, "rewards/margins": 69.0589599609375, "rewards/rejected": -84.38338470458984, "step": 2460 }, { "epoch": 0.4847890088321884, "grad_norm": 245.21472704086145, "learning_rate": 3.0646491802781514e-07, "logits/chosen": 0.8865596055984497, "logits/rejected": 2.0313923358917236, "logps/chosen": -325.08001708984375, "logps/rejected": -273.4022521972656, "loss": -0.4297, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -58.433311462402344, "rewards/margins": 34.4904670715332, "rewards/rejected": -92.92378234863281, "step": 2470 }, { "epoch": 0.4867517173699706, "grad_norm": 175.9266452708588, "learning_rate": 3.047948960129624e-07, "logits/chosen": 0.5433771014213562, "logits/rejected": 0.8757207989692688, "logps/chosen": -209.1022186279297, "logps/rejected": -347.94110107421875, "loss": -0.5127, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.3948183059692383, "rewards/margins": 130.8015594482422, "rewards/rejected": -130.40673828125, "step": 2480 }, { "epoch": 0.4887144259077527, "grad_norm": 429.45192151115884, "learning_rate": 3.0312230147388334e-07, "logits/chosen": -0.4421153962612152, "logits/rejected": 0.6323944330215454, "logps/chosen": -348.27679443359375, "logps/rejected": -481.54931640625, "loss": -0.603, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -39.05653762817383, "rewards/margins": 135.2555389404297, "rewards/rejected": -174.3120880126953, "step": 2490 }, { "epoch": 0.49067713444553485, "grad_norm": 377.05759477567375, "learning_rate": 3.01447212935957e-07, "logits/chosen": 0.0974908173084259, "logits/rejected": 0.05548914521932602, "logps/chosen": -322.7247314453125, "logps/rejected": -400.3882751464844, "loss": -0.2574, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -96.27644348144531, "rewards/margins": 53.063560485839844, "rewards/rejected": -149.3400115966797, "step": 2500 }, { "epoch": 0.492639842983317, "grad_norm": 314.43327625998256, "learning_rate": 2.9976970904165104e-07, "logits/chosen": -0.8013470768928528, "logits/rejected": 0.47607460618019104, "logps/chosen": -423.392578125, "logps/rejected": -418.38525390625, "loss": -0.4672, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -38.44776916503906, "rewards/margins": 85.25347137451172, "rewards/rejected": -123.70123291015625, "step": 2510 }, { "epoch": 0.4946025515210991, "grad_norm": 433.9401551770119, "learning_rate": 2.980898685468301e-07, "logits/chosen": -0.4024876654148102, "logits/rejected": 0.22271475195884705, "logps/chosen": -301.8051452636719, "logps/rejected": -352.66485595703125, "loss": -0.5458, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -2.163656234741211, "rewards/margins": 117.77667236328125, "rewards/rejected": -119.9403305053711, "step": 2520 }, { "epoch": 0.49656526005888124, "grad_norm": 147.05168354892143, "learning_rate": 2.96407770317058e-07, "logits/chosen": 0.1366831511259079, "logits/rejected": 0.10594828426837921, "logps/chosen": -226.49386596679688, "logps/rejected": -310.25079345703125, "loss": -0.6687, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -18.096805572509766, "rewards/margins": 83.7448959350586, "rewards/rejected": -101.84171295166016, "step": 2530 }, { "epoch": 0.4985279685966634, "grad_norm": 493.55900973884064, "learning_rate": 2.9472349332389523e-07, "logits/chosen": -0.4268958568572998, "logits/rejected": 1.171649694442749, "logps/chosen": -324.69378662109375, "logps/rejected": -340.44158935546875, "loss": -0.3868, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -31.31919288635254, "rewards/margins": 109.56556701660156, "rewards/rejected": -140.88473510742188, "step": 2540 }, { "epoch": 0.5004906771344455, "grad_norm": 273.7588000109085, "learning_rate": 2.930371166411915e-07, "logits/chosen": -0.6654374003410339, "logits/rejected": 0.21415922045707703, "logps/chosen": -335.7213439941406, "logps/rejected": -407.8766174316406, "loss": -0.3825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.95157241821289, "rewards/margins": 74.48597717285156, "rewards/rejected": -95.43756103515625, "step": 2550 }, { "epoch": 0.5024533856722276, "grad_norm": 259.545847753256, "learning_rate": 2.913487194413731e-07, "logits/chosen": -0.5894133448600769, "logits/rejected": -0.5803017020225525, "logps/chosen": -288.4310302734375, "logps/rejected": -443.6865234375, "loss": -0.5866, "rewards/accuracies": 0.8999999165534973, "rewards/chosen": -5.7689409255981445, "rewards/margins": 158.903564453125, "rewards/rejected": -164.67250061035156, "step": 2560 }, { "epoch": 0.5044160942100098, "grad_norm": 291.4075312745963, "learning_rate": 2.896583809917262e-07, "logits/chosen": -0.1378103792667389, "logits/rejected": -0.324067085981369, "logps/chosen": -248.91915893554688, "logps/rejected": -341.13995361328125, "loss": -0.3583, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -24.418867111206055, "rewards/margins": 91.70320892333984, "rewards/rejected": -116.12208557128906, "step": 2570 }, { "epoch": 0.5063788027477919, "grad_norm": 135.6645715131718, "learning_rate": 2.879661806506751e-07, "logits/chosen": -0.561540424823761, "logits/rejected": -0.05086873844265938, "logps/chosen": -315.7080078125, "logps/rejected": -410.9491271972656, "loss": -0.364, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -42.76870346069336, "rewards/margins": 58.515785217285156, "rewards/rejected": -101.28450012207031, "step": 2580 }, { "epoch": 0.5083415112855741, "grad_norm": 315.9674431217383, "learning_rate": 2.86272197864057e-07, "logits/chosen": -1.497479796409607, "logits/rejected": -1.1507604122161865, "logps/chosen": -356.84002685546875, "logps/rejected": -362.37152099609375, "loss": -0.4785, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -36.30466842651367, "rewards/margins": 71.90602111816406, "rewards/rejected": -108.21067810058594, "step": 2590 }, { "epoch": 0.5103042198233563, "grad_norm": 118.39257618780518, "learning_rate": 2.845765121613912e-07, "logits/chosen": -0.8514739274978638, "logits/rejected": -0.6283332109451294, "logps/chosen": -309.8434143066406, "logps/rejected": -350.513671875, "loss": -0.7185, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.283230781555176, "rewards/margins": 137.38882446289062, "rewards/rejected": -151.67205810546875, "step": 2600 }, { "epoch": 0.5122669283611384, "grad_norm": 285.0573022792299, "learning_rate": 2.828792031521464e-07, "logits/chosen": -0.9516509771347046, "logits/rejected": -0.5323055982589722, "logps/chosen": -328.3294372558594, "logps/rejected": -464.4132385253906, "loss": -0.4725, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -13.5633544921875, "rewards/margins": 155.688232421875, "rewards/rejected": -169.25160217285156, "step": 2610 }, { "epoch": 0.5142296368989205, "grad_norm": 221.43043700872147, "learning_rate": 2.811803505220025e-07, "logits/chosen": -0.9343031644821167, "logits/rejected": -0.3405666947364807, "logps/chosen": -282.5321044921875, "logps/rejected": -337.3126525878906, "loss": -0.6065, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -19.01625633239746, "rewards/margins": 110.60337829589844, "rewards/rejected": -129.61962890625, "step": 2620 }, { "epoch": 0.5161923454367027, "grad_norm": 237.370168514313, "learning_rate": 2.7948003402910975e-07, "logits/chosen": -0.1904366910457611, "logits/rejected": 0.3686249554157257, "logps/chosen": -328.06866455078125, "logps/rejected": -384.8655700683594, "loss": -0.3728, "rewards/accuracies": 0.73333340883255, "rewards/chosen": -30.02372169494629, "rewards/margins": 101.01863861083984, "rewards/rejected": -131.0423583984375, "step": 2630 }, { "epoch": 0.5181550539744848, "grad_norm": 139.61263912514792, "learning_rate": 2.777783335003442e-07, "logits/chosen": -0.42642202973365784, "logits/rejected": 0.09071238338947296, "logps/chosen": -342.4646301269531, "logps/rejected": -403.5032653808594, "loss": -0.5165, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -30.88848304748535, "rewards/margins": 112.633056640625, "rewards/rejected": -143.52154541015625, "step": 2640 }, { "epoch": 0.5201177625122669, "grad_norm": 391.3608215437027, "learning_rate": 2.760753288275598e-07, "logits/chosen": -0.25946754217147827, "logits/rejected": 0.7186147570610046, "logps/chosen": -279.14361572265625, "logps/rejected": -379.6006164550781, "loss": -0.3133, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -22.769941329956055, "rewards/margins": 103.9280776977539, "rewards/rejected": -126.6980209350586, "step": 2650 }, { "epoch": 0.5220804710500491, "grad_norm": 308.8738180108568, "learning_rate": 2.7437109996383795e-07, "logits/chosen": 0.22495320439338684, "logits/rejected": 1.347975730895996, "logps/chosen": -303.25616455078125, "logps/rejected": -349.54010009765625, "loss": -0.4444, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -47.82208251953125, "rewards/margins": 91.51597595214844, "rewards/rejected": -139.33804321289062, "step": 2660 }, { "epoch": 0.5240431795878312, "grad_norm": 105.59483552440615, "learning_rate": 2.7266572691973365e-07, "logits/chosen": -0.7208471894264221, "logits/rejected": 0.026754379272460938, "logps/chosen": -351.32861328125, "logps/rejected": -366.71551513671875, "loss": -0.548, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -17.688404083251953, "rewards/margins": 72.31254577636719, "rewards/rejected": -90.00094604492188, "step": 2670 }, { "epoch": 0.5260058881256133, "grad_norm": 547.2158458872873, "learning_rate": 2.709592897595191e-07, "logits/chosen": 0.32650887966156006, "logits/rejected": 1.0637938976287842, "logps/chosen": -291.64801025390625, "logps/rejected": -251.3025360107422, "loss": -0.5324, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.850461959838867, "rewards/margins": 49.82951354980469, "rewards/rejected": -65.67997741699219, "step": 2680 }, { "epoch": 0.5279685966633955, "grad_norm": 473.378993250825, "learning_rate": 2.6925186859742494e-07, "logits/chosen": -0.1511944681406021, "logits/rejected": 0.0580764040350914, "logps/chosen": -273.9756774902344, "logps/rejected": -322.53436279296875, "loss": -0.4016, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -16.555021286010742, "rewards/margins": 78.03257751464844, "rewards/rejected": -94.58760070800781, "step": 2690 }, { "epoch": 0.5299313052011776, "grad_norm": 391.3912175071586, "learning_rate": 2.675435435938788e-07, "logits/chosen": -0.3210826516151428, "logits/rejected": 0.6731927990913391, "logps/chosen": -344.1539611816406, "logps/rejected": -376.76116943359375, "loss": -0.5019, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -28.54323387145996, "rewards/margins": 92.14056396484375, "rewards/rejected": -120.68379974365234, "step": 2700 }, { "epoch": 0.5318940137389597, "grad_norm": 171.0253907949937, "learning_rate": 2.6583439495174247e-07, "logits/chosen": -0.4241692125797272, "logits/rejected": 1.0711032152175903, "logps/chosen": -298.0030822753906, "logps/rejected": -373.0411071777344, "loss": -0.5928, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.104356288909912, "rewards/margins": 125.88126373291016, "rewards/rejected": -132.98562622070312, "step": 2710 }, { "epoch": 0.5338567222767419, "grad_norm": 182.16139827707858, "learning_rate": 2.6412450291254564e-07, "logits/chosen": 0.5123127102851868, "logits/rejected": 1.3891392946243286, "logps/chosen": -307.1592102050781, "logps/rejected": -346.6839294433594, "loss": -0.5057, "rewards/accuracies": 0.7333332300186157, "rewards/chosen": -15.78852367401123, "rewards/margins": 89.80287170410156, "rewards/rejected": -105.59139251708984, "step": 2720 }, { "epoch": 0.535819430814524, "grad_norm": 259.8816047792165, "learning_rate": 2.6241394775271954e-07, "logits/chosen": 0.8046048283576965, "logits/rejected": 2.188105583190918, "logps/chosen": -287.75726318359375, "logps/rejected": -396.4264831542969, "loss": -0.382, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -27.231454849243164, "rewards/margins": 140.114501953125, "rewards/rejected": -167.34597778320312, "step": 2730 }, { "epoch": 0.5377821393523062, "grad_norm": 321.48906686200013, "learning_rate": 2.607028097798276e-07, "logits/chosen": 0.6382243037223816, "logits/rejected": 1.6460765600204468, "logps/chosen": -322.6622619628906, "logps/rejected": -424.9124450683594, "loss": -0.6139, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -36.69150161743164, "rewards/margins": 95.38984680175781, "rewards/rejected": -132.0813446044922, "step": 2740 }, { "epoch": 0.5397448478900884, "grad_norm": 185.42514226312778, "learning_rate": 2.5899116932879534e-07, "logits/chosen": 1.3601300716400146, "logits/rejected": 2.312749147415161, "logps/chosen": -243.48727416992188, "logps/rejected": -364.85150146484375, "loss": -0.6528, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -43.58977127075195, "rewards/margins": 112.76011657714844, "rewards/rejected": -156.34988403320312, "step": 2750 }, { "epoch": 0.5417075564278705, "grad_norm": 113.87898599411729, "learning_rate": 2.5727910675813866e-07, "logits/chosen": 0.5609289407730103, "logits/rejected": 0.6823413968086243, "logps/chosen": -265.54571533203125, "logps/rejected": -365.4170837402344, "loss": -0.5823, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -12.634196281433105, "rewards/margins": 80.48759460449219, "rewards/rejected": -93.12178802490234, "step": 2760 }, { "epoch": 0.5436702649656526, "grad_norm": 130.63615854608577, "learning_rate": 2.555667024461915e-07, "logits/chosen": 0.29585856199264526, "logits/rejected": 0.36376041173934937, "logps/chosen": -252.49246215820312, "logps/rejected": -383.80352783203125, "loss": -0.4498, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -19.849376678466797, "rewards/margins": 98.01789855957031, "rewards/rejected": -117.86726379394531, "step": 2770 }, { "epoch": 0.5456329735034348, "grad_norm": 170.48391015666377, "learning_rate": 2.5385403678733157e-07, "logits/chosen": 0.8951346278190613, "logits/rejected": 1.0202525854110718, "logps/chosen": -254.63217163085938, "logps/rejected": -348.728271484375, "loss": -0.3319, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -55.43613815307617, "rewards/margins": 61.74285125732422, "rewards/rejected": -117.17899322509766, "step": 2780 }, { "epoch": 0.5475956820412169, "grad_norm": 209.03297035174046, "learning_rate": 2.521411901882067e-07, "logits/chosen": 0.1450597047805786, "logits/rejected": 1.8982454538345337, "logps/chosen": -283.434326171875, "logps/rejected": -355.127197265625, "loss": -0.4797, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.629661560058594, "rewards/margins": 128.4715118408203, "rewards/rejected": -149.10116577148438, "step": 2790 }, { "epoch": 0.549558390578999, "grad_norm": 316.4189278339142, "learning_rate": 2.504282430639594e-07, "logits/chosen": -0.9220380783081055, "logits/rejected": -0.2594057023525238, "logps/chosen": -219.96041870117188, "logps/rejected": -279.0653076171875, "loss": -0.5967, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -11.8013334274292, "rewards/margins": 62.681007385253906, "rewards/rejected": -74.48234558105469, "step": 2800 }, { "epoch": 0.5515210991167812, "grad_norm": 213.5433079797419, "learning_rate": 2.4871527583445163e-07, "logits/chosen": -0.31822261214256287, "logits/rejected": 0.5024303197860718, "logps/chosen": -317.51275634765625, "logps/rejected": -335.9136657714844, "loss": -0.4669, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -8.082704544067383, "rewards/margins": 54.3105583190918, "rewards/rejected": -62.39326095581055, "step": 2810 }, { "epoch": 0.5534838076545633, "grad_norm": 816.1579619271733, "learning_rate": 2.470023689204893e-07, "logits/chosen": 0.23843038082122803, "logits/rejected": 0.5871859788894653, "logps/chosen": -309.62445068359375, "logps/rejected": -391.88250732421875, "loss": -0.5, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.068517684936523, "rewards/margins": 96.9255142211914, "rewards/rejected": -111.9940414428711, "step": 2820 }, { "epoch": 0.5554465161923454, "grad_norm": 280.3106207734899, "learning_rate": 2.452896027400465e-07, "logits/chosen": -0.3961392939090729, "logits/rejected": 0.7283745408058167, "logps/chosen": -339.65435791015625, "logps/rejected": -421.9934997558594, "loss": -0.4777, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -59.08606719970703, "rewards/margins": 80.55363464355469, "rewards/rejected": -139.63970947265625, "step": 2830 }, { "epoch": 0.5574092247301276, "grad_norm": 167.81413165501237, "learning_rate": 2.4357705770449046e-07, "logits/chosen": 0.23879416286945343, "logits/rejected": 0.9762646555900574, "logps/chosen": -245.05215454101562, "logps/rejected": -347.5263671875, "loss": -0.5869, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 4.809001922607422, "rewards/margins": 114.6981201171875, "rewards/rejected": -109.88912200927734, "step": 2840 }, { "epoch": 0.5593719332679097, "grad_norm": 410.71247214339223, "learning_rate": 2.418648142148056e-07, "logits/chosen": 0.6748315095901489, "logits/rejected": 1.1633888483047485, "logps/chosen": -294.86175537109375, "logps/rejected": -309.39154052734375, "loss": -0.457, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -11.052109718322754, "rewards/margins": 81.19559478759766, "rewards/rejected": -92.2477035522461, "step": 2850 }, { "epoch": 0.5613346418056918, "grad_norm": 413.7593541786494, "learning_rate": 2.4015295265781966e-07, "logits/chosen": 0.6866556406021118, "logits/rejected": 1.3604393005371094, "logps/chosen": -299.4148864746094, "logps/rejected": -408.46795654296875, "loss": -0.5172, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -1.166587233543396, "rewards/margins": 98.66439819335938, "rewards/rejected": -99.83097839355469, "step": 2860 }, { "epoch": 0.563297350343474, "grad_norm": 181.55258976710394, "learning_rate": 2.3844155340242893e-07, "logits/chosen": 0.3880153298377991, "logits/rejected": 1.2153961658477783, "logps/chosen": -225.6029052734375, "logps/rejected": -322.51715087890625, "loss": -0.4219, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -9.458656311035156, "rewards/margins": 94.21931457519531, "rewards/rejected": -103.677978515625, "step": 2870 }, { "epoch": 0.5652600588812562, "grad_norm": 220.54795795101919, "learning_rate": 2.36730696795826e-07, "logits/chosen": 0.18370838463306427, "logits/rejected": 0.2858714461326599, "logps/chosen": -278.33013916015625, "logps/rejected": -425.53204345703125, "loss": -0.5321, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -38.12256622314453, "rewards/margins": 51.5951042175293, "rewards/rejected": -89.71766662597656, "step": 2880 }, { "epoch": 0.5672227674190383, "grad_norm": 578.8234640908487, "learning_rate": 2.3502046315972655e-07, "logits/chosen": 0.17645081877708435, "logits/rejected": 1.2865142822265625, "logps/chosen": -331.9459228515625, "logps/rejected": -421.33282470703125, "loss": -0.7247, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -13.197916984558105, "rewards/margins": 114.34892272949219, "rewards/rejected": -127.54683685302734, "step": 2890 }, { "epoch": 0.5691854759568205, "grad_norm": 232.0675435051825, "learning_rate": 2.3331093278659906e-07, "logits/chosen": 0.49825865030288696, "logits/rejected": 1.0472362041473389, "logps/chosen": -360.3938293457031, "logps/rejected": -421.24884033203125, "loss": -0.4672, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -51.801918029785156, "rewards/margins": 85.29037475585938, "rewards/rejected": -137.09228515625, "step": 2900 }, { "epoch": 0.5711481844946026, "grad_norm": 471.76705387499675, "learning_rate": 2.31602185935895e-07, "logits/chosen": 0.01983051374554634, "logits/rejected": 1.5712801218032837, "logps/chosen": -293.21490478515625, "logps/rejected": -355.33935546875, "loss": -0.5618, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -9.408475875854492, "rewards/margins": 122.68055725097656, "rewards/rejected": -132.0890350341797, "step": 2910 }, { "epoch": 0.5731108930323847, "grad_norm": 245.42857665560027, "learning_rate": 2.298943028302811e-07, "logits/chosen": -0.35872071981430054, "logits/rejected": 0.5607597827911377, "logps/chosen": -299.25189208984375, "logps/rejected": -461.92694091796875, "loss": -0.6854, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.7750883102417, "rewards/margins": 140.8957061767578, "rewards/rejected": -150.67079162597656, "step": 2920 }, { "epoch": 0.5750736015701668, "grad_norm": 266.0404820595972, "learning_rate": 2.2818736365187242e-07, "logits/chosen": 0.13500066101551056, "logits/rejected": 1.0924553871154785, "logps/chosen": -255.1774139404297, "logps/rejected": -331.12811279296875, "loss": -0.6754, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -28.0899658203125, "rewards/margins": 111.04876708984375, "rewards/rejected": -139.13873291015625, "step": 2930 }, { "epoch": 0.577036310107949, "grad_norm": 334.69553613754675, "learning_rate": 2.2648144853846847e-07, "logits/chosen": 1.2062290906906128, "logits/rejected": 2.1320557594299316, "logps/chosen": -282.756103515625, "logps/rejected": -425.32763671875, "loss": -0.8328, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -23.573299407958984, "rewards/margins": 136.2438201904297, "rewards/rejected": -159.81710815429688, "step": 2940 }, { "epoch": 0.5789990186457311, "grad_norm": 536.1371741380907, "learning_rate": 2.247766375797906e-07, "logits/chosen": 1.191646933555603, "logits/rejected": 1.212264060974121, "logps/chosen": -222.5431671142578, "logps/rejected": -283.1639709472656, "loss": -0.4712, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.147907257080078, "rewards/margins": 47.22834396362305, "rewards/rejected": -78.37625885009766, "step": 2950 }, { "epoch": 0.5809617271835132, "grad_norm": 277.61070176896783, "learning_rate": 2.2307301081372222e-07, "logits/chosen": 1.1878509521484375, "logits/rejected": 1.0988099575042725, "logps/chosen": -266.31292724609375, "logps/rejected": -325.9090881347656, "loss": -0.5037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.415120124816895, "rewards/margins": 41.67178726196289, "rewards/rejected": -50.0869026184082, "step": 2960 }, { "epoch": 0.5829244357212954, "grad_norm": 188.6950936096456, "learning_rate": 2.2137064822255086e-07, "logits/chosen": 0.7357149720191956, "logits/rejected": 1.3781557083129883, "logps/chosen": -268.281005859375, "logps/rejected": -321.81365966796875, "loss": -0.2804, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -47.52278137207031, "rewards/margins": 58.322242736816406, "rewards/rejected": -105.84503173828125, "step": 2970 }, { "epoch": 0.5848871442590775, "grad_norm": 236.25051602619024, "learning_rate": 2.1966962972921322e-07, "logits/chosen": 0.6798173785209656, "logits/rejected": 1.3316466808319092, "logps/chosen": -246.56906127929688, "logps/rejected": -374.881103515625, "loss": -0.4841, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -9.728264808654785, "rewards/margins": 95.40788269042969, "rewards/rejected": -105.13615417480469, "step": 2980 }, { "epoch": 0.5868498527968596, "grad_norm": 306.5765043786579, "learning_rate": 2.1797003519354285e-07, "logits/chosen": 0.7005780935287476, "logits/rejected": 1.2031795978546143, "logps/chosen": -283.9145202636719, "logps/rejected": -374.4034423828125, "loss": -0.4342, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -22.043981552124023, "rewards/margins": 89.0151596069336, "rewards/rejected": -111.05914306640625, "step": 2990 }, { "epoch": 0.5888125613346418, "grad_norm": 165.1066746878023, "learning_rate": 2.1627194440852142e-07, "logits/chosen": 1.7588584423065186, "logits/rejected": 2.171719789505005, "logps/chosen": -332.97210693359375, "logps/rejected": -390.28961181640625, "loss": -0.5126, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -39.196678161621094, "rewards/margins": 92.90324401855469, "rewards/rejected": -132.09991455078125, "step": 3000 }, { "epoch": 0.5907752698724239, "grad_norm": 288.32571922188697, "learning_rate": 2.1457543709653176e-07, "logits/chosen": 0.5540057420730591, "logits/rejected": 1.577968955039978, "logps/chosen": -303.7123107910156, "logps/rejected": -401.205078125, "loss": -0.6076, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -29.9188232421875, "rewards/margins": 119.54066467285156, "rewards/rejected": -149.45950317382812, "step": 3010 }, { "epoch": 0.592737978410206, "grad_norm": 183.1564103096898, "learning_rate": 2.128805929056154e-07, "logits/chosen": 0.8288165330886841, "logits/rejected": 1.4216700792312622, "logps/chosen": -207.1270294189453, "logps/rejected": -338.00469970703125, "loss": -0.4724, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -26.292505264282227, "rewards/margins": 117.13128662109375, "rewards/rejected": -143.42379760742188, "step": 3020 }, { "epoch": 0.5947006869479883, "grad_norm": 453.7931357345026, "learning_rate": 2.1118749140573358e-07, "logits/chosen": 0.2079722136259079, "logits/rejected": 1.3552095890045166, "logps/chosen": -273.36590576171875, "logps/rejected": -342.1899108886719, "loss": -0.7633, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -19.20699119567871, "rewards/margins": 71.76216125488281, "rewards/rejected": -90.96916198730469, "step": 3030 }, { "epoch": 0.5966633954857704, "grad_norm": 361.84275984996304, "learning_rate": 2.0949621208503092e-07, "logits/chosen": 1.0352598428726196, "logits/rejected": 1.4081958532333374, "logps/chosen": -301.6096496582031, "logps/rejected": -272.4801025390625, "loss": -0.2859, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -0.8882265090942383, "rewards/margins": 50.127777099609375, "rewards/rejected": -51.0160026550293, "step": 3040 }, { "epoch": 0.5986261040235525, "grad_norm": 284.93897537011503, "learning_rate": 2.0780683434610413e-07, "logits/chosen": 1.1556251049041748, "logits/rejected": 1.7304332256317139, "logps/chosen": -267.9789123535156, "logps/rejected": -346.31591796875, "loss": -0.5116, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -27.530445098876953, "rewards/margins": 64.04904174804688, "rewards/rejected": -91.57948303222656, "step": 3050 }, { "epoch": 0.6005888125613347, "grad_norm": 482.4916907396394, "learning_rate": 2.0611943750227375e-07, "logits/chosen": 1.165791630744934, "logits/rejected": 1.4937446117401123, "logps/chosen": -295.9093322753906, "logps/rejected": -373.0982666015625, "loss": -0.5185, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -33.31257629394531, "rewards/margins": 96.5605697631836, "rewards/rejected": -129.87313842773438, "step": 3060 }, { "epoch": 0.6025515210991168, "grad_norm": 331.97254237418036, "learning_rate": 2.044341007738612e-07, "logits/chosen": 0.7668994069099426, "logits/rejected": 2.2817959785461426, "logps/chosen": -363.9306335449219, "logps/rejected": -378.9432373046875, "loss": -0.2915, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -13.068647384643555, "rewards/margins": 68.29893493652344, "rewards/rejected": -81.36756896972656, "step": 3070 }, { "epoch": 0.6045142296368989, "grad_norm": 418.3816094045505, "learning_rate": 2.027509032844687e-07, "logits/chosen": 0.9893747568130493, "logits/rejected": 0.8026935458183289, "logps/chosen": -384.5431823730469, "logps/rejected": -456.28131103515625, "loss": -0.4825, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -64.47868347167969, "rewards/margins": 40.07658767700195, "rewards/rejected": -104.5552749633789, "step": 3080 }, { "epoch": 0.6064769381746811, "grad_norm": 1041.4192259949457, "learning_rate": 2.010699240572651e-07, "logits/chosen": 1.1153537034988403, "logits/rejected": 0.9373297691345215, "logps/chosen": -376.4952392578125, "logps/rejected": -405.5873718261719, "loss": -0.5225, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -17.543874740600586, "rewards/margins": 62.97236251831055, "rewards/rejected": -80.51622772216797, "step": 3090 }, { "epoch": 0.6084396467124632, "grad_norm": 195.58219958672998, "learning_rate": 1.993912420112756e-07, "logits/chosen": 1.7597665786743164, "logits/rejected": 1.7504100799560547, "logps/chosen": -331.1658630371094, "logps/rejected": -477.91741943359375, "loss": -0.3962, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -7.719897270202637, "rewards/margins": 86.05127716064453, "rewards/rejected": -93.77116394042969, "step": 3100 }, { "epoch": 0.6104023552502453, "grad_norm": 1212.7554210507622, "learning_rate": 1.9771493595767707e-07, "logits/chosen": 1.1921669244766235, "logits/rejected": 1.6928989887237549, "logps/chosen": -301.7481384277344, "logps/rejected": -412.67529296875, "loss": -0.4302, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -16.064899444580078, "rewards/margins": 69.83645629882812, "rewards/rejected": -85.90135192871094, "step": 3110 }, { "epoch": 0.6123650637880275, "grad_norm": 280.2192733262556, "learning_rate": 1.9604108459609752e-07, "logits/chosen": 2.572514295578003, "logits/rejected": 2.3528380393981934, "logps/chosen": -357.37030029296875, "logps/rejected": -440.0013732910156, "loss": -0.4794, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -37.611942291259766, "rewards/margins": 84.75715637207031, "rewards/rejected": -122.36910247802734, "step": 3120 }, { "epoch": 0.6143277723258096, "grad_norm": 288.41772244279105, "learning_rate": 1.9436976651092142e-07, "logits/chosen": 1.4929567575454712, "logits/rejected": 2.4030117988586426, "logps/chosen": -364.6961975097656, "logps/rejected": -454.45111083984375, "loss": -0.4981, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -56.003074645996094, "rewards/margins": 99.0272445678711, "rewards/rejected": -155.03030395507812, "step": 3130 }, { "epoch": 0.6162904808635917, "grad_norm": 223.58800209954262, "learning_rate": 1.9270106016760035e-07, "logits/chosen": 0.15036991238594055, "logits/rejected": 1.5348567962646484, "logps/chosen": -291.8357238769531, "logps/rejected": -419.71136474609375, "loss": -0.5021, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -28.053186416625977, "rewards/margins": 118.88764953613281, "rewards/rejected": -146.9408416748047, "step": 3140 }, { "epoch": 0.6182531894013739, "grad_norm": 183.3480969124145, "learning_rate": 1.9103504390896944e-07, "logits/chosen": 2.1762146949768066, "logits/rejected": 2.7052228450775146, "logps/chosen": -280.4345397949219, "logps/rejected": -441.0689392089844, "loss": -0.626, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -56.543914794921875, "rewards/margins": 98.6163101196289, "rewards/rejected": -155.16024780273438, "step": 3150 }, { "epoch": 0.620215897939156, "grad_norm": 172.9244867458964, "learning_rate": 1.8937179595156876e-07, "logits/chosen": 0.5130751729011536, "logits/rejected": 1.79129958152771, "logps/chosen": -311.43133544921875, "logps/rejected": -364.58795166015625, "loss": -0.623, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -32.472686767578125, "rewards/margins": 123.1352767944336, "rewards/rejected": -155.6079559326172, "step": 3160 }, { "epoch": 0.6221786064769381, "grad_norm": 234.5070864974775, "learning_rate": 1.8771139438197168e-07, "logits/chosen": 1.4119322299957275, "logits/rejected": 2.310541868209839, "logps/chosen": -310.05572509765625, "logps/rejected": -445.2001953125, "loss": -0.5831, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -17.906932830810547, "rewards/margins": 127.82010650634766, "rewards/rejected": -145.72705078125, "step": 3170 }, { "epoch": 0.6241413150147204, "grad_norm": 249.96181920277226, "learning_rate": 1.8605391715311846e-07, "logits/chosen": 1.629399061203003, "logits/rejected": 3.0770516395568848, "logps/chosen": -320.28265380859375, "logps/rejected": -302.6684265136719, "loss": -0.6057, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -35.234336853027344, "rewards/margins": 60.21760177612305, "rewards/rejected": -95.45193481445312, "step": 3180 }, { "epoch": 0.6261040235525025, "grad_norm": 148.03831268903735, "learning_rate": 1.8439944208065704e-07, "logits/chosen": 1.3731263875961304, "logits/rejected": 2.3876495361328125, "logps/chosen": -372.33026123046875, "logps/rejected": -455.12957763671875, "loss": -0.4608, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -22.71662712097168, "rewards/margins": 105.861328125, "rewards/rejected": -128.5779571533203, "step": 3190 }, { "epoch": 0.6280667320902846, "grad_norm": 244.4534245125885, "learning_rate": 1.8274804683928913e-07, "logits/chosen": 1.1035573482513428, "logits/rejected": 2.6440072059631348, "logps/chosen": -374.5699462890625, "logps/rejected": -428.7478942871094, "loss": -0.4164, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -41.70665740966797, "rewards/margins": 103.33683776855469, "rewards/rejected": -145.0435028076172, "step": 3200 }, { "epoch": 0.6300294406280668, "grad_norm": 109.62242811529732, "learning_rate": 1.810998089591238e-07, "logits/chosen": 0.7609840035438538, "logits/rejected": 1.2874010801315308, "logps/chosen": -283.74871826171875, "logps/rejected": -372.7958984375, "loss": -0.4905, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -30.674938201904297, "rewards/margins": 83.36442565917969, "rewards/rejected": -114.03938293457031, "step": 3210 }, { "epoch": 0.6319921491658489, "grad_norm": 121.27814585355463, "learning_rate": 1.7945480582203745e-07, "logits/chosen": 1.4029531478881836, "logits/rejected": 1.2399829626083374, "logps/chosen": -292.9909362792969, "logps/rejected": -370.7956848144531, "loss": -0.5169, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -44.148494720458984, "rewards/margins": 58.81536102294922, "rewards/rejected": -102.96385192871094, "step": 3220 }, { "epoch": 0.633954857703631, "grad_norm": 280.83982168767716, "learning_rate": 1.7781311465804128e-07, "logits/chosen": 1.577337622642517, "logits/rejected": 2.336193561553955, "logps/chosen": -295.2773132324219, "logps/rejected": -328.74664306640625, "loss": -0.4481, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -16.177501678466797, "rewards/margins": 79.00422668457031, "rewards/rejected": -95.18171691894531, "step": 3230 }, { "epoch": 0.6359175662414132, "grad_norm": 461.85620163691954, "learning_rate": 1.7617481254165487e-07, "logits/chosen": 1.1604465246200562, "logits/rejected": 2.0431575775146484, "logps/chosen": -278.5603332519531, "logps/rejected": -354.8936767578125, "loss": -0.6121, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -6.95641565322876, "rewards/margins": 125.07762145996094, "rewards/rejected": -132.03402709960938, "step": 3240 }, { "epoch": 0.6378802747791953, "grad_norm": 294.72060086783773, "learning_rate": 1.745399763882881e-07, "logits/chosen": 1.236867904663086, "logits/rejected": 2.3780226707458496, "logps/chosen": -307.36224365234375, "logps/rejected": -471.08795166015625, "loss": -0.6499, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -17.60917091369629, "rewards/margins": 176.69235229492188, "rewards/rejected": -194.301513671875, "step": 3250 }, { "epoch": 0.6398429833169774, "grad_norm": 316.18236023568033, "learning_rate": 1.7290868295062983e-07, "logits/chosen": 1.6850658655166626, "logits/rejected": 2.023160219192505, "logps/chosen": -281.6526794433594, "logps/rejected": -408.58892822265625, "loss": -0.6055, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 5.73920202255249, "rewards/margins": 129.40170288085938, "rewards/rejected": -123.6624984741211, "step": 3260 }, { "epoch": 0.6418056918547596, "grad_norm": 538.2140517846747, "learning_rate": 1.7128100881504492e-07, "logits/chosen": 1.0986950397491455, "logits/rejected": 2.613492727279663, "logps/chosen": -297.9154357910156, "logps/rejected": -331.2212829589844, "loss": -0.569, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -11.341866493225098, "rewards/margins": 102.48435974121094, "rewards/rejected": -113.82623291015625, "step": 3270 }, { "epoch": 0.6437684003925417, "grad_norm": 296.57329753749735, "learning_rate": 1.6965703039797808e-07, "logits/chosen": 0.8851855397224426, "logits/rejected": 3.257719039916992, "logps/chosen": -333.4023742675781, "logps/rejected": -404.5556335449219, "loss": -0.7286, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.969829559326172, "rewards/margins": 159.63040161132812, "rewards/rejected": -177.60020446777344, "step": 3280 }, { "epoch": 0.6457311089303238, "grad_norm": 345.10178972612823, "learning_rate": 1.6803682394236656e-07, "logits/chosen": 0.7804575562477112, "logits/rejected": 2.5302646160125732, "logps/chosen": -335.7467041015625, "logps/rejected": -391.03607177734375, "loss": -0.4962, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -3.409665584564209, "rewards/margins": 118.77891540527344, "rewards/rejected": -122.18858337402344, "step": 3290 }, { "epoch": 0.647693817468106, "grad_norm": 234.51865003515888, "learning_rate": 1.664204655140607e-07, "logits/chosen": 1.4312318563461304, "logits/rejected": 1.6907854080200195, "logps/chosen": -250.748291015625, "logps/rejected": -392.40423583984375, "loss": -0.6363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -13.458979606628418, "rewards/margins": 100.32145690917969, "rewards/rejected": -113.78043365478516, "step": 3300 }, { "epoch": 0.6496565260058881, "grad_norm": 281.70637757158755, "learning_rate": 1.6480803099825277e-07, "logits/chosen": 1.3997304439544678, "logits/rejected": 2.446746349334717, "logps/chosen": -266.99725341796875, "logps/rejected": -365.4762878417969, "loss": -0.6901, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -5.368094444274902, "rewards/margins": 138.910888671875, "rewards/rejected": -144.27896118164062, "step": 3310 }, { "epoch": 0.6516192345436702, "grad_norm": 400.467149919209, "learning_rate": 1.6319959609591412e-07, "logits/chosen": 2.444638252258301, "logits/rejected": 3.3924450874328613, "logps/chosen": -283.1401062011719, "logps/rejected": -289.62225341796875, "loss": -0.5487, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -40.56492233276367, "rewards/margins": 53.07103729248047, "rewards/rejected": -93.63597106933594, "step": 3320 }, { "epoch": 0.6535819430814525, "grad_norm": 185.375579267095, "learning_rate": 1.6159523632024126e-07, "logits/chosen": 1.1899499893188477, "logits/rejected": 2.3480947017669678, "logps/chosen": -344.8422546386719, "logps/rejected": -453.46661376953125, "loss": -0.456, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -55.438575744628906, "rewards/margins": 82.45014953613281, "rewards/rejected": -137.88873291015625, "step": 3330 }, { "epoch": 0.6555446516192346, "grad_norm": 136.46339856465588, "learning_rate": 1.599950269931107e-07, "logits/chosen": 2.2227530479431152, "logits/rejected": 2.7438931465148926, "logps/chosen": -344.6984558105469, "logps/rejected": -318.1833190917969, "loss": -0.4392, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -48.4615364074707, "rewards/margins": 17.468292236328125, "rewards/rejected": -65.92981719970703, "step": 3340 }, { "epoch": 0.6575073601570167, "grad_norm": 230.0271625529546, "learning_rate": 1.5839904324154273e-07, "logits/chosen": 1.5149564743041992, "logits/rejected": 2.8948044776916504, "logps/chosen": -296.1535949707031, "logps/rejected": -403.90057373046875, "loss": -0.6306, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -11.129514694213867, "rewards/margins": 97.59844207763672, "rewards/rejected": -108.72795104980469, "step": 3350 }, { "epoch": 0.6594700686947988, "grad_norm": 241.22053261151103, "learning_rate": 1.568073599941742e-07, "logits/chosen": 0.9362170100212097, "logits/rejected": 1.5310771465301514, "logps/chosen": -313.9458312988281, "logps/rejected": -397.6568298339844, "loss": -0.7312, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -15.276570320129395, "rewards/margins": 108.81392669677734, "rewards/rejected": -124.09049987792969, "step": 3360 }, { "epoch": 0.661432777232581, "grad_norm": 324.03389412978737, "learning_rate": 1.552200519777408e-07, "logits/chosen": 1.2166019678115845, "logits/rejected": 3.130192279815674, "logps/chosen": -354.7196044921875, "logps/rejected": -419.1104431152344, "loss": -0.5795, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -36.037296295166016, "rewards/margins": 134.6616973876953, "rewards/rejected": -170.69900512695312, "step": 3370 }, { "epoch": 0.6633954857703631, "grad_norm": 190.25835073505576, "learning_rate": 1.5363719371356882e-07, "logits/chosen": 0.7802735567092896, "logits/rejected": 1.6470670700073242, "logps/chosen": -347.74395751953125, "logps/rejected": -345.00970458984375, "loss": -0.6156, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6.984062194824219, "rewards/margins": 110.39164733886719, "rewards/rejected": -117.3757095336914, "step": 3380 }, { "epoch": 0.6653581943081452, "grad_norm": 219.1929528790628, "learning_rate": 1.5205885951407665e-07, "logits/chosen": 2.0109965801239014, "logits/rejected": 2.1420645713806152, "logps/chosen": -299.0583190917969, "logps/rejected": -444.8160705566406, "loss": -0.691, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -45.24928283691406, "rewards/margins": 96.7963638305664, "rewards/rejected": -142.045654296875, "step": 3390 }, { "epoch": 0.6673209028459274, "grad_norm": 365.378455350356, "learning_rate": 1.5048512347928564e-07, "logits/chosen": 1.6866347789764404, "logits/rejected": 4.299131870269775, "logps/chosen": -358.8126220703125, "logps/rejected": -418.63671875, "loss": -0.5959, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -70.52055358886719, "rewards/margins": 145.75868225097656, "rewards/rejected": -216.2792205810547, "step": 3400 }, { "epoch": 0.6692836113837095, "grad_norm": 278.7569396914758, "learning_rate": 1.4891605949334133e-07, "logits/chosen": 0.7028568983078003, "logits/rejected": 3.0758461952209473, "logps/chosen": -528.3422241210938, "logps/rejected": -545.26953125, "loss": -0.4262, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -55.5322265625, "rewards/margins": 110.14747619628906, "rewards/rejected": -165.67970275878906, "step": 3410 }, { "epoch": 0.6712463199214916, "grad_norm": 397.2812387711312, "learning_rate": 1.4735174122104476e-07, "logits/chosen": 2.603889226913452, "logits/rejected": 3.3954358100891113, "logps/chosen": -256.2731018066406, "logps/rejected": -348.58001708984375, "loss": -0.4006, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -16.873428344726562, "rewards/margins": 132.78973388671875, "rewards/rejected": -149.66317749023438, "step": 3420 }, { "epoch": 0.6732090284592738, "grad_norm": 304.2857262870805, "learning_rate": 1.457922421043943e-07, "logits/chosen": 0.4312248229980469, "logits/rejected": 2.1262800693511963, "logps/chosen": -357.89703369140625, "logps/rejected": -317.8974914550781, "loss": -0.4859, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -37.8548698425293, "rewards/margins": 74.66275787353516, "rewards/rejected": -112.51763916015625, "step": 3430 }, { "epoch": 0.6751717369970559, "grad_norm": 387.62014373355345, "learning_rate": 1.4423763535913704e-07, "logits/chosen": 0.6034333109855652, "logits/rejected": 0.8363513946533203, "logps/chosen": -275.8504333496094, "logps/rejected": -388.68939208984375, "loss": -0.5767, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -8.406243324279785, "rewards/margins": 112.24727630615234, "rewards/rejected": -120.65352630615234, "step": 3440 }, { "epoch": 0.677134445534838, "grad_norm": 251.17708827453578, "learning_rate": 1.426879939713322e-07, "logits/chosen": 1.0980184078216553, "logits/rejected": 1.070042371749878, "logps/chosen": -339.46783447265625, "logps/rejected": -394.92974853515625, "loss": -0.4703, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -35.621665954589844, "rewards/margins": 123.67140197753906, "rewards/rejected": -159.29306030273438, "step": 3450 }, { "epoch": 0.6790971540726202, "grad_norm": 265.8926787931652, "learning_rate": 1.4114339069392374e-07, "logits/chosen": 1.2519810199737549, "logits/rejected": 2.74015736579895, "logps/chosen": -306.773681640625, "logps/rejected": -342.7041015625, "loss": -0.6756, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -10.08450698852539, "rewards/margins": 120.99635314941406, "rewards/rejected": -131.0808563232422, "step": 3460 }, { "epoch": 0.6810598626104023, "grad_norm": 142.08053216952268, "learning_rate": 1.3960389804332556e-07, "logits/chosen": 1.8515313863754272, "logits/rejected": 2.3379006385803223, "logps/chosen": -269.44189453125, "logps/rejected": -415.3699645996094, "loss": -0.4601, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -12.299274444580078, "rewards/margins": 111.2161636352539, "rewards/rejected": -123.51544189453125, "step": 3470 }, { "epoch": 0.6830225711481845, "grad_norm": 297.68343547141916, "learning_rate": 1.380695882960165e-07, "logits/chosen": 2.3755345344543457, "logits/rejected": 3.324944257736206, "logps/chosen": -303.84210205078125, "logps/rejected": -409.0731201171875, "loss": -0.6388, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -30.8568058013916, "rewards/margins": 151.22311401367188, "rewards/rejected": -182.0799102783203, "step": 3480 }, { "epoch": 0.6849852796859667, "grad_norm": 279.77973877221467, "learning_rate": 1.3654053348514702e-07, "logits/chosen": 2.4140167236328125, "logits/rejected": 2.442918300628662, "logps/chosen": -175.77764892578125, "logps/rejected": -345.3222961425781, "loss": -0.7239, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -11.252742767333984, "rewards/margins": 132.3984832763672, "rewards/rejected": -143.6512451171875, "step": 3490 }, { "epoch": 0.6869479882237488, "grad_norm": 460.300392286645, "learning_rate": 1.350168053971577e-07, "logits/chosen": 2.53047776222229, "logits/rejected": 3.1190500259399414, "logps/chosen": -395.7915954589844, "logps/rejected": -388.1199645996094, "loss": -0.3305, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -33.28361511230469, "rewards/margins": 115.11958312988281, "rewards/rejected": -148.4031982421875, "step": 3500 }, { "epoch": 0.6889106967615309, "grad_norm": 413.92216592989024, "learning_rate": 1.3349847556840876e-07, "logits/chosen": 2.0696728229522705, "logits/rejected": 2.69974946975708, "logps/chosen": -287.5434875488281, "logps/rejected": -428.76129150390625, "loss": -0.5497, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -42.18700408935547, "rewards/margins": 119.90152740478516, "rewards/rejected": -162.08853149414062, "step": 3510 }, { "epoch": 0.6908734052993131, "grad_norm": 384.7457818102189, "learning_rate": 1.3198561528182182e-07, "logits/chosen": 1.647242546081543, "logits/rejected": 1.7746429443359375, "logps/chosen": -264.11566162109375, "logps/rejected": -381.8612060546875, "loss": -0.3659, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -65.06902313232422, "rewards/margins": 93.1152114868164, "rewards/rejected": -158.18423461914062, "step": 3520 }, { "epoch": 0.6928361138370952, "grad_norm": 246.02225552085997, "learning_rate": 1.3047829556353263e-07, "logits/chosen": 1.6259530782699585, "logits/rejected": 2.0524356365203857, "logps/chosen": -267.45831298828125, "logps/rejected": -353.01702880859375, "loss": -0.4369, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.631207466125488, "rewards/margins": 98.35829162597656, "rewards/rejected": -112.989501953125, "step": 3530 }, { "epoch": 0.6947988223748773, "grad_norm": 169.49248459736611, "learning_rate": 1.2897658717955742e-07, "logits/chosen": 2.2385401725769043, "logits/rejected": 2.4062373638153076, "logps/chosen": -300.55035400390625, "logps/rejected": -373.66290283203125, "loss": -0.7203, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -38.223567962646484, "rewards/margins": 121.9072494506836, "rewards/rejected": -160.13082885742188, "step": 3540 }, { "epoch": 0.6967615309126595, "grad_norm": 305.6211458361156, "learning_rate": 1.2748056063246994e-07, "logits/chosen": 1.5881072282791138, "logits/rejected": 1.7754720449447632, "logps/chosen": -348.344482421875, "logps/rejected": -392.8487243652344, "loss": -0.4053, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -46.63700866699219, "rewards/margins": 56.822364807128906, "rewards/rejected": -103.45936584472656, "step": 3550 }, { "epoch": 0.6987242394504416, "grad_norm": 244.3085454314864, "learning_rate": 1.2599028615809183e-07, "logits/chosen": 1.9024779796600342, "logits/rejected": 1.784313440322876, "logps/chosen": -313.4281311035156, "logps/rejected": -343.64007568359375, "loss": -0.3804, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -16.648738861083984, "rewards/margins": 88.41041564941406, "rewards/rejected": -105.05915832519531, "step": 3560 }, { "epoch": 0.7006869479882237, "grad_norm": 258.8389666758088, "learning_rate": 1.2450583372219458e-07, "logits/chosen": 2.2555994987487793, "logits/rejected": 1.8444554805755615, "logps/chosen": -301.27239990234375, "logps/rejected": -376.5409851074219, "loss": -0.581, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 1.242764949798584, "rewards/margins": 82.57177734375, "rewards/rejected": -81.32901763916016, "step": 3570 }, { "epoch": 0.7026496565260059, "grad_norm": 259.8600830505897, "learning_rate": 1.230272730172157e-07, "logits/chosen": 1.8105671405792236, "logits/rejected": 2.0636348724365234, "logps/chosen": -310.08135986328125, "logps/rejected": -418.76678466796875, "loss": -0.405, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -13.664457321166992, "rewards/margins": 97.44984436035156, "rewards/rejected": -111.11429595947266, "step": 3580 }, { "epoch": 0.704612365063788, "grad_norm": 147.04742095509496, "learning_rate": 1.2155467345898602e-07, "logits/chosen": 0.6524871587753296, "logits/rejected": 1.2266894578933716, "logps/chosen": -302.1370849609375, "logps/rejected": -405.2377014160156, "loss": -0.3501, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -34.67855453491211, "rewards/margins": 57.04478073120117, "rewards/rejected": -91.72332763671875, "step": 3590 }, { "epoch": 0.7065750736015701, "grad_norm": 310.2071885494148, "learning_rate": 1.2008810418347093e-07, "logits/chosen": 1.3502354621887207, "logits/rejected": 1.7374212741851807, "logps/chosen": -214.0709228515625, "logps/rejected": -325.71087646484375, "loss": -0.5805, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -13.01269817352295, "rewards/margins": 100.90092468261719, "rewards/rejected": -113.91361999511719, "step": 3600 }, { "epoch": 0.7085377821393523, "grad_norm": 264.81278013923355, "learning_rate": 1.1862763404352483e-07, "logits/chosen": 0.3365991413593292, "logits/rejected": 1.8003944158554077, "logps/chosen": -346.4209289550781, "logps/rejected": -401.68988037109375, "loss": -0.6485, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -32.3430061340332, "rewards/margins": 90.04476165771484, "rewards/rejected": -122.38777160644531, "step": 3610 }, { "epoch": 0.7105004906771345, "grad_norm": 272.57531690224744, "learning_rate": 1.1717333160565807e-07, "logits/chosen": 0.9894776344299316, "logits/rejected": 2.2653136253356934, "logps/chosen": -389.22003173828125, "logps/rejected": -423.453369140625, "loss": -0.5542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.51241111755371, "rewards/margins": 109.59358215332031, "rewards/rejected": -141.10598754882812, "step": 3620 }, { "epoch": 0.7124631992149166, "grad_norm": 309.02009455768996, "learning_rate": 1.1572526514681874e-07, "logits/chosen": 1.7682292461395264, "logits/rejected": 1.765679955482483, "logps/chosen": -324.6111755371094, "logps/rejected": -452.42608642578125, "loss": -0.4345, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -26.59149169921875, "rewards/margins": 97.76274871826172, "rewards/rejected": -124.3542251586914, "step": 3630 }, { "epoch": 0.7144259077526988, "grad_norm": 258.0120855386704, "learning_rate": 1.1428350265118613e-07, "logits/chosen": 0.9690055847167969, "logits/rejected": 2.254654884338379, "logps/chosen": -352.84295654296875, "logps/rejected": -410.548583984375, "loss": -0.4062, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -35.663734436035156, "rewards/margins": 97.86629486083984, "rewards/rejected": -133.530029296875, "step": 3640 }, { "epoch": 0.7163886162904809, "grad_norm": 163.1368480023823, "learning_rate": 1.128481118069799e-07, "logits/chosen": 0.5297033190727234, "logits/rejected": 1.7237342596054077, "logps/chosen": -305.6944274902344, "logps/rejected": -466.70855712890625, "loss": -0.6406, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -34.66438674926758, "rewards/margins": 156.79010009765625, "rewards/rejected": -191.4544677734375, "step": 3650 }, { "epoch": 0.718351324828263, "grad_norm": 209.7828282956596, "learning_rate": 1.114191600032815e-07, "logits/chosen": 0.5942263603210449, "logits/rejected": 1.5835633277893066, "logps/chosen": -353.80645751953125, "logps/rejected": -406.839111328125, "loss": -0.566, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -42.19194030761719, "rewards/margins": 91.42488861083984, "rewards/rejected": -133.61683654785156, "step": 3660 }, { "epoch": 0.7203140333660452, "grad_norm": 165.7360403115477, "learning_rate": 1.0999671432687099e-07, "logits/chosen": 0.8667260408401489, "logits/rejected": 2.171340227127075, "logps/chosen": -323.3560485839844, "logps/rejected": -361.23687744140625, "loss": -0.5047, "rewards/accuracies": 0.73333340883255, "rewards/chosen": -24.4227237701416, "rewards/margins": 100.17520141601562, "rewards/rejected": -124.59791564941406, "step": 3670 }, { "epoch": 0.7222767419038273, "grad_norm": 90.46623641084982, "learning_rate": 1.085808415590772e-07, "logits/chosen": -0.16272859275341034, "logits/rejected": 0.37558555603027344, "logps/chosen": -296.73712158203125, "logps/rejected": -376.70184326171875, "loss": -0.479, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -10.97760009765625, "rewards/margins": 117.96441650390625, "rewards/rejected": -128.9420166015625, "step": 3680 }, { "epoch": 0.7242394504416094, "grad_norm": 198.77277060510434, "learning_rate": 1.0717160817264217e-07, "logits/chosen": 0.6684038043022156, "logits/rejected": 2.0979228019714355, "logps/chosen": -320.0564880371094, "logps/rejected": -464.06793212890625, "loss": -0.7192, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -47.57789611816406, "rewards/margins": 147.88043212890625, "rewards/rejected": -195.4583282470703, "step": 3690 }, { "epoch": 0.7262021589793916, "grad_norm": 559.4067605726672, "learning_rate": 1.0576908032860088e-07, "logits/chosen": 1.2240560054779053, "logits/rejected": 2.251919984817505, "logps/chosen": -290.126220703125, "logps/rejected": -351.03192138671875, "loss": -0.5242, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -33.61920928955078, "rewards/margins": 110.39930725097656, "rewards/rejected": -144.0185089111328, "step": 3700 }, { "epoch": 0.7281648675171737, "grad_norm": 225.36143964638654, "learning_rate": 1.0437332387317474e-07, "logits/chosen": 0.10267148166894913, "logits/rejected": 1.5426862239837646, "logps/chosen": -275.28887939453125, "logps/rejected": -324.0962219238281, "loss": -0.3556, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -29.85512924194336, "rewards/margins": 82.32190704345703, "rewards/rejected": -112.17704772949219, "step": 3710 }, { "epoch": 0.7301275760549558, "grad_norm": 381.0312986812787, "learning_rate": 1.0298440433468048e-07, "logits/chosen": -0.1327010691165924, "logits/rejected": 1.3545982837677002, "logps/chosen": -350.61724853515625, "logps/rejected": -377.9830017089844, "loss": -0.5429, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -13.095626831054688, "rewards/margins": 112.74749755859375, "rewards/rejected": -125.84312438964844, "step": 3720 }, { "epoch": 0.732090284592738, "grad_norm": 312.7283497697748, "learning_rate": 1.0160238692045331e-07, "logits/chosen": -0.3035415709018707, "logits/rejected": 1.0891860723495483, "logps/chosen": -291.15777587890625, "logps/rejected": -338.7342224121094, "loss": -0.4859, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -39.76912307739258, "rewards/margins": 90.14669799804688, "rewards/rejected": -129.9158172607422, "step": 3730 }, { "epoch": 0.7340529931305201, "grad_norm": 323.57379442181485, "learning_rate": 1.0022733651378606e-07, "logits/chosen": -0.02376272715628147, "logits/rejected": 1.6784549951553345, "logps/chosen": -408.494140625, "logps/rejected": -411.84375, "loss": -0.3745, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -35.54701232910156, "rewards/margins": 106.29045104980469, "rewards/rejected": -141.83746337890625, "step": 3740 }, { "epoch": 0.7360157016683022, "grad_norm": 203.72492080539834, "learning_rate": 9.88593176708827e-08, "logits/chosen": 0.7709684371948242, "logits/rejected": 0.7810913920402527, "logps/chosen": -302.86419677734375, "logps/rejected": -366.42962646484375, "loss": -0.4955, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -27.58626937866211, "rewards/margins": 53.094276428222656, "rewards/rejected": -80.68054962158203, "step": 3750 }, { "epoch": 0.7379784102060843, "grad_norm": 303.43224661005456, "learning_rate": 9.749839461782769e-08, "logits/chosen": 0.011441946029663086, "logits/rejected": -0.2894682288169861, "logps/chosen": -271.73199462890625, "logps/rejected": -441.5648498535156, "loss": -0.6756, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -22.304996490478516, "rewards/margins": 126.96807861328125, "rewards/rejected": -149.27305603027344, "step": 3760 }, { "epoch": 0.7399411187438666, "grad_norm": 481.47184929772465, "learning_rate": 9.614463124757041e-08, "logits/chosen": 0.39508628845214844, "logits/rejected": 0.5891133546829224, "logps/chosen": -250.55795288085938, "logps/rejected": -323.19146728515625, "loss": -0.3047, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -14.409457206726074, "rewards/margins": 75.74601745605469, "rewards/rejected": -90.15547943115234, "step": 3770 }, { "epoch": 0.7419038272816487, "grad_norm": 222.203049531488, "learning_rate": 9.479809111692586e-08, "logits/chosen": 0.41384345293045044, "logits/rejected": -0.08077137172222137, "logps/chosen": -291.7434997558594, "logps/rejected": -386.6715393066406, "loss": -0.5309, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -56.585548400878906, "rewards/margins": 48.57170104980469, "rewards/rejected": -105.1572494506836, "step": 3780 }, { "epoch": 0.7438665358194309, "grad_norm": 358.06095742681515, "learning_rate": 9.345883744359065e-08, "logits/chosen": 0.6432726979255676, "logits/rejected": 0.2744694650173187, "logps/chosen": -305.78045654296875, "logps/rejected": -428.5538635253906, "loss": -0.5304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -35.73126983642578, "rewards/margins": 56.70947265625, "rewards/rejected": -92.44075012207031, "step": 3790 }, { "epoch": 0.745829244357213, "grad_norm": 257.0185079736378, "learning_rate": 9.212693310317479e-08, "logits/chosen": 0.4275744557380676, "logits/rejected": 0.7536159157752991, "logps/chosen": -286.36309814453125, "logps/rejected": -352.2890625, "loss": -0.3569, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -32.664642333984375, "rewards/margins": 77.56367492675781, "rewards/rejected": -110.22830963134766, "step": 3800 }, { "epoch": 0.7477919528949951, "grad_norm": 288.5250092689165, "learning_rate": 9.08024406262503e-08, "logits/chosen": -0.06111738830804825, "logits/rejected": 0.008484485559165478, "logps/chosen": -259.80963134765625, "logps/rejected": -386.0383605957031, "loss": -0.6137, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -14.924217224121094, "rewards/margins": 121.2860336303711, "rewards/rejected": -136.21023559570312, "step": 3810 }, { "epoch": 0.7497546614327772, "grad_norm": 97.67762932716003, "learning_rate": 8.94854221954148e-08, "logits/chosen": 0.49058040976524353, "logits/rejected": 0.7544366121292114, "logps/chosen": -242.14682006835938, "logps/rejected": -329.55914306640625, "loss": -0.5801, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -8.71400260925293, "rewards/margins": 116.04339599609375, "rewards/rejected": -124.75740051269531, "step": 3820 }, { "epoch": 0.7517173699705594, "grad_norm": 362.06192532755273, "learning_rate": 8.817593964237316e-08, "logits/chosen": 0.23223204910755157, "logits/rejected": 1.0880941152572632, "logps/chosen": -288.9664001464844, "logps/rejected": -371.0241394042969, "loss": -0.7323, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -9.658148765563965, "rewards/margins": 117.62623596191406, "rewards/rejected": -127.28438568115234, "step": 3830 }, { "epoch": 0.7536800785083415, "grad_norm": 222.45991290146569, "learning_rate": 8.68740544450334e-08, "logits/chosen": 0.6538372039794922, "logits/rejected": 2.5161030292510986, "logps/chosen": -366.0320739746094, "logps/rejected": -405.2726745605469, "loss": -0.5504, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -5.696615695953369, "rewards/margins": 131.22412109375, "rewards/rejected": -136.92074584960938, "step": 3840 }, { "epoch": 0.7556427870461236, "grad_norm": 717.138465667547, "learning_rate": 8.557982772462138e-08, "logits/chosen": 1.8596255779266357, "logits/rejected": 2.209252119064331, "logps/chosen": -282.5457458496094, "logps/rejected": -413.7879333496094, "loss": -0.8445, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -19.658872604370117, "rewards/margins": 142.85853576660156, "rewards/rejected": -162.51742553710938, "step": 3850 }, { "epoch": 0.7576054955839058, "grad_norm": 142.2537163753503, "learning_rate": 8.429332024281088e-08, "logits/chosen": 1.1737343072891235, "logits/rejected": 2.674985885620117, "logps/chosen": -313.0780944824219, "logps/rejected": -380.13140869140625, "loss": -0.3671, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -27.172000885009766, "rewards/margins": 136.07864379882812, "rewards/rejected": -163.25064086914062, "step": 3860 }, { "epoch": 0.7595682041216879, "grad_norm": 194.64636178959483, "learning_rate": 8.301459239887073e-08, "logits/chosen": 1.0153484344482422, "logits/rejected": 3.1004929542541504, "logps/chosen": -365.90277099609375, "logps/rejected": -405.7188415527344, "loss": -0.716, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -26.039794921875, "rewards/margins": 113.65787506103516, "rewards/rejected": -139.69766235351562, "step": 3870 }, { "epoch": 0.76153091265947, "grad_norm": 292.799253015931, "learning_rate": 8.17437042268298e-08, "logits/chosen": 0.9066111445426941, "logits/rejected": 1.4293513298034668, "logps/chosen": -349.82415771484375, "logps/rejected": -410.9677734375, "loss": -0.4059, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": -61.107322692871094, "rewards/margins": 43.853843688964844, "rewards/rejected": -104.96116638183594, "step": 3880 }, { "epoch": 0.7634936211972522, "grad_norm": 450.23071256513026, "learning_rate": 8.048071539265761e-08, "logits/chosen": 0.8080563545227051, "logits/rejected": 2.6450655460357666, "logps/chosen": -344.237548828125, "logps/rejected": -363.1737365722656, "loss": -0.523, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -29.7846622467041, "rewards/margins": 102.0052490234375, "rewards/rejected": -131.78990173339844, "step": 3890 }, { "epoch": 0.7654563297350343, "grad_norm": 442.8918649708774, "learning_rate": 7.922568519146425e-08, "logits/chosen": 1.209207534790039, "logits/rejected": 1.6337013244628906, "logps/chosen": -236.75015258789062, "logps/rejected": -388.8421630859375, "loss": -0.6117, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -9.993242263793945, "rewards/margins": 173.70559692382812, "rewards/rejected": -183.69882202148438, "step": 3900 }, { "epoch": 0.7674190382728164, "grad_norm": 238.3320698428565, "learning_rate": 7.79786725447154e-08, "logits/chosen": 1.3150415420532227, "logits/rejected": 1.7168185710906982, "logps/chosen": -289.5064697265625, "logps/rejected": -378.62432861328125, "loss": -0.7813, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -29.609729766845703, "rewards/margins": 133.35765075683594, "rewards/rejected": -162.96737670898438, "step": 3910 }, { "epoch": 0.7693817468105987, "grad_norm": 341.2953124746272, "learning_rate": 7.6739735997467e-08, "logits/chosen": 0.30328941345214844, "logits/rejected": 0.7972557544708252, "logps/chosen": -305.1976623535156, "logps/rejected": -374.2314147949219, "loss": -0.6348, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -7.398439884185791, "rewards/margins": 106.65047454833984, "rewards/rejected": -114.04891204833984, "step": 3920 }, { "epoch": 0.7713444553483808, "grad_norm": 130.20377953007142, "learning_rate": 7.550893371561593e-08, "logits/chosen": 1.5633090734481812, "logits/rejected": 1.7413053512573242, "logps/chosen": -265.47125244140625, "logps/rejected": -367.0103454589844, "loss": -0.6864, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 2.874222993850708, "rewards/margins": 125.1466064453125, "rewards/rejected": -122.27238464355469, "step": 3930 }, { "epoch": 0.7733071638861629, "grad_norm": 177.53461331251899, "learning_rate": 7.428632348317004e-08, "logits/chosen": 0.896818995475769, "logits/rejected": 1.8238928318023682, "logps/chosen": -291.6225280761719, "logps/rejected": -379.63116455078125, "loss": -0.6347, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -43.21448516845703, "rewards/margins": 77.93448638916016, "rewards/rejected": -121.14897155761719, "step": 3940 }, { "epoch": 0.7752698724239451, "grad_norm": 355.2490033380478, "learning_rate": 7.307196269953444e-08, "logits/chosen": 0.5623804330825806, "logits/rejected": 1.8873199224472046, "logps/chosen": -299.43780517578125, "logps/rejected": -382.9126281738281, "loss": -0.5757, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -16.341676712036133, "rewards/margins": 103.789306640625, "rewards/rejected": -120.1309814453125, "step": 3950 }, { "epoch": 0.7772325809617272, "grad_norm": 333.02791884820846, "learning_rate": 7.186590837681732e-08, "logits/chosen": 1.4319672584533691, "logits/rejected": 2.663729429244995, "logps/chosen": -284.7814636230469, "logps/rejected": -335.676513671875, "loss": -0.6028, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.942358016967773, "rewards/margins": 106.50833892822266, "rewards/rejected": -128.45069885253906, "step": 3960 }, { "epoch": 0.7791952894995093, "grad_norm": 200.70420257471542, "learning_rate": 7.066821713715293e-08, "logits/chosen": 0.5648600459098816, "logits/rejected": 2.5612308979034424, "logps/chosen": -322.00958251953125, "logps/rejected": -450.244873046875, "loss": -0.8002, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.121159553527832, "rewards/margins": 157.29432678222656, "rewards/rejected": -164.41549682617188, "step": 3970 }, { "epoch": 0.7811579980372915, "grad_norm": 438.7939154886828, "learning_rate": 6.947894521004357e-08, "logits/chosen": 1.2530823945999146, "logits/rejected": 1.3784297704696655, "logps/chosen": -309.9731140136719, "logps/rejected": -383.11029052734375, "loss": -0.4638, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -27.896488189697266, "rewards/margins": 63.006988525390625, "rewards/rejected": -90.90348815917969, "step": 3980 }, { "epoch": 0.7831207065750736, "grad_norm": 755.0624920692909, "learning_rate": 6.829814842971965e-08, "logits/chosen": 1.7242523431777954, "logits/rejected": 1.5838003158569336, "logps/chosen": -265.7640686035156, "logps/rejected": -372.4088439941406, "loss": -0.4205, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -39.63862609863281, "rewards/margins": 85.05244445800781, "rewards/rejected": -124.6910629272461, "step": 3990 }, { "epoch": 0.7850834151128557, "grad_norm": 397.78536604269027, "learning_rate": 6.712588223251809e-08, "logits/chosen": 0.15322282910346985, "logits/rejected": 1.4203684329986572, "logps/chosen": -383.421142578125, "logps/rejected": -383.93560791015625, "loss": -0.3545, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -42.78777313232422, "rewards/margins": 61.30333709716797, "rewards/rejected": -104.09112548828125, "step": 4000 }, { "epoch": 0.7870461236506379, "grad_norm": 162.3487897649674, "learning_rate": 6.596220165428002e-08, "logits/chosen": 0.6272414922714233, "logits/rejected": 1.2616965770721436, "logps/chosen": -265.5509338378906, "logps/rejected": -371.5789794921875, "loss": -0.5927, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.53268051147461, "rewards/margins": 106.93350982666016, "rewards/rejected": -123.46620178222656, "step": 4010 }, { "epoch": 0.78900883218842, "grad_norm": 230.88984827692337, "learning_rate": 6.48071613277669e-08, "logits/chosen": 1.1987674236297607, "logits/rejected": 1.6930396556854248, "logps/chosen": -274.56378173828125, "logps/rejected": -356.0934753417969, "loss": -0.44, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -34.567588806152344, "rewards/margins": 53.38029861450195, "rewards/rejected": -87.9478988647461, "step": 4020 }, { "epoch": 0.7909715407262021, "grad_norm": 187.75679830790082, "learning_rate": 6.366081548009553e-08, "logits/chosen": 1.311037302017212, "logits/rejected": 2.419053316116333, "logps/chosen": -312.9007568359375, "logps/rejected": -362.9205017089844, "loss": -0.4142, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -45.957069396972656, "rewards/margins": 55.95958709716797, "rewards/rejected": -101.91665649414062, "step": 4030 }, { "epoch": 0.7929342492639843, "grad_norm": 145.60544370516428, "learning_rate": 6.252321793019192e-08, "logits/chosen": 1.6651710271835327, "logits/rejected": 1.679205298423767, "logps/chosen": -245.623046875, "logps/rejected": -347.30169677734375, "loss": -0.56, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -13.886880874633789, "rewards/margins": 85.67262268066406, "rewards/rejected": -99.55952453613281, "step": 4040 }, { "epoch": 0.7948969578017664, "grad_norm": 439.07124592225847, "learning_rate": 6.139442208626517e-08, "logits/chosen": 1.7780866622924805, "logits/rejected": 1.6243947744369507, "logps/chosen": -223.4034423828125, "logps/rejected": -285.50860595703125, "loss": -0.428, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -31.53204917907715, "rewards/margins": 94.08558654785156, "rewards/rejected": -125.61763000488281, "step": 4050 }, { "epoch": 0.7968596663395485, "grad_norm": 327.0604486754587, "learning_rate": 6.027448094329963e-08, "logits/chosen": 0.4172574579715729, "logits/rejected": 0.574047863483429, "logps/chosen": -236.00741577148438, "logps/rejected": -358.6146545410156, "loss": -0.5246, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 7.08117151260376, "rewards/margins": 93.22477722167969, "rewards/rejected": -86.14361572265625, "step": 4060 }, { "epoch": 0.7988223748773308, "grad_norm": 490.6291439769595, "learning_rate": 5.916344708056681e-08, "logits/chosen": 0.927447497844696, "logits/rejected": 1.9099938869476318, "logps/chosen": -286.248779296875, "logps/rejected": -347.1673278808594, "loss": -0.561, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -17.070194244384766, "rewards/margins": 97.71923828125, "rewards/rejected": -114.7894287109375, "step": 4070 }, { "epoch": 0.8007850834151129, "grad_norm": 216.6762728168146, "learning_rate": 5.8061372659157306e-08, "logits/chosen": 0.7928574681282043, "logits/rejected": 2.0397815704345703, "logps/chosen": -347.1395263671875, "logps/rejected": -426.16302490234375, "loss": -0.7094, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 3.2770323753356934, "rewards/margins": 100.47317504882812, "rewards/rejected": -97.1961441040039, "step": 4080 }, { "epoch": 0.802747791952895, "grad_norm": 351.3322000805113, "learning_rate": 5.6968309419531376e-08, "logits/chosen": 0.5025372505187988, "logits/rejected": 1.037649393081665, "logps/chosen": -327.008544921875, "logps/rejected": -356.27886962890625, "loss": -0.4612, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -37.835044860839844, "rewards/margins": 66.51990509033203, "rewards/rejected": -104.35494232177734, "step": 4090 }, { "epoch": 0.8047105004906772, "grad_norm": 505.12633519862123, "learning_rate": 5.5884308679090525e-08, "logits/chosen": 0.9266678094863892, "logits/rejected": 3.7431774139404297, "logps/chosen": -289.0246276855469, "logps/rejected": -349.05731201171875, "loss": -0.6185, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -41.9423828125, "rewards/margins": 123.00175476074219, "rewards/rejected": -164.94412231445312, "step": 4100 }, { "epoch": 0.8066732090284593, "grad_norm": 218.91844844788974, "learning_rate": 5.480942132976732e-08, "logits/chosen": 0.7196097373962402, "logits/rejected": 2.6659862995147705, "logps/chosen": -321.7400817871094, "logps/rejected": -329.4438781738281, "loss": -0.6874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1335063874721527, "rewards/margins": 135.5712890625, "rewards/rejected": -135.70480346679688, "step": 4110 }, { "epoch": 0.8086359175662414, "grad_norm": 275.9408643046396, "learning_rate": 5.374369783563698e-08, "logits/chosen": 0.6858819127082825, "logits/rejected": 1.181862473487854, "logps/chosen": -319.489013671875, "logps/rejected": -437.139404296875, "loss": -0.5752, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -40.33269500732422, "rewards/margins": 117.47265625, "rewards/rejected": -157.80535888671875, "step": 4120 }, { "epoch": 0.8105986261040236, "grad_norm": 259.9904942446092, "learning_rate": 5.268718823054752e-08, "logits/chosen": 0.6993392705917358, "logits/rejected": 1.6426143646240234, "logps/chosen": -291.2001647949219, "logps/rejected": -404.6143493652344, "loss": -0.5519, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -44.98619079589844, "rewards/margins": 112.06353759765625, "rewards/rejected": -157.0497283935547, "step": 4130 }, { "epoch": 0.8125613346418057, "grad_norm": 369.8978673367864, "learning_rate": 5.1639942115771384e-08, "logits/chosen": 1.37197744846344, "logits/rejected": 0.9585355520248413, "logps/chosen": -290.52984619140625, "logps/rejected": -297.7582702636719, "loss": -0.3268, "rewards/accuracies": 0.5, "rewards/chosen": -57.57798385620117, "rewards/margins": 22.77359962463379, "rewards/rejected": -80.35157775878906, "step": 4140 }, { "epoch": 0.8145240431795878, "grad_norm": 396.7142798917053, "learning_rate": 5.060200865767605e-08, "logits/chosen": 1.0594708919525146, "logits/rejected": 1.8188024759292603, "logps/chosen": -407.10394287109375, "logps/rejected": -454.1275939941406, "loss": -0.6956, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -33.43864822387695, "rewards/margins": 117.25416564941406, "rewards/rejected": -150.6927947998047, "step": 4150 }, { "epoch": 0.81648675171737, "grad_norm": 346.76432338866283, "learning_rate": 4.957343658541632e-08, "logits/chosen": 1.1963056325912476, "logits/rejected": 1.8911014795303345, "logps/chosen": -236.36544799804688, "logps/rejected": -420.71527099609375, "loss": -0.698, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -5.097572326660156, "rewards/margins": 145.96742248535156, "rewards/rejected": -151.06500244140625, "step": 4160 }, { "epoch": 0.8184494602551521, "grad_norm": 609.2657880796065, "learning_rate": 4.8554274188646215e-08, "logits/chosen": 0.6010076403617859, "logits/rejected": 2.1071629524230957, "logps/chosen": -310.2207946777344, "logps/rejected": -350.9004821777344, "loss": -0.3548, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -50.050010681152344, "rewards/margins": 90.51728820800781, "rewards/rejected": -140.5673065185547, "step": 4170 }, { "epoch": 0.8204121687929342, "grad_norm": 299.5026463466133, "learning_rate": 4.754456931525208e-08, "logits/chosen": 1.6258262395858765, "logits/rejected": 1.4584633111953735, "logps/chosen": -285.9049987792969, "logps/rejected": -380.3423767089844, "loss": -0.6576, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -18.34375762939453, "rewards/margins": 92.11905670166016, "rewards/rejected": -110.4627914428711, "step": 4180 }, { "epoch": 0.8223748773307163, "grad_norm": 206.57628278491475, "learning_rate": 4.654436936910622e-08, "logits/chosen": -0.2231508046388626, "logits/rejected": 1.7128015756607056, "logps/chosen": -319.3303527832031, "logps/rejected": -415.04248046875, "loss": -0.6426, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -14.945584297180176, "rewards/margins": 155.222900390625, "rewards/rejected": -170.16847229003906, "step": 4190 }, { "epoch": 0.8243375858684985, "grad_norm": 105.2173841997405, "learning_rate": 4.555372130784102e-08, "logits/chosen": 0.2965846061706543, "logits/rejected": 1.2652969360351562, "logps/chosen": -401.03765869140625, "logps/rejected": -413.9496154785156, "loss": -0.7065, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -20.450271606445312, "rewards/margins": 93.99417114257812, "rewards/rejected": -114.4444580078125, "step": 4200 }, { "epoch": 0.8263002944062807, "grad_norm": 410.71746386465617, "learning_rate": 4.45726716406449e-08, "logits/chosen": 0.3647512197494507, "logits/rejected": 0.4564463198184967, "logps/chosen": -344.83673095703125, "logps/rejected": -329.1011657714844, "loss": -0.3075, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -35.848228454589844, "rewards/margins": 34.870399475097656, "rewards/rejected": -70.7186279296875, "step": 4210 }, { "epoch": 0.8282630029440629, "grad_norm": 384.917290181706, "learning_rate": 4.360126642607842e-08, "logits/chosen": 0.2658080756664276, "logits/rejected": 2.3667030334472656, "logps/chosen": -358.5342102050781, "logps/rejected": -396.6856384277344, "loss": -0.6292, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -28.129785537719727, "rewards/margins": 130.65847778320312, "rewards/rejected": -158.7882537841797, "step": 4220 }, { "epoch": 0.830225711481845, "grad_norm": 295.7881487019572, "learning_rate": 4.2639551269912034e-08, "logits/chosen": 0.7166069149971008, "logits/rejected": 1.645583152770996, "logps/chosen": -218.4636688232422, "logps/rejected": -284.74822998046875, "loss": -0.5446, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -9.48962688446045, "rewards/margins": 84.1449203491211, "rewards/rejected": -93.63453674316406, "step": 4230 }, { "epoch": 0.8321884200196271, "grad_norm": 351.7452805992152, "learning_rate": 4.168757132298478e-08, "logits/chosen": 1.1947740316390991, "logits/rejected": 1.9185377359390259, "logps/chosen": -316.4347229003906, "logps/rejected": -383.91925048828125, "loss": -0.3942, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -57.603797912597656, "rewards/margins": 49.7264404296875, "rewards/rejected": -107.33023834228516, "step": 4240 }, { "epoch": 0.8341511285574092, "grad_norm": 814.0917068512831, "learning_rate": 4.0745371279084976e-08, "logits/chosen": 0.057543229311704636, "logits/rejected": 1.2063575983047485, "logps/chosen": -250.46231079101562, "logps/rejected": -333.850341796875, "loss": -0.4427, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1185879707336426, "rewards/margins": 107.40657043457031, "rewards/rejected": -109.5251693725586, "step": 4250 }, { "epoch": 0.8361138370951914, "grad_norm": 258.54518752910525, "learning_rate": 3.9812995372851544e-08, "logits/chosen": 1.233195424079895, "logits/rejected": 1.9567787647247314, "logps/chosen": -276.5019836425781, "logps/rejected": -377.66082763671875, "loss": -0.579, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -29.312414169311523, "rewards/margins": 123.75312805175781, "rewards/rejected": -153.06552124023438, "step": 4260 }, { "epoch": 0.8380765456329735, "grad_norm": 239.6401093504306, "learning_rate": 3.8890487377697265e-08, "logits/chosen": 0.14301064610481262, "logits/rejected": 0.6716960668563843, "logps/chosen": -271.7588195800781, "logps/rejected": -368.71533203125, "loss": -0.7104, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.645185470581055, "rewards/margins": 113.19022369384766, "rewards/rejected": -130.8354034423828, "step": 4270 }, { "epoch": 0.8400392541707556, "grad_norm": 290.5098880742371, "learning_rate": 3.7977890603754e-08, "logits/chosen": 0.24163508415222168, "logits/rejected": 1.7025432586669922, "logps/chosen": -356.93133544921875, "logps/rejected": -414.83697509765625, "loss": -0.4283, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -7.9239091873168945, "rewards/margins": 93.69638061523438, "rewards/rejected": -101.62028503417969, "step": 4280 }, { "epoch": 0.8420019627085378, "grad_norm": 220.99981121958942, "learning_rate": 3.707524789583891e-08, "logits/chosen": 0.3767244815826416, "logits/rejected": 1.6875295639038086, "logps/chosen": -340.16790771484375, "logps/rejected": -454.8905334472656, "loss": -0.5899, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -26.359344482421875, "rewards/margins": 119.7392578125, "rewards/rejected": -146.09860229492188, "step": 4290 }, { "epoch": 0.8439646712463199, "grad_norm": 302.29119119914856, "learning_rate": 3.6182601631443596e-08, "logits/chosen": 0.2487632930278778, "logits/rejected": 0.7517064809799194, "logps/chosen": -321.514892578125, "logps/rejected": -384.12884521484375, "loss": -0.7531, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 1.8332033157348633, "rewards/margins": 118.40811920166016, "rewards/rejected": -116.57490539550781, "step": 4300 }, { "epoch": 0.845927379784102, "grad_norm": 194.97602036985367, "learning_rate": 3.529999371874381e-08, "logits/chosen": 1.1088536977767944, "logits/rejected": 2.119696617126465, "logps/chosen": -296.379638671875, "logps/rejected": -368.9100036621094, "loss": -0.3116, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -14.844305038452148, "rewards/margins": 91.52473449707031, "rewards/rejected": -106.3690414428711, "step": 4310 }, { "epoch": 0.8478900883218842, "grad_norm": 394.0520603985482, "learning_rate": 3.4427465594632555e-08, "logits/chosen": 0.7034338712692261, "logits/rejected": 2.448606252670288, "logps/chosen": -210.2288360595703, "logps/rejected": -337.64605712890625, "loss": -0.8014, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -9.511738777160645, "rewards/margins": 150.11659240722656, "rewards/rejected": -159.62832641601562, "step": 4320 }, { "epoch": 0.8498527968596663, "grad_norm": 150.8570059388215, "learning_rate": 3.356505822277417e-08, "logits/chosen": 0.8254325985908508, "logits/rejected": 1.6962730884552002, "logps/chosen": -305.8452453613281, "logps/rejected": -367.0594177246094, "loss": -0.4724, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -35.75792694091797, "rewards/margins": 68.71248626708984, "rewards/rejected": -104.47042083740234, "step": 4330 }, { "epoch": 0.8518155053974484, "grad_norm": 353.8048196765354, "learning_rate": 3.271281209168186e-08, "logits/chosen": 0.6131759881973267, "logits/rejected": 2.7262377738952637, "logps/chosen": -309.7735595703125, "logps/rejected": -435.9186096191406, "loss": -0.6477, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -33.432395935058594, "rewards/margins": 186.08941650390625, "rewards/rejected": -219.5218048095703, "step": 4340 }, { "epoch": 0.8537782139352306, "grad_norm": 383.2116725375334, "learning_rate": 3.187076721281595e-08, "logits/chosen": -0.23545953631401062, "logits/rejected": 0.920207679271698, "logps/chosen": -248.1889190673828, "logps/rejected": -330.3977966308594, "loss": -0.5731, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -2.5350608825683594, "rewards/margins": 88.65583038330078, "rewards/rejected": -91.1908950805664, "step": 4350 }, { "epoch": 0.8557409224730128, "grad_norm": 341.9486490830222, "learning_rate": 3.1038963118706244e-08, "logits/chosen": 1.2642277479171753, "logits/rejected": 2.6628224849700928, "logps/chosen": -305.7078552246094, "logps/rejected": -417.4297790527344, "loss": -0.6431, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -28.319299697875977, "rewards/margins": 156.50613403320312, "rewards/rejected": -184.82545471191406, "step": 4360 }, { "epoch": 0.8577036310107949, "grad_norm": 273.98996550540824, "learning_rate": 3.0217438861095315e-08, "logits/chosen": 0.22947652637958527, "logits/rejected": 0.4285706579685211, "logps/chosen": -212.7989501953125, "logps/rejected": -338.49627685546875, "loss": -0.4624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.231478691101074, "rewards/margins": 110.1903305053711, "rewards/rejected": -116.42181396484375, "step": 4370 }, { "epoch": 0.8596663395485771, "grad_norm": 185.10845009673469, "learning_rate": 2.940623300910572e-08, "logits/chosen": -0.6255972385406494, "logits/rejected": 3.2109932899475098, "logps/chosen": -319.77105712890625, "logps/rejected": -424.9895935058594, "loss": -0.795, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.09875945746898651, "rewards/margins": 223.38894653320312, "rewards/rejected": -223.29019165039062, "step": 4380 }, { "epoch": 0.8616290480863592, "grad_norm": 323.2968043969208, "learning_rate": 2.860538364742898e-08, "logits/chosen": -0.2271687537431717, "logits/rejected": 1.6766523122787476, "logps/chosen": -372.1040954589844, "logps/rejected": -314.25537109375, "loss": -0.5056, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -2.2704625129699707, "rewards/margins": 91.76817321777344, "rewards/rejected": -94.03864288330078, "step": 4390 }, { "epoch": 0.8635917566241413, "grad_norm": 277.72244186597015, "learning_rate": 2.7814928374537334e-08, "logits/chosen": 0.20215849578380585, "logits/rejected": 1.1914180517196655, "logps/chosen": -246.9439239501953, "logps/rejected": -311.5885009765625, "loss": -0.471, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -20.785297393798828, "rewards/margins": 87.80458068847656, "rewards/rejected": -108.58988189697266, "step": 4400 }, { "epoch": 0.8655544651619235, "grad_norm": 420.6807121605461, "learning_rate": 2.7034904300918982e-08, "logits/chosen": 1.1174386739730835, "logits/rejected": 1.056130051612854, "logps/chosen": -253.21585083007812, "logps/rejected": -417.9261169433594, "loss": -0.5231, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -31.55215072631836, "rewards/margins": 123.5484619140625, "rewards/rejected": -155.10061645507812, "step": 4410 }, { "epoch": 0.8675171736997056, "grad_norm": 250.21456798870338, "learning_rate": 2.62653480473356e-08, "logits/chosen": -0.5521323680877686, "logits/rejected": 0.5378388166427612, "logps/chosen": -293.2830505371094, "logps/rejected": -350.2652893066406, "loss": -0.4848, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -29.301555633544922, "rewards/margins": 73.45783996582031, "rewards/rejected": -102.75938415527344, "step": 4420 }, { "epoch": 0.8694798822374877, "grad_norm": 415.18101305656563, "learning_rate": 2.550629574310309e-08, "logits/chosen": 1.223892331123352, "logits/rejected": 1.8861782550811768, "logps/chosen": -275.0828857421875, "logps/rejected": -405.38543701171875, "loss": -0.6553, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -35.61486053466797, "rewards/margins": 86.40784454345703, "rewards/rejected": -122.022705078125, "step": 4430 }, { "epoch": 0.8714425907752699, "grad_norm": 208.67365775539722, "learning_rate": 2.475778302439524e-08, "logits/chosen": -0.16906538605690002, "logits/rejected": 1.70730721950531, "logps/chosen": -348.5227355957031, "logps/rejected": -451.517822265625, "loss": -0.7052, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -31.86871910095215, "rewards/margins": 170.1761016845703, "rewards/rejected": -202.04483032226562, "step": 4440 }, { "epoch": 0.873405299313052, "grad_norm": 349.6349361120186, "learning_rate": 2.4019845032570875e-08, "logits/chosen": 0.5358771085739136, "logits/rejected": 1.282859444618225, "logps/chosen": -285.733154296875, "logps/rejected": -432.317626953125, "loss": -0.7335, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -13.590060234069824, "rewards/margins": 141.32090759277344, "rewards/rejected": -154.91098022460938, "step": 4450 }, { "epoch": 0.8753680078508341, "grad_norm": 283.2495966315424, "learning_rate": 2.3292516412524054e-08, "logits/chosen": -0.04940909147262573, "logits/rejected": 1.5180885791778564, "logps/chosen": -352.1752014160156, "logps/rejected": -361.323974609375, "loss": -0.5562, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -57.4222297668457, "rewards/margins": 83.98561096191406, "rewards/rejected": -141.4078369140625, "step": 4460 }, { "epoch": 0.8773307163886163, "grad_norm": 590.2024998291921, "learning_rate": 2.2575831311057225e-08, "logits/chosen": 0.27069857716560364, "logits/rejected": 2.1589179039001465, "logps/chosen": -250.70681762695312, "logps/rejected": -436.9722595214844, "loss": -0.4881, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -17.34808921813965, "rewards/margins": 185.95175170898438, "rewards/rejected": -203.2998504638672, "step": 4470 }, { "epoch": 0.8792934249263984, "grad_norm": 234.56569368498648, "learning_rate": 2.1869823375278483e-08, "logits/chosen": 0.6429430842399597, "logits/rejected": 2.53413987159729, "logps/chosen": -273.86456298828125, "logps/rejected": -452.06494140625, "loss": -0.7035, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -60.50865936279297, "rewards/margins": 189.64010620117188, "rewards/rejected": -250.1487579345703, "step": 4480 }, { "epoch": 0.8812561334641805, "grad_norm": 133.18651889639486, "learning_rate": 2.1174525751021578e-08, "logits/chosen": 1.0968296527862549, "logits/rejected": 1.6707656383514404, "logps/chosen": -286.2945556640625, "logps/rejected": -429.34136962890625, "loss": -0.6453, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -14.320622444152832, "rewards/margins": 143.9236602783203, "rewards/rejected": -158.24429321289062, "step": 4490 }, { "epoch": 0.8832188420019627, "grad_norm": 208.06989814235044, "learning_rate": 2.0489971081290193e-08, "logits/chosen": 0.28894519805908203, "logits/rejected": 1.3963991403579712, "logps/chosen": -314.00762939453125, "logps/rejected": -371.5, "loss": -0.5372, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -15.929452896118164, "rewards/margins": 115.1006088256836, "rewards/rejected": -131.03005981445312, "step": 4500 }, { "epoch": 0.8851815505397449, "grad_norm": 290.0724683505124, "learning_rate": 1.9816191504724826e-08, "logits/chosen": -0.5656202435493469, "logits/rejected": 0.5285595655441284, "logps/chosen": -236.2646484375, "logps/rejected": -363.3451843261719, "loss": -0.7221, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -10.014857292175293, "rewards/margins": 138.04251098632812, "rewards/rejected": -148.05735778808594, "step": 4510 }, { "epoch": 0.887144259077527, "grad_norm": 256.04984750637175, "learning_rate": 1.9153218654094498e-08, "logits/chosen": 0.3918920159339905, "logits/rejected": 1.6313140392303467, "logps/chosen": -289.8053894042969, "logps/rejected": -390.0482177734375, "loss": -0.6173, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -10.703710556030273, "rewards/margins": 137.1112060546875, "rewards/rejected": -147.81491088867188, "step": 4520 }, { "epoch": 0.8891069676153092, "grad_norm": 289.09621609689054, "learning_rate": 1.8501083654811206e-08, "logits/chosen": 0.5931957364082336, "logits/rejected": 0.6276336908340454, "logps/chosen": -338.95697021484375, "logps/rejected": -414.01190185546875, "loss": -0.7391, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -42.674705505371094, "rewards/margins": 110.20075988769531, "rewards/rejected": -152.87545776367188, "step": 4530 }, { "epoch": 0.8910696761530913, "grad_norm": 351.31332864253557, "learning_rate": 1.7859817123469068e-08, "logits/chosen": 1.5052107572555542, "logits/rejected": 1.5364171266555786, "logps/chosen": -233.75607299804688, "logps/rejected": -320.6417236328125, "loss": -0.388, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -32.718772888183594, "rewards/margins": 58.3486442565918, "rewards/rejected": -91.06742858886719, "step": 4540 }, { "epoch": 0.8930323846908734, "grad_norm": 222.72125554681512, "learning_rate": 1.7229449166406477e-08, "logits/chosen": 0.37584689259529114, "logits/rejected": 1.6475623846054077, "logps/chosen": -358.82379150390625, "logps/rejected": -423.95526123046875, "loss": -0.6306, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -28.88714027404785, "rewards/margins": 132.3856201171875, "rewards/rejected": -161.2727508544922, "step": 4550 }, { "epoch": 0.8949950932286556, "grad_norm": 269.894549429393, "learning_rate": 1.66100093782931e-08, "logits/chosen": 0.5159591436386108, "logits/rejected": 1.579828143119812, "logps/chosen": -282.75341796875, "logps/rejected": -495.56402587890625, "loss": -0.8634, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -20.95271110534668, "rewards/margins": 183.48341369628906, "rewards/rejected": -204.4361114501953, "step": 4560 }, { "epoch": 0.8969578017664377, "grad_norm": 252.9639222655981, "learning_rate": 1.600152684074005e-08, "logits/chosen": 0.23654143512248993, "logits/rejected": 1.355732798576355, "logps/chosen": -366.46575927734375, "logps/rejected": -434.37322998046875, "loss": -0.4786, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -43.1453742980957, "rewards/margins": 74.18260192871094, "rewards/rejected": -117.3279800415039, "step": 4570 }, { "epoch": 0.8989205103042198, "grad_norm": 258.081907888107, "learning_rate": 1.540403012093483e-08, "logits/chosen": 0.13490648567676544, "logits/rejected": 1.4607415199279785, "logps/chosen": -297.9033203125, "logps/rejected": -315.2236633300781, "loss": -0.7211, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 10.252466201782227, "rewards/margins": 86.24171447753906, "rewards/rejected": -75.98924255371094, "step": 4580 }, { "epoch": 0.900883218842002, "grad_norm": 322.93829995750394, "learning_rate": 1.4817547270300185e-08, "logits/chosen": 0.5269209146499634, "logits/rejected": 0.8512266874313354, "logps/chosen": -317.64691162109375, "logps/rejected": -471.1598205566406, "loss": -0.3715, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -49.822837829589844, "rewards/margins": 70.89936828613281, "rewards/rejected": -120.72221374511719, "step": 4590 }, { "epoch": 0.9028459273797841, "grad_norm": 217.8857894740021, "learning_rate": 1.4242105823176837e-08, "logits/chosen": 0.020373066887259483, "logits/rejected": 2.338036298751831, "logps/chosen": -308.81036376953125, "logps/rejected": -413.1044921875, "loss": -0.7181, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 11.094378471374512, "rewards/margins": 169.42678833007812, "rewards/rejected": -158.33242797851562, "step": 4600 }, { "epoch": 0.9048086359175662, "grad_norm": 375.1278537276694, "learning_rate": 1.3677732795531083e-08, "logits/chosen": 0.8167325854301453, "logits/rejected": 1.3168516159057617, "logps/chosen": -307.87786865234375, "logps/rejected": -451.4591369628906, "loss": -0.544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -43.726715087890625, "rewards/margins": 93.41045379638672, "rewards/rejected": -137.1371612548828, "step": 4610 }, { "epoch": 0.9067713444553483, "grad_norm": 147.87957779599003, "learning_rate": 1.3124454683686364e-08, "logits/chosen": 0.45479816198349, "logits/rejected": 0.7868109941482544, "logps/chosen": -317.99322509765625, "logps/rejected": -419.85888671875, "loss": -0.5282, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -68.1338119506836, "rewards/margins": 88.10193634033203, "rewards/rejected": -156.23574829101562, "step": 4620 }, { "epoch": 0.9087340529931305, "grad_norm": 170.49463587556912, "learning_rate": 1.2582297463079288e-08, "logits/chosen": 0.4195406436920166, "logits/rejected": 2.507051706314087, "logps/chosen": -266.1393127441406, "logps/rejected": -328.5851135253906, "loss": -0.5295, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -37.0715446472168, "rewards/margins": 136.57232666015625, "rewards/rejected": -173.6438751220703, "step": 4630 }, { "epoch": 0.9106967615309126, "grad_norm": 124.71832674996668, "learning_rate": 1.2051286587040049e-08, "logits/chosen": 0.41768568754196167, "logits/rejected": 1.3550302982330322, "logps/chosen": -302.3821716308594, "logps/rejected": -406.23291015625, "loss": -0.6192, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -22.018672943115234, "rewards/margins": 107.51679992675781, "rewards/rejected": -129.5354766845703, "step": 4640 }, { "epoch": 0.9126594700686947, "grad_norm": 174.39805397430672, "learning_rate": 1.1531446985597604e-08, "logits/chosen": 0.2835673391819, "logits/rejected": 1.0853602886199951, "logps/chosen": -398.7691345214844, "logps/rejected": -419.5128479003906, "loss": -0.5427, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -23.566162109375, "rewards/margins": 109.63841247558594, "rewards/rejected": -133.20458984375, "step": 4650 }, { "epoch": 0.914622178606477, "grad_norm": 185.3400576110855, "learning_rate": 1.1022803064309194e-08, "logits/chosen": 0.5096344351768494, "logits/rejected": 1.0800716876983643, "logps/chosen": -350.9262390136719, "logps/rejected": -509.3897399902344, "loss": -0.6601, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -33.37424850463867, "rewards/margins": 95.1714096069336, "rewards/rejected": -128.54566955566406, "step": 4660 }, { "epoch": 0.9165848871442591, "grad_norm": 162.28773941593772, "learning_rate": 1.0525378703114401e-08, "logits/chosen": -1.083046793937683, "logits/rejected": 0.14212393760681152, "logps/chosen": -238.0006866455078, "logps/rejected": -280.20257568359375, "loss": -0.4034, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -27.042510986328125, "rewards/margins": 63.006874084472656, "rewards/rejected": -90.04937744140625, "step": 4670 }, { "epoch": 0.9185475956820413, "grad_norm": 272.7718579878306, "learning_rate": 1.0039197255214238e-08, "logits/chosen": 0.2609715163707733, "logits/rejected": 0.8051837086677551, "logps/chosen": -244.71841430664062, "logps/rejected": -375.2208251953125, "loss": -0.5975, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -56.447486877441406, "rewards/margins": 87.3301010131836, "rewards/rejected": -143.777587890625, "step": 4680 }, { "epoch": 0.9205103042198234, "grad_norm": 257.3650997787213, "learning_rate": 9.564281545974661e-09, "logits/chosen": 0.007027420215308666, "logits/rejected": 0.8426044583320618, "logps/chosen": -259.75238037109375, "logps/rejected": -389.84857177734375, "loss": -0.5895, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -1.872671127319336, "rewards/margins": 138.10362243652344, "rewards/rejected": -139.97628784179688, "step": 4690 }, { "epoch": 0.9224730127576055, "grad_norm": 371.0909879912845, "learning_rate": 9.100653871854963e-09, "logits/chosen": -0.013437772169709206, "logits/rejected": 0.2783178687095642, "logps/chosen": -333.48175048828125, "logps/rejected": -392.1842346191406, "loss": -0.6072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -37.490272521972656, "rewards/margins": 65.58151245117188, "rewards/rejected": -103.07179260253906, "step": 4700 }, { "epoch": 0.9244357212953876, "grad_norm": 281.8196916198271, "learning_rate": 8.648335999360934e-09, "logits/chosen": 0.1921078860759735, "logits/rejected": 1.6628574132919312, "logps/chosen": -286.959716796875, "logps/rejected": -335.0383605957031, "loss": -0.6888, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -25.70328140258789, "rewards/margins": 99.55555725097656, "rewards/rejected": -125.25882720947266, "step": 4710 }, { "epoch": 0.9263984298331698, "grad_norm": 349.0223170273162, "learning_rate": 8.207349164023047e-09, "logits/chosen": 1.5743603706359863, "logits/rejected": 1.799330711364746, "logps/chosen": -289.4337463378906, "logps/rejected": -403.5376892089844, "loss": -0.4601, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -24.794387817382812, "rewards/margins": 117.5296630859375, "rewards/rejected": -142.32403564453125, "step": 4720 }, { "epoch": 0.9283611383709519, "grad_norm": 475.055288626442, "learning_rate": 7.777714069399532e-09, "logits/chosen": 0.0723201259970665, "logits/rejected": 1.574939489364624, "logps/chosen": -309.9425048828125, "logps/rejected": -379.8987121582031, "loss": -0.606, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -36.66429138183594, "rewards/margins": 88.56461334228516, "rewards/rejected": -125.2289047241211, "step": 4730 }, { "epoch": 0.930323846908734, "grad_norm": 221.80323352458163, "learning_rate": 7.359450886104263e-09, "logits/chosen": 0.6704687476158142, "logits/rejected": 2.1186952590942383, "logps/chosen": -345.60015869140625, "logps/rejected": -446.39013671875, "loss": -0.5609, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -47.869842529296875, "rewards/margins": 155.35009765625, "rewards/rejected": -203.21994018554688, "step": 4740 }, { "epoch": 0.9322865554465162, "grad_norm": 275.5212677879844, "learning_rate": 6.9525792508597634e-09, "logits/chosen": -0.15146800875663757, "logits/rejected": -0.13504230976104736, "logps/chosen": -305.78619384765625, "logps/rejected": -354.84466552734375, "loss": -0.6163, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.82454490661621, "rewards/margins": 51.58939743041992, "rewards/rejected": -68.41394805908203, "step": 4750 }, { "epoch": 0.9342492639842983, "grad_norm": 503.8696271643102, "learning_rate": 6.557118265575451e-09, "logits/chosen": 0.7123690843582153, "logits/rejected": 1.1601670980453491, "logps/chosen": -345.5361328125, "logps/rejected": -394.48638916015625, "loss": -0.7137, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -40.36691665649414, "rewards/margins": 78.15025329589844, "rewards/rejected": -118.51716613769531, "step": 4760 }, { "epoch": 0.9362119725220804, "grad_norm": 363.9435644530817, "learning_rate": 6.1730864964507636e-09, "logits/chosen": -0.12066509574651718, "logits/rejected": 1.1250253915786743, "logps/chosen": -306.2756652832031, "logps/rejected": -320.68316650390625, "loss": -0.7471, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9.50562572479248, "rewards/margins": 72.00847625732422, "rewards/rejected": -81.51410675048828, "step": 4770 }, { "epoch": 0.9381746810598626, "grad_norm": 219.3754347694064, "learning_rate": 5.8005019731033615e-09, "logits/chosen": -0.09917403757572174, "logits/rejected": 1.4547569751739502, "logps/chosen": -298.7986145019531, "logps/rejected": -388.39215087890625, "loss": -0.5021, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -18.909420013427734, "rewards/margins": 146.1449432373047, "rewards/rejected": -165.0543670654297, "step": 4780 }, { "epoch": 0.9401373895976447, "grad_norm": 391.55353836484426, "learning_rate": 5.439382187722968e-09, "logits/chosen": -0.4104865491390228, "logits/rejected": 1.3586753606796265, "logps/chosen": -417.8433532714844, "logps/rejected": -426.7972106933594, "loss": -0.4973, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -34.476287841796875, "rewards/margins": 105.99763488769531, "rewards/rejected": -140.4739227294922, "step": 4790 }, { "epoch": 0.9421000981354269, "grad_norm": 266.94716437698855, "learning_rate": 5.089744094249837e-09, "logits/chosen": -0.7528950572013855, "logits/rejected": 2.4677882194519043, "logps/chosen": -420.2793884277344, "logps/rejected": -461.388916015625, "loss": -0.4952, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -50.36171340942383, "rewards/margins": 146.85055541992188, "rewards/rejected": -197.2122802734375, "step": 4800 }, { "epoch": 0.9440628066732091, "grad_norm": 409.1331980960002, "learning_rate": 4.751604107579077e-09, "logits/chosen": -0.7290471792221069, "logits/rejected": 0.8242856860160828, "logps/chosen": -294.11724853515625, "logps/rejected": -413.388427734375, "loss": -0.6069, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -10.515897750854492, "rewards/margins": 156.26760864257812, "rewards/rejected": -166.78350830078125, "step": 4810 }, { "epoch": 0.9460255152109912, "grad_norm": 329.83492348886745, "learning_rate": 4.424978102789661e-09, "logits/chosen": -0.3126320242881775, "logits/rejected": 1.815192461013794, "logps/chosen": -415.0078125, "logps/rejected": -398.8657531738281, "loss": -0.5737, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -8.481783866882324, "rewards/margins": 138.76304626464844, "rewards/rejected": -147.2448272705078, "step": 4820 }, { "epoch": 0.9479882237487733, "grad_norm": 146.79394961172602, "learning_rate": 4.109881414399524e-09, "logits/chosen": 0.2610158324241638, "logits/rejected": 1.550054907798767, "logps/chosen": -296.30718994140625, "logps/rejected": -419.08721923828125, "loss": -0.5933, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -17.670312881469727, "rewards/margins": 120.4037857055664, "rewards/rejected": -138.0740966796875, "step": 4830 }, { "epoch": 0.9499509322865555, "grad_norm": 354.72111363647065, "learning_rate": 3.806328835645272e-09, "logits/chosen": 1.0623828172683716, "logits/rejected": 1.783556580543518, "logps/chosen": -293.881591796875, "logps/rejected": -383.89410400390625, "loss": -0.3878, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -52.4915771484375, "rewards/margins": 102.1007308959961, "rewards/rejected": -154.59231567382812, "step": 4840 }, { "epoch": 0.9519136408243376, "grad_norm": 328.23163632228284, "learning_rate": 3.5143346177878565e-09, "logits/chosen": -0.5292250514030457, "logits/rejected": 0.4050324559211731, "logps/chosen": -361.75628662109375, "logps/rejected": -405.6460266113281, "loss": -0.5574, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 7.2587385177612305, "rewards/margins": 143.06585693359375, "rewards/rejected": -135.80711364746094, "step": 4850 }, { "epoch": 0.9538763493621197, "grad_norm": 523.8293870872152, "learning_rate": 3.233912469443545e-09, "logits/chosen": 0.2311282902956009, "logits/rejected": 2.5509095191955566, "logps/chosen": -357.62530517578125, "logps/rejected": -387.13946533203125, "loss": -0.4784, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -29.86185646057129, "rewards/margins": 137.68093872070312, "rewards/rejected": -167.5428009033203, "step": 4860 }, { "epoch": 0.9558390578999019, "grad_norm": 761.3471269440555, "learning_rate": 2.9650755559401388e-09, "logits/chosen": 0.3595990538597107, "logits/rejected": 1.5829707384109497, "logps/chosen": -351.6505432128906, "logps/rejected": -460.19842529296875, "loss": -0.4697, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -4.7347412109375, "rewards/margins": 141.64712524414062, "rewards/rejected": -146.38186645507812, "step": 4870 }, { "epoch": 0.957801766437684, "grad_norm": 373.4846896173969, "learning_rate": 2.7078364986990175e-09, "logits/chosen": 1.0008618831634521, "logits/rejected": 1.9765441417694092, "logps/chosen": -432.39422607421875, "logps/rejected": -442.600341796875, "loss": -0.6348, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -36.65483474731445, "rewards/margins": 96.07183837890625, "rewards/rejected": -132.7266845703125, "step": 4880 }, { "epoch": 0.9597644749754661, "grad_norm": 559.491522237486, "learning_rate": 2.4622073746426165e-09, "logits/chosen": 0.026527557522058487, "logits/rejected": 0.9185472726821899, "logps/chosen": -299.48248291015625, "logps/rejected": -402.83929443359375, "loss": -0.4808, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -25.574132919311523, "rewards/margins": 152.12991333007812, "rewards/rejected": -177.70404052734375, "step": 4890 }, { "epoch": 0.9617271835132483, "grad_norm": 555.2586103731929, "learning_rate": 2.2281997156273213e-09, "logits/chosen": 0.6086570024490356, "logits/rejected": 1.4897328615188599, "logps/chosen": -335.24920654296875, "logps/rejected": -352.65753173828125, "loss": -0.3788, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -22.903039932250977, "rewards/margins": 71.0635757446289, "rewards/rejected": -93.96661376953125, "step": 4900 }, { "epoch": 0.9636898920510304, "grad_norm": 156.6382222757462, "learning_rate": 2.0058245079021265e-09, "logits/chosen": -0.09430718421936035, "logits/rejected": 1.1005871295928955, "logps/chosen": -274.4385986328125, "logps/rejected": -320.4398193359375, "loss": -0.7222, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.693513870239258, "rewards/margins": 103.73075103759766, "rewards/rejected": -118.42427825927734, "step": 4910 }, { "epoch": 0.9656526005888125, "grad_norm": 151.0863886461919, "learning_rate": 1.7950921915928784e-09, "logits/chosen": 1.0376745462417603, "logits/rejected": 2.0094356536865234, "logps/chosen": -289.60052490234375, "logps/rejected": -330.909912109375, "loss": -0.5896, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -24.24953269958496, "rewards/margins": 95.75595092773438, "rewards/rejected": -120.00547790527344, "step": 4920 }, { "epoch": 0.9676153091265947, "grad_norm": 166.83864869826812, "learning_rate": 1.596012660212087e-09, "logits/chosen": 0.8260341882705688, "logits/rejected": 2.2165873050689697, "logps/chosen": -346.9764709472656, "logps/rejected": -354.48797607421875, "loss": -0.6885, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.28257179260254, "rewards/margins": 106.68492126464844, "rewards/rejected": -134.96749877929688, "step": 4930 }, { "epoch": 0.9695780176643768, "grad_norm": 448.83254470903984, "learning_rate": 1.408595260194434e-09, "logits/chosen": 0.4163898527622223, "logits/rejected": 2.3370816707611084, "logps/chosen": -365.4043884277344, "logps/rejected": -368.92205810546875, "loss": -0.4861, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -50.28551483154297, "rewards/margins": 117.5450668334961, "rewards/rejected": -167.83059692382812, "step": 4940 }, { "epoch": 0.971540726202159, "grad_norm": 384.94449721636045, "learning_rate": 1.2328487904580131e-09, "logits/chosen": 0.29007938504219055, "logits/rejected": 1.312359094619751, "logps/chosen": -285.5650939941406, "logps/rejected": -396.947021484375, "loss": -0.594, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -71.74400329589844, "rewards/margins": 105.836669921875, "rewards/rejected": -177.58067321777344, "step": 4950 }, { "epoch": 0.9735034347399412, "grad_norm": 179.93188002657797, "learning_rate": 1.0687815019912173e-09, "logits/chosen": 1.147339940071106, "logits/rejected": 1.9018316268920898, "logps/chosen": -297.005126953125, "logps/rejected": -486.56402587890625, "loss": -0.8956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.530776977539062, "rewards/margins": 138.8377685546875, "rewards/rejected": -155.36854553222656, "step": 4960 }, { "epoch": 0.9754661432777233, "grad_norm": 171.77469049926168, "learning_rate": 9.164010974653802e-10, "logits/chosen": 0.19017255306243896, "logits/rejected": 0.98823082447052, "logps/chosen": -268.375732421875, "logps/rejected": -376.2149353027344, "loss": -0.5995, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": -0.4377630352973938, "rewards/margins": 103.22297668457031, "rewards/rejected": -103.6607437133789, "step": 4970 }, { "epoch": 0.9774288518155054, "grad_norm": 315.626629910148, "learning_rate": 7.757147308731504e-10, "logits/chosen": -0.06003303453326225, "logits/rejected": 1.8267772197723389, "logps/chosen": -317.6167907714844, "logps/rejected": -414.8966369628906, "loss": -0.5966, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -9.810222625732422, "rewards/margins": 135.85130310058594, "rewards/rejected": -145.66152954101562, "step": 4980 }, { "epoch": 0.9793915603532876, "grad_norm": 209.33760832517214, "learning_rate": 6.467290071925646e-10, "logits/chosen": 0.8949087858200073, "logits/rejected": 0.8340100049972534, "logps/chosen": -269.3585205078125, "logps/rejected": -375.28399658203125, "loss": -0.4299, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -55.363548278808594, "rewards/margins": 97.962646484375, "rewards/rejected": -153.32620239257812, "step": 4990 }, { "epoch": 0.9813542688910697, "grad_norm": 197.62618569733775, "learning_rate": 5.29449982077046e-10, "logits/chosen": 0.3139314651489258, "logits/rejected": 1.3526438474655151, "logps/chosen": -324.19903564453125, "logps/rejected": -368.36492919921875, "loss": -0.6841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -39.67112350463867, "rewards/margins": 114.58052062988281, "rewards/rejected": -154.25164794921875, "step": 5000 }, { "epoch": 0.9833169774288518, "grad_norm": 226.4959541478614, "learning_rate": 4.2388316157104806e-10, "logits/chosen": 0.5193920731544495, "logits/rejected": 2.0736169815063477, "logps/chosen": -278.8150634765625, "logps/rejected": -372.239990234375, "loss": -0.7287, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -10.922646522521973, "rewards/margins": 139.96482849121094, "rewards/rejected": -150.88748168945312, "step": 5010 }, { "epoch": 0.985279685966634, "grad_norm": 237.75346125543064, "learning_rate": 3.300335018515676e-10, "logits/chosen": -0.4149685800075531, "logits/rejected": 1.0835611820220947, "logps/chosen": -233.6608428955078, "logps/rejected": -297.69451904296875, "loss": -0.6249, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -19.882762908935547, "rewards/margins": 113.03089904785156, "rewards/rejected": -132.9136505126953, "step": 5020 }, { "epoch": 0.9872423945044161, "grad_norm": 162.94130412364044, "learning_rate": 2.4790540899546907e-10, "logits/chosen": 0.4125348627567291, "logits/rejected": 1.145112156867981, "logps/chosen": -248.94131469726562, "logps/rejected": -460.9149475097656, "loss": -0.6704, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.890644073486328, "rewards/margins": 158.34779357910156, "rewards/rejected": -185.23843383789062, "step": 5030 }, { "epoch": 0.9892051030421982, "grad_norm": 364.84350185093984, "learning_rate": 1.7750273877262244e-10, "logits/chosen": 0.2277032881975174, "logits/rejected": 1.6701894998550415, "logps/chosen": -314.9948425292969, "logps/rejected": -410.34228515625, "loss": -0.704, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -26.898975372314453, "rewards/margins": 122.16634368896484, "rewards/rejected": -149.06533813476562, "step": 5040 }, { "epoch": 0.9911678115799804, "grad_norm": 279.13631035832304, "learning_rate": 1.1882879646485379e-10, "logits/chosen": 0.7624462842941284, "logits/rejected": 2.067471504211426, "logps/chosen": -255.6316375732422, "logps/rejected": -410.8294372558594, "loss": -0.9275, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -30.428985595703125, "rewards/margins": 164.67681884765625, "rewards/rejected": -195.10580444335938, "step": 5050 }, { "epoch": 0.9931305201177625, "grad_norm": 248.91485646622564, "learning_rate": 7.188633671079136e-11, "logits/chosen": 0.11063379049301147, "logits/rejected": 1.7005183696746826, "logps/chosen": -313.30157470703125, "logps/rejected": -358.89654541015625, "loss": -0.404, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -25.09392738342285, "rewards/margins": 124.01396179199219, "rewards/rejected": -149.10787963867188, "step": 5060 }, { "epoch": 0.9950932286555446, "grad_norm": 184.30903455395242, "learning_rate": 3.6677563376580344e-11, "logits/chosen": -0.1692899465560913, "logits/rejected": 1.1702044010162354, "logps/chosen": -288.71173095703125, "logps/rejected": -450.014404296875, "loss": -0.4779, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -23.89110565185547, "rewards/margins": 115.5859146118164, "rewards/rejected": -139.47702026367188, "step": 5070 }, { "epoch": 0.9970559371933267, "grad_norm": 356.10024359954843, "learning_rate": 1.3204129452354385e-11, "logits/chosen": 0.4353364109992981, "logits/rejected": 1.390868067741394, "logps/chosen": -288.8840637207031, "logps/rejected": -421.11041259765625, "loss": -0.6044, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.802814483642578, "rewards/margins": 118.05680847167969, "rewards/rejected": -135.859619140625, "step": 5080 }, { "epoch": 0.9990186457311089, "grad_norm": 342.9402544399462, "learning_rate": 1.467136974631078e-12, "logits/chosen": 0.0052385092712938786, "logits/rejected": 1.605820655822754, "logps/chosen": -289.46038818359375, "logps/rejected": -320.14251708984375, "loss": -0.8449, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -17.538249969482422, "rewards/margins": 100.76835632324219, "rewards/rejected": -118.3066177368164, "step": 5090 }, { "epoch": 1.0, "step": 5095, "total_flos": 0.0, "train_loss": -0.4546862483018428, "train_runtime": 14609.9717, "train_samples_per_second": 4.184, "train_steps_per_second": 0.349 } ], "logging_steps": 10, "max_steps": 5095, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }