diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7692 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5095, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00019627085377821394, + "grad_norm": 55.42576454689225, + "learning_rate": 9.803921568627451e-10, + "logits/chosen": -2.9195547103881836, + "logits/rejected": -2.4565553665161133, + "logps/chosen": -421.782470703125, + "logps/rejected": -89.33955383300781, + "loss": 0.0, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.001962708537782139, + "grad_norm": 52.06135973086457, + "learning_rate": 9.803921568627451e-09, + "logits/chosen": -2.558222770690918, + "logits/rejected": -2.5535826683044434, + "logps/chosen": -328.5440673828125, + "logps/rejected": -224.7199249267578, + "loss": 0.0003, + "rewards/accuracies": 0.3333333134651184, + "rewards/chosen": -0.03320746868848801, + "rewards/margins": -0.14528942108154297, + "rewards/rejected": 0.11208193749189377, + "step": 10 + }, + { + "epoch": 0.003925417075564278, + "grad_norm": 56.89658523828712, + "learning_rate": 1.9607843137254902e-08, + "logits/chosen": -2.7485036849975586, + "logits/rejected": -2.6489720344543457, + "logps/chosen": -241.45883178710938, + "logps/rejected": -228.8603515625, + "loss": -0.0004, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1173335462808609, + "rewards/margins": 0.1355145424604416, + "rewards/rejected": -0.018181007355451584, + "step": 20 + }, + { + "epoch": 0.005888125613346418, + "grad_norm": 45.255056406635454, + "learning_rate": 2.941176470588235e-08, + "logits/chosen": -2.8057525157928467, + "logits/rejected": -2.749558687210083, + "logps/chosen": -271.7970886230469, + "logps/rejected": -277.11260986328125, + "loss": 0.0009, + "rewards/accuracies": 0.36666667461395264, + "rewards/chosen": -0.06314592063426971, + "rewards/margins": -0.1372196227312088, + "rewards/rejected": 0.0740736722946167, + "step": 30 + }, + { + "epoch": 0.007850834151128557, + "grad_norm": 55.405585195341, + "learning_rate": 3.9215686274509804e-08, + "logits/chosen": -2.5288026332855225, + "logits/rejected": -2.613417148590088, + "logps/chosen": -236.00308227539062, + "logps/rejected": -199.1337890625, + "loss": -0.0002, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 0.15636861324310303, + "rewards/margins": 0.1928229033946991, + "rewards/rejected": -0.036454297602176666, + "step": 40 + }, + { + "epoch": 0.009813542688910697, + "grad_norm": 49.01526447310567, + "learning_rate": 4.901960784313725e-08, + "logits/chosen": -2.7653937339782715, + "logits/rejected": -2.7310125827789307, + "logps/chosen": -265.25726318359375, + "logps/rejected": -283.57275390625, + "loss": -0.0002, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 0.16415968537330627, + "rewards/margins": 0.13651719689369202, + "rewards/rejected": 0.027642499655485153, + "step": 50 + }, + { + "epoch": 0.011776251226692836, + "grad_norm": 46.62264667378852, + "learning_rate": 5.88235294117647e-08, + "logits/chosen": -2.7559211254119873, + "logits/rejected": -2.671520233154297, + "logps/chosen": -254.8046112060547, + "logps/rejected": -236.09078979492188, + "loss": -0.0004, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08886684477329254, + "rewards/margins": 0.049677345901727676, + "rewards/rejected": 0.039189498871564865, + "step": 60 + }, + { + "epoch": 0.013738959764474975, + "grad_norm": 48.26772678837694, + "learning_rate": 6.862745098039216e-08, + "logits/chosen": -2.837052345275879, + "logits/rejected": -2.7726972103118896, + "logps/chosen": -301.92010498046875, + "logps/rejected": -237.0230255126953, + "loss": -0.0006, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 0.18614128232002258, + "rewards/margins": -0.07164065539836884, + "rewards/rejected": 0.2577819228172302, + "step": 70 + }, + { + "epoch": 0.015701668302257114, + "grad_norm": 56.72082827271601, + "learning_rate": 7.843137254901961e-08, + "logits/chosen": -2.7584192752838135, + "logits/rejected": -2.5587003231048584, + "logps/chosen": -324.06365966796875, + "logps/rejected": -213.2407684326172, + "loss": -0.0006, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2116052806377411, + "rewards/margins": 0.003895642701536417, + "rewards/rejected": 0.2077096402645111, + "step": 80 + }, + { + "epoch": 0.017664376840039256, + "grad_norm": 56.49868797565438, + "learning_rate": 8.823529411764706e-08, + "logits/chosen": -2.821058750152588, + "logits/rejected": -2.7977123260498047, + "logps/chosen": -269.60736083984375, + "logps/rejected": -272.5506896972656, + "loss": -0.0022, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 0.5445754528045654, + "rewards/margins": 0.1933358609676361, + "rewards/rejected": 0.35123956203460693, + "step": 90 + }, + { + "epoch": 0.019627085377821395, + "grad_norm": 47.073945639503265, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -2.792196750640869, + "logits/rejected": -2.6972861289978027, + "logps/chosen": -295.4858093261719, + "logps/rejected": -259.53302001953125, + "loss": -0.002, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 0.536602258682251, + "rewards/margins": 0.15680420398712158, + "rewards/rejected": 0.379798024892807, + "step": 100 + }, + { + "epoch": 0.021589793915603533, + "grad_norm": 55.31346254119081, + "learning_rate": 1.0784313725490195e-07, + "logits/chosen": -2.7970364093780518, + "logits/rejected": -2.7083146572113037, + "logps/chosen": -331.5827331542969, + "logps/rejected": -292.47857666015625, + "loss": -0.0036, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.154811143875122, + "rewards/margins": 0.6904684901237488, + "rewards/rejected": 0.4643428325653076, + "step": 110 + }, + { + "epoch": 0.023552502453385672, + "grad_norm": 42.62283781546687, + "learning_rate": 1.176470588235294e-07, + "logits/chosen": -2.647566080093384, + "logits/rejected": -2.5511529445648193, + "logps/chosen": -202.06785583496094, + "logps/rejected": -188.60385131835938, + "loss": -0.0076, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": 0.8458110690116882, + "rewards/margins": 1.0164639949798584, + "rewards/rejected": -0.17065294086933136, + "step": 120 + }, + { + "epoch": 0.02551521099116781, + "grad_norm": 49.90985311951694, + "learning_rate": 1.2745098039215685e-07, + "logits/chosen": -2.591831922531128, + "logits/rejected": -2.6268253326416016, + "logps/chosen": -359.3841552734375, + "logps/rejected": -300.1329650878906, + "loss": -0.0079, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 1.7609771490097046, + "rewards/margins": 1.052795171737671, + "rewards/rejected": 0.7081820368766785, + "step": 130 + }, + { + "epoch": 0.02747791952894995, + "grad_norm": 69.08265173603021, + "learning_rate": 1.3725490196078432e-07, + "logits/chosen": -2.686288833618164, + "logits/rejected": -2.7173526287078857, + "logps/chosen": -192.25914001464844, + "logps/rejected": -193.92724609375, + "loss": -0.0115, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.9333054423332214, + "rewards/margins": 1.2931725978851318, + "rewards/rejected": -0.3598671555519104, + "step": 140 + }, + { + "epoch": 0.029440628066732092, + "grad_norm": 54.562941227728246, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": -2.8407418727874756, + "logits/rejected": -2.7342238426208496, + "logps/chosen": -233.2815399169922, + "logps/rejected": -225.36019897460938, + "loss": -0.0131, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 0.8612493276596069, + "rewards/margins": 0.8497702479362488, + "rewards/rejected": 0.011479055508971214, + "step": 150 + }, + { + "epoch": 0.03140333660451423, + "grad_norm": 63.47402541398964, + "learning_rate": 1.5686274509803921e-07, + "logits/chosen": -2.7791740894317627, + "logits/rejected": -2.6811375617980957, + "logps/chosen": -277.4623107910156, + "logps/rejected": -219.72213745117188, + "loss": -0.0285, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": 1.0771534442901611, + "rewards/margins": 2.586575984954834, + "rewards/rejected": -1.5094225406646729, + "step": 160 + }, + { + "epoch": 0.033366045142296366, + "grad_norm": 56.67134098750124, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -2.669949769973755, + "logits/rejected": -2.6452701091766357, + "logps/chosen": -248.5420684814453, + "logps/rejected": -208.90786743164062, + "loss": -0.0263, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -0.44815006852149963, + "rewards/margins": 1.9631726741790771, + "rewards/rejected": -2.411322832107544, + "step": 170 + }, + { + "epoch": 0.03532875368007851, + "grad_norm": 39.40690681772726, + "learning_rate": 1.764705882352941e-07, + "logits/chosen": -2.751427173614502, + "logits/rejected": -2.6481759548187256, + "logps/chosen": -259.8966369628906, + "logps/rejected": -251.5900421142578, + "loss": -0.0233, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.0558886528015137, + "rewards/margins": 3.4843649864196777, + "rewards/rejected": -4.540253639221191, + "step": 180 + }, + { + "epoch": 0.03729146221786065, + "grad_norm": 62.47158031773993, + "learning_rate": 1.8627450980392158e-07, + "logits/chosen": -2.6491169929504395, + "logits/rejected": -2.590907335281372, + "logps/chosen": -318.29046630859375, + "logps/rejected": -260.68206787109375, + "loss": -0.0243, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -0.11146416515111923, + "rewards/margins": 3.8212521076202393, + "rewards/rejected": -3.9327163696289062, + "step": 190 + }, + { + "epoch": 0.03925417075564279, + "grad_norm": 67.22803692873129, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": -2.7688140869140625, + "logits/rejected": -2.574352741241455, + "logps/chosen": -285.4090881347656, + "logps/rejected": -201.56417846679688, + "loss": -0.0417, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.4215115308761597, + "rewards/margins": 6.695929050445557, + "rewards/rejected": -5.274416923522949, + "step": 200 + }, + { + "epoch": 0.04121687929342493, + "grad_norm": 55.69330259375479, + "learning_rate": 2.0588235294117645e-07, + "logits/chosen": -2.5630764961242676, + "logits/rejected": -2.483159303665161, + "logps/chosen": -244.29983520507812, + "logps/rejected": -241.3026885986328, + "loss": -0.0648, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.9128227233886719, + "rewards/margins": 3.919031858444214, + "rewards/rejected": -5.831854343414307, + "step": 210 + }, + { + "epoch": 0.04317958783120707, + "grad_norm": 57.469004520548765, + "learning_rate": 2.156862745098039e-07, + "logits/chosen": -2.7338674068450928, + "logits/rejected": -2.6867470741271973, + "logps/chosen": -291.15673828125, + "logps/rejected": -295.14581298828125, + "loss": -0.0637, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.957396984100342, + "rewards/margins": 6.729005336761475, + "rewards/rejected": -9.6864013671875, + "step": 220 + }, + { + "epoch": 0.045142296368989206, + "grad_norm": 52.20088600279648, + "learning_rate": 2.2549019607843137e-07, + "logits/chosen": -2.765036106109619, + "logits/rejected": -2.6319448947906494, + "logps/chosen": -271.0700378417969, + "logps/rejected": -213.5701904296875, + "loss": -0.0735, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -4.162587642669678, + "rewards/margins": 6.882012367248535, + "rewards/rejected": -11.044599533081055, + "step": 230 + }, + { + "epoch": 0.047105004906771344, + "grad_norm": 61.77295677969488, + "learning_rate": 2.352941176470588e-07, + "logits/chosen": -2.6707522869110107, + "logits/rejected": -2.6531193256378174, + "logps/chosen": -251.86428833007812, + "logps/rejected": -264.7287292480469, + "loss": -0.0829, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.9166555404663086, + "rewards/margins": 15.396039962768555, + "rewards/rejected": -17.312694549560547, + "step": 240 + }, + { + "epoch": 0.04906771344455348, + "grad_norm": 73.89409979808553, + "learning_rate": 2.4509803921568627e-07, + "logits/chosen": -2.629805564880371, + "logits/rejected": -2.540875196456909, + "logps/chosen": -247.9143829345703, + "logps/rejected": -218.74600219726562, + "loss": -0.0971, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -8.48637866973877, + "rewards/margins": 7.239996433258057, + "rewards/rejected": -15.726374626159668, + "step": 250 + }, + { + "epoch": 0.05103042198233562, + "grad_norm": 66.29074269764887, + "learning_rate": 2.549019607843137e-07, + "logits/chosen": -2.72763991355896, + "logits/rejected": -2.7005534172058105, + "logps/chosen": -320.3201904296875, + "logps/rejected": -263.1239318847656, + "loss": -0.0952, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.5693891048431396, + "rewards/margins": 11.3147554397583, + "rewards/rejected": -13.88414478302002, + "step": 260 + }, + { + "epoch": 0.05299313052011776, + "grad_norm": 69.40161302997811, + "learning_rate": 2.6470588235294114e-07, + "logits/chosen": -2.6740689277648926, + "logits/rejected": -2.636819362640381, + "logps/chosen": -230.25936889648438, + "logps/rejected": -232.55703735351562, + "loss": -0.0559, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.6644139289855957, + "rewards/margins": 6.941485404968262, + "rewards/rejected": -9.605899810791016, + "step": 270 + }, + { + "epoch": 0.0549558390578999, + "grad_norm": 65.05492968329817, + "learning_rate": 2.7450980392156863e-07, + "logits/chosen": -2.69507098197937, + "logits/rejected": -2.600647449493408, + "logps/chosen": -255.4451446533203, + "logps/rejected": -230.79037475585938, + "loss": -0.1374, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -10.016531944274902, + "rewards/margins": 18.90683937072754, + "rewards/rejected": -28.923370361328125, + "step": 280 + }, + { + "epoch": 0.05691854759568204, + "grad_norm": 80.60280571552023, + "learning_rate": 2.8431372549019607e-07, + "logits/chosen": -2.8089046478271484, + "logits/rejected": -2.670240879058838, + "logps/chosen": -299.865234375, + "logps/rejected": -243.9931640625, + "loss": -0.1361, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 4.855011463165283, + "rewards/margins": 17.643901824951172, + "rewards/rejected": -12.788888931274414, + "step": 290 + }, + { + "epoch": 0.058881256133464184, + "grad_norm": 114.26931921522811, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -2.6850318908691406, + "logits/rejected": -2.6374764442443848, + "logps/chosen": -279.58856201171875, + "logps/rejected": -308.7133483886719, + "loss": -0.1432, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 3.7034823894500732, + "rewards/margins": 12.2323637008667, + "rewards/rejected": -8.528882026672363, + "step": 300 + }, + { + "epoch": 0.06084396467124632, + "grad_norm": 98.35876940848675, + "learning_rate": 3.0392156862745094e-07, + "logits/chosen": -2.6629366874694824, + "logits/rejected": -2.5560824871063232, + "logps/chosen": -322.3955993652344, + "logps/rejected": -268.99041748046875, + "loss": -0.1189, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.34133529663086, + "rewards/margins": 12.180948257446289, + "rewards/rejected": -32.52228546142578, + "step": 310 + }, + { + "epoch": 0.06280667320902845, + "grad_norm": 142.72415154736706, + "learning_rate": 3.1372549019607843e-07, + "logits/chosen": -2.648869514465332, + "logits/rejected": -2.6653428077697754, + "logps/chosen": -231.400390625, + "logps/rejected": -255.35140991210938, + "loss": -0.0871, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": -27.71343421936035, + "rewards/margins": 3.059386730194092, + "rewards/rejected": -30.772823333740234, + "step": 320 + }, + { + "epoch": 0.0647693817468106, + "grad_norm": 122.94034076448233, + "learning_rate": 3.2352941176470586e-07, + "logits/chosen": -2.735877513885498, + "logits/rejected": -2.6852054595947266, + "logps/chosen": -296.5848083496094, + "logps/rejected": -266.68878173828125, + "loss": -0.1574, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -20.550838470458984, + "rewards/margins": 32.51291275024414, + "rewards/rejected": -53.063751220703125, + "step": 330 + }, + { + "epoch": 0.06673209028459273, + "grad_norm": 90.76559205067144, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -2.83514404296875, + "logits/rejected": -2.64210844039917, + "logps/chosen": -364.6646423339844, + "logps/rejected": -285.5331115722656, + "loss": -0.1582, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.436984062194824, + "rewards/margins": 30.28008460998535, + "rewards/rejected": -44.717071533203125, + "step": 340 + }, + { + "epoch": 0.06869479882237488, + "grad_norm": 105.41764293594291, + "learning_rate": 3.431372549019608e-07, + "logits/chosen": -2.7746009826660156, + "logits/rejected": -2.6759238243103027, + "logps/chosen": -202.76531982421875, + "logps/rejected": -191.0487060546875, + "loss": -0.1096, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -12.143823623657227, + "rewards/margins": 1.596701741218567, + "rewards/rejected": -13.74052906036377, + "step": 350 + }, + { + "epoch": 0.07065750736015702, + "grad_norm": 86.16876458236432, + "learning_rate": 3.529411764705882e-07, + "logits/chosen": -2.8188443183898926, + "logits/rejected": -2.593196392059326, + "logps/chosen": -353.32891845703125, + "logps/rejected": -291.8623352050781, + "loss": -0.2637, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -2.967794895172119, + "rewards/margins": 32.80992889404297, + "rewards/rejected": -35.77772521972656, + "step": 360 + }, + { + "epoch": 0.07262021589793916, + "grad_norm": 125.37414457272092, + "learning_rate": 3.6274509803921566e-07, + "logits/chosen": -2.741210460662842, + "logits/rejected": -2.6491539478302, + "logps/chosen": -282.52386474609375, + "logps/rejected": -295.5267333984375, + "loss": -0.1618, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -8.858935356140137, + "rewards/margins": 23.869136810302734, + "rewards/rejected": -32.72806930541992, + "step": 370 + }, + { + "epoch": 0.0745829244357213, + "grad_norm": 107.59446022811072, + "learning_rate": 3.7254901960784315e-07, + "logits/chosen": -2.559868335723877, + "logits/rejected": -2.710339069366455, + "logps/chosen": -239.18466186523438, + "logps/rejected": -320.7379150390625, + "loss": -0.2261, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": 0.9056112170219421, + "rewards/margins": 28.805679321289062, + "rewards/rejected": -27.900070190429688, + "step": 380 + }, + { + "epoch": 0.07654563297350343, + "grad_norm": 104.1567948683286, + "learning_rate": 3.8235294117647053e-07, + "logits/chosen": -2.5698726177215576, + "logits/rejected": -2.3517141342163086, + "logps/chosen": -283.9027404785156, + "logps/rejected": -334.44110107421875, + "loss": -0.2289, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -17.196475982666016, + "rewards/margins": 38.552696228027344, + "rewards/rejected": -55.749176025390625, + "step": 390 + }, + { + "epoch": 0.07850834151128558, + "grad_norm": 72.72058445266771, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": -2.6728971004486084, + "logits/rejected": -2.540029287338257, + "logps/chosen": -277.1605529785156, + "logps/rejected": -303.30194091796875, + "loss": -0.1376, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -12.780940055847168, + "rewards/margins": 14.76891803741455, + "rewards/rejected": -27.54986000061035, + "step": 400 + }, + { + "epoch": 0.08047105004906771, + "grad_norm": 80.71430874425238, + "learning_rate": 4.019607843137255e-07, + "logits/chosen": -2.6524100303649902, + "logits/rejected": -2.6495046615600586, + "logps/chosen": -322.28900146484375, + "logps/rejected": -308.7735290527344, + "loss": -0.2358, + "rewards/accuracies": 0.7333332300186157, + "rewards/chosen": -20.927623748779297, + "rewards/margins": 34.26749038696289, + "rewards/rejected": -55.19511795043945, + "step": 410 + }, + { + "epoch": 0.08243375858684986, + "grad_norm": 217.0300462879297, + "learning_rate": 4.117647058823529e-07, + "logits/chosen": -2.6198325157165527, + "logits/rejected": -2.5762195587158203, + "logps/chosen": -291.5865783691406, + "logps/rejected": -358.6512451171875, + "loss": -0.303, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.95696449279785, + "rewards/margins": 51.48836135864258, + "rewards/rejected": -82.44532775878906, + "step": 420 + }, + { + "epoch": 0.08439646712463199, + "grad_norm": 130.86544986831066, + "learning_rate": 4.215686274509804e-07, + "logits/chosen": -2.739365577697754, + "logits/rejected": -2.461707353591919, + "logps/chosen": -391.75482177734375, + "logps/rejected": -300.4351806640625, + "loss": -0.2969, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -27.522998809814453, + "rewards/margins": 32.6249885559082, + "rewards/rejected": -60.14799118041992, + "step": 430 + }, + { + "epoch": 0.08635917566241413, + "grad_norm": 171.9338418158325, + "learning_rate": 4.313725490196078e-07, + "logits/chosen": -2.3868844509124756, + "logits/rejected": -2.1997270584106445, + "logps/chosen": -308.11358642578125, + "logps/rejected": -293.33380126953125, + "loss": -0.3469, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -8.658950805664062, + "rewards/margins": 41.66270446777344, + "rewards/rejected": -50.32164764404297, + "step": 440 + }, + { + "epoch": 0.08832188420019627, + "grad_norm": 174.1066758269077, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": -2.186063289642334, + "logits/rejected": -2.3623898029327393, + "logps/chosen": -346.97161865234375, + "logps/rejected": -400.64044189453125, + "loss": -0.19, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -66.03451538085938, + "rewards/margins": 45.21254348754883, + "rewards/rejected": -111.2470703125, + "step": 450 + }, + { + "epoch": 0.09028459273797841, + "grad_norm": 85.4775852556709, + "learning_rate": 4.5098039215686274e-07, + "logits/chosen": -2.2321205139160156, + "logits/rejected": -1.9455502033233643, + "logps/chosen": -277.85137939453125, + "logps/rejected": -344.2044677734375, + "loss": -0.2591, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -28.514850616455078, + "rewards/margins": 57.593421936035156, + "rewards/rejected": -86.1082763671875, + "step": 460 + }, + { + "epoch": 0.09224730127576054, + "grad_norm": 153.81064718064303, + "learning_rate": 4.6078431372549013e-07, + "logits/chosen": -2.497727632522583, + "logits/rejected": -2.2833471298217773, + "logps/chosen": -276.8794250488281, + "logps/rejected": -389.337646484375, + "loss": -0.4183, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -13.273488998413086, + "rewards/margins": 39.83915328979492, + "rewards/rejected": -53.112640380859375, + "step": 470 + }, + { + "epoch": 0.09421000981354269, + "grad_norm": 445.8429087813216, + "learning_rate": 4.705882352941176e-07, + "logits/chosen": -2.6095690727233887, + "logits/rejected": -2.1548361778259277, + "logps/chosen": -361.65338134765625, + "logps/rejected": -426.72601318359375, + "loss": -0.3216, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -41.51061248779297, + "rewards/margins": 62.375877380371094, + "rewards/rejected": -103.8864974975586, + "step": 480 + }, + { + "epoch": 0.09617271835132483, + "grad_norm": 89.04254958231228, + "learning_rate": 4.803921568627451e-07, + "logits/chosen": -2.368504047393799, + "logits/rejected": -2.2150702476501465, + "logps/chosen": -319.50006103515625, + "logps/rejected": -393.1334533691406, + "loss": -0.319, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -24.828651428222656, + "rewards/margins": 46.397682189941406, + "rewards/rejected": -71.22633361816406, + "step": 490 + }, + { + "epoch": 0.09813542688910697, + "grad_norm": 241.76895654119772, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -2.57795786857605, + "logits/rejected": -2.4584553241729736, + "logps/chosen": -326.4345703125, + "logps/rejected": -306.2259826660156, + "loss": -0.2756, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -26.8358154296875, + "rewards/margins": 28.74239730834961, + "rewards/rejected": -55.578216552734375, + "step": 500 + }, + { + "epoch": 0.10009813542688911, + "grad_norm": 264.5165149997937, + "learning_rate": 5e-07, + "logits/chosen": -2.301535129547119, + "logits/rejected": -1.850098967552185, + "logps/chosen": -335.3379821777344, + "logps/rejected": -351.9546813964844, + "loss": -0.2603, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": -47.27622604370117, + "rewards/margins": 31.92726707458496, + "rewards/rejected": -79.20349884033203, + "step": 510 + }, + { + "epoch": 0.10206084396467124, + "grad_norm": 256.12236152073547, + "learning_rate": 4.999941314693213e-07, + "logits/chosen": -2.387305974960327, + "logits/rejected": -2.093822479248047, + "logps/chosen": -279.55377197265625, + "logps/rejected": -288.3088073730469, + "loss": -0.2526, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -29.910446166992188, + "rewards/margins": 51.61469268798828, + "rewards/rejected": -81.52513122558594, + "step": 520 + }, + { + "epoch": 0.10402355250245339, + "grad_norm": 84.11574617516239, + "learning_rate": 4.999765261528027e-07, + "logits/chosen": -2.428924560546875, + "logits/rejected": -2.2418971061706543, + "logps/chosen": -308.13690185546875, + "logps/rejected": -425.5289611816406, + "loss": -0.2481, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -18.78133773803711, + "rewards/margins": 90.52960968017578, + "rewards/rejected": -109.31095886230469, + "step": 530 + }, + { + "epoch": 0.10598626104023552, + "grad_norm": 138.96263435109205, + "learning_rate": 4.999471848769828e-07, + "logits/chosen": -2.2968649864196777, + "logits/rejected": -2.2856202125549316, + "logps/chosen": -350.42437744140625, + "logps/rejected": -404.6186828613281, + "loss": -0.2945, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -60.3192253112793, + "rewards/margins": 30.69902992248535, + "rewards/rejected": -91.01825714111328, + "step": 540 + }, + { + "epoch": 0.10794896957801767, + "grad_norm": 195.9722588481125, + "learning_rate": 4.999061090193831e-07, + "logits/chosen": -2.743603229522705, + "logits/rejected": -2.556018352508545, + "logps/chosen": -370.62225341796875, + "logps/rejected": -375.95867919921875, + "loss": -0.2633, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": -47.81446838378906, + "rewards/margins": 26.41641616821289, + "rewards/rejected": -74.23088073730469, + "step": 550 + }, + { + "epoch": 0.1099116781157998, + "grad_norm": 106.73375585216431, + "learning_rate": 4.998533005084428e-07, + "logits/chosen": -2.743039846420288, + "logits/rejected": -2.6808793544769287, + "logps/chosen": -302.07318115234375, + "logps/rejected": -326.42840576171875, + "loss": -0.1725, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -20.548267364501953, + "rewards/margins": 54.09697341918945, + "rewards/rejected": -74.64524841308594, + "step": 560 + }, + { + "epoch": 0.11187438665358194, + "grad_norm": 129.19431986296829, + "learning_rate": 4.997887618234292e-07, + "logits/chosen": -2.690355062484741, + "logits/rejected": -2.696227550506592, + "logps/chosen": -289.22882080078125, + "logps/rejected": -368.32159423828125, + "loss": -0.5496, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -7.5803728103637695, + "rewards/margins": 51.98181915283203, + "rewards/rejected": -59.56218719482422, + "step": 570 + }, + { + "epoch": 0.11383709519136408, + "grad_norm": 366.7196058070477, + "learning_rate": 4.997124959943201e-07, + "logits/chosen": -2.523827314376831, + "logits/rejected": -1.6506551504135132, + "logps/chosen": -284.63677978515625, + "logps/rejected": -301.85748291015625, + "loss": -0.0144, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -40.72862243652344, + "rewards/margins": 47.8609733581543, + "rewards/rejected": -88.58959197998047, + "step": 580 + }, + { + "epoch": 0.11579980372914622, + "grad_norm": 178.59574489981918, + "learning_rate": 4.996245066016623e-07, + "logits/chosen": -2.2600531578063965, + "logits/rejected": -1.7368602752685547, + "logps/chosen": -277.12286376953125, + "logps/rejected": -323.1368408203125, + "loss": -0.394, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.99538803100586, + "rewards/margins": 69.44538116455078, + "rewards/rejected": -91.4407730102539, + "step": 590 + }, + { + "epoch": 0.11776251226692837, + "grad_norm": 265.57856442599115, + "learning_rate": 4.995247977764035e-07, + "logits/chosen": -2.4274303913116455, + "logits/rejected": -2.0409669876098633, + "logps/chosen": -247.5201416015625, + "logps/rejected": -297.291748046875, + "loss": -0.3023, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -12.959405899047852, + "rewards/margins": 74.5136947631836, + "rewards/rejected": -87.47310638427734, + "step": 600 + }, + { + "epoch": 0.1197252208047105, + "grad_norm": 128.9123408742007, + "learning_rate": 4.994133741996982e-07, + "logits/chosen": -2.3767576217651367, + "logits/rejected": -2.219869375228882, + "logps/chosen": -315.9458312988281, + "logps/rejected": -308.843994140625, + "loss": -0.3328, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -34.130062103271484, + "rewards/margins": 35.57612228393555, + "rewards/rejected": -69.70618438720703, + "step": 610 + }, + { + "epoch": 0.12168792934249265, + "grad_norm": 276.5185615656701, + "learning_rate": 4.992902411026877e-07, + "logits/chosen": -2.181988477706909, + "logits/rejected": -1.965958595275879, + "logps/chosen": -312.49957275390625, + "logps/rejected": -435.63128662109375, + "loss": -0.3255, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -46.6572380065918, + "rewards/margins": 46.17048645019531, + "rewards/rejected": -92.82772064208984, + "step": 620 + }, + { + "epoch": 0.12365063788027478, + "grad_norm": 287.629427646687, + "learning_rate": 4.991554042662548e-07, + "logits/chosen": -2.2768828868865967, + "logits/rejected": -2.15800404548645, + "logps/chosen": -305.6785888671875, + "logps/rejected": -325.3777770996094, + "loss": -0.1643, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -56.725929260253906, + "rewards/margins": 28.1070499420166, + "rewards/rejected": -84.83297729492188, + "step": 630 + }, + { + "epoch": 0.1256133464180569, + "grad_norm": 93.51813893030197, + "learning_rate": 4.990088700207525e-07, + "logits/chosen": -2.442497491836548, + "logits/rejected": -2.4978153705596924, + "logps/chosen": -260.5741882324219, + "logps/rejected": -327.4527893066406, + "loss": -0.3092, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -50.749755859375, + "rewards/margins": 36.45991134643555, + "rewards/rejected": -87.20967102050781, + "step": 640 + }, + { + "epoch": 0.12757605495583907, + "grad_norm": 291.7759176839187, + "learning_rate": 4.988506452457066e-07, + "logits/chosen": -2.610625743865967, + "logits/rejected": -2.3595685958862305, + "logps/chosen": -312.53509521484375, + "logps/rejected": -378.19525146484375, + "loss": -0.3342, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.899950981140137, + "rewards/margins": 66.04251861572266, + "rewards/rejected": -81.94246673583984, + "step": 650 + }, + { + "epoch": 0.1295387634936212, + "grad_norm": 144.9833191500008, + "learning_rate": 4.986807373694925e-07, + "logits/chosen": -2.5556507110595703, + "logits/rejected": -2.404313087463379, + "logps/chosen": -280.6217346191406, + "logps/rejected": -315.28131103515625, + "loss": -0.2526, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -16.632827758789062, + "rewards/margins": 30.876800537109375, + "rewards/rejected": -47.50963592529297, + "step": 660 + }, + { + "epoch": 0.13150147203140333, + "grad_norm": 125.84079358691508, + "learning_rate": 4.984991543689869e-07, + "logits/chosen": -2.311565637588501, + "logits/rejected": -2.0786004066467285, + "logps/chosen": -326.7909851074219, + "logps/rejected": -359.30078125, + "loss": -0.2437, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -43.393157958984375, + "rewards/margins": 35.29859924316406, + "rewards/rejected": -78.69175720214844, + "step": 670 + }, + { + "epoch": 0.13346418056918546, + "grad_norm": 191.02038814443432, + "learning_rate": 4.983059047691931e-07, + "logits/chosen": -2.4630486965179443, + "logits/rejected": -2.2847580909729004, + "logps/chosen": -284.15216064453125, + "logps/rejected": -270.96478271484375, + "loss": -0.3101, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -25.678781509399414, + "rewards/margins": 31.067684173583984, + "rewards/rejected": -56.74646759033203, + "step": 680 + }, + { + "epoch": 0.13542688910696762, + "grad_norm": 284.2254776791221, + "learning_rate": 4.981009976428408e-07, + "logits/chosen": -2.2247262001037598, + "logits/rejected": -1.9255733489990234, + "logps/chosen": -341.8725280761719, + "logps/rejected": -359.5609436035156, + "loss": -0.264, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -30.570653915405273, + "rewards/margins": 62.6126594543457, + "rewards/rejected": -93.18330383300781, + "step": 690 + }, + { + "epoch": 0.13738959764474976, + "grad_norm": 86.57370030845577, + "learning_rate": 4.9788444260996e-07, + "logits/chosen": -2.268033504486084, + "logits/rejected": -2.1153342723846436, + "logps/chosen": -277.9055480957031, + "logps/rejected": -311.05645751953125, + "loss": -0.3819, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -13.68879222869873, + "rewards/margins": 39.499717712402344, + "rewards/rejected": -53.188514709472656, + "step": 700 + }, + { + "epoch": 0.1393523061825319, + "grad_norm": 338.05295305391684, + "learning_rate": 4.976562498374295e-07, + "logits/chosen": -2.362950325012207, + "logits/rejected": -1.6634302139282227, + "logps/chosen": -310.96356201171875, + "logps/rejected": -385.9927673339844, + "loss": -0.2462, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -21.215097427368164, + "rewards/margins": 105.8913345336914, + "rewards/rejected": -127.10643005371094, + "step": 710 + }, + { + "epoch": 0.14131501472031405, + "grad_norm": 94.651325061684, + "learning_rate": 4.974164300384997e-07, + "logits/chosen": -2.023195743560791, + "logits/rejected": -2.026101589202881, + "logps/chosen": -250.32742309570312, + "logps/rejected": -377.23101806640625, + "loss": -0.4751, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -19.123937606811523, + "rewards/margins": 64.68328094482422, + "rewards/rejected": -83.80722045898438, + "step": 720 + }, + { + "epoch": 0.14327772325809618, + "grad_norm": 188.47080569588115, + "learning_rate": 4.971649944722893e-07, + "logits/chosen": -1.4261937141418457, + "logits/rejected": -1.3943569660186768, + "logps/chosen": -304.92669677734375, + "logps/rejected": -393.43450927734375, + "loss": -0.3547, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -52.48748016357422, + "rewards/margins": 57.24298858642578, + "rewards/rejected": -109.73048400878906, + "step": 730 + }, + { + "epoch": 0.1452404317958783, + "grad_norm": 119.65538293437197, + "learning_rate": 4.96901954943257e-07, + "logits/chosen": -1.623649001121521, + "logits/rejected": -0.6015797853469849, + "logps/chosen": -316.39544677734375, + "logps/rejected": -308.2103271484375, + "loss": -0.3321, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -50.778038024902344, + "rewards/margins": 90.8272705078125, + "rewards/rejected": -141.60531616210938, + "step": 740 + }, + { + "epoch": 0.14720314033366044, + "grad_norm": 86.48810999349877, + "learning_rate": 4.96627323800647e-07, + "logits/chosen": -2.2582385540008545, + "logits/rejected": -1.9543392658233643, + "logps/chosen": -261.2178039550781, + "logps/rejected": -318.0521240234375, + "loss": -0.4172, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -23.52887725830078, + "rewards/margins": 50.16834259033203, + "rewards/rejected": -73.69721221923828, + "step": 750 + }, + { + "epoch": 0.1491658488714426, + "grad_norm": 113.97308557258582, + "learning_rate": 4.963411139379099e-07, + "logits/chosen": -2.4743828773498535, + "logits/rejected": -1.6739752292633057, + "logps/chosen": -298.59271240234375, + "logps/rejected": -342.79595947265625, + "loss": -0.4882, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -7.983345031738281, + "rewards/margins": 74.26821899414062, + "rewards/rejected": -82.25157165527344, + "step": 760 + }, + { + "epoch": 0.15112855740922473, + "grad_norm": 392.30494311916834, + "learning_rate": 4.960433387920964e-07, + "logits/chosen": -1.2702064514160156, + "logits/rejected": -1.4976896047592163, + "logps/chosen": -225.3599853515625, + "logps/rejected": -356.6004333496094, + "loss": -0.4134, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": -55.614784240722656, + "rewards/margins": 8.209315299987793, + "rewards/rejected": -63.82408905029297, + "step": 770 + }, + { + "epoch": 0.15309126594700687, + "grad_norm": 127.10227967472944, + "learning_rate": 4.957340123432271e-07, + "logits/chosen": -1.3343271017074585, + "logits/rejected": -0.10657083988189697, + "logps/chosen": -352.56072998046875, + "logps/rejected": -341.1171875, + "loss": -0.3394, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -19.992746353149414, + "rewards/margins": 89.02325439453125, + "rewards/rejected": -109.01600646972656, + "step": 780 + }, + { + "epoch": 0.155053974484789, + "grad_norm": 420.58536554983976, + "learning_rate": 4.954131491136361e-07, + "logits/chosen": -1.0421171188354492, + "logits/rejected": -0.5474944710731506, + "logps/chosen": -353.1336364746094, + "logps/rejected": -364.11773681640625, + "loss": -0.269, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.73956298828125, + "rewards/margins": 47.741920471191406, + "rewards/rejected": -75.48148345947266, + "step": 790 + }, + { + "epoch": 0.15701668302257116, + "grad_norm": 195.75815539361142, + "learning_rate": 4.95080764167289e-07, + "logits/chosen": -1.1912492513656616, + "logits/rejected": -0.9618164300918579, + "logps/chosen": -253.3438262939453, + "logps/rejected": -374.35675048828125, + "loss": -0.597, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -3.0270016193389893, + "rewards/margins": 97.99317932128906, + "rewards/rejected": -101.02018737792969, + "step": 800 + }, + { + "epoch": 0.1589793915603533, + "grad_norm": 139.25684009769012, + "learning_rate": 4.94736873109076e-07, + "logits/chosen": -1.2744401693344116, + "logits/rejected": -0.7750533223152161, + "logps/chosen": -281.9678039550781, + "logps/rejected": -328.2683410644531, + "loss": -0.3557, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -29.657827377319336, + "rewards/margins": 66.78199768066406, + "rewards/rejected": -96.4398193359375, + "step": 810 + }, + { + "epoch": 0.16094210009813542, + "grad_norm": 178.30783895621693, + "learning_rate": 4.943814920840787e-07, + "logits/chosen": -0.9328937530517578, + "logits/rejected": -0.6445346474647522, + "logps/chosen": -305.2715148925781, + "logps/rejected": -317.13470458984375, + "loss": -0.2182, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": -27.077402114868164, + "rewards/margins": 24.533750534057617, + "rewards/rejected": -51.61115646362305, + "step": 820 + }, + { + "epoch": 0.16290480863591755, + "grad_norm": 171.86147694269948, + "learning_rate": 4.940146377768126e-07, + "logits/chosen": -1.30418860912323, + "logits/rejected": -0.7124021053314209, + "logps/chosen": -290.5502014160156, + "logps/rejected": -321.49639892578125, + "loss": -0.4875, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -3.149672031402588, + "rewards/margins": 88.61860656738281, + "rewards/rejected": -91.76827239990234, + "step": 830 + }, + { + "epoch": 0.1648675171736997, + "grad_norm": 274.5543658349456, + "learning_rate": 4.936363274104441e-07, + "logits/chosen": -1.2539094686508179, + "logits/rejected": -0.7504829168319702, + "logps/chosen": -345.98040771484375, + "logps/rejected": -350.09796142578125, + "loss": -0.3229, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -59.561668395996094, + "rewards/margins": 63.72674560546875, + "rewards/rejected": -123.2884292602539, + "step": 840 + }, + { + "epoch": 0.16683022571148184, + "grad_norm": 190.34962104283832, + "learning_rate": 4.932465787459808e-07, + "logits/chosen": -0.9200321435928345, + "logits/rejected": -0.39313608407974243, + "logps/chosen": -297.28704833984375, + "logps/rejected": -332.4362487792969, + "loss": -0.4113, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -31.210453033447266, + "rewards/margins": 48.063751220703125, + "rewards/rejected": -79.27420806884766, + "step": 850 + }, + { + "epoch": 0.16879293424926398, + "grad_norm": 192.06158458050416, + "learning_rate": 4.92845410081439e-07, + "logits/chosen": 0.14147694408893585, + "logits/rejected": 0.6419375538825989, + "logps/chosen": -296.87158203125, + "logps/rejected": -445.063232421875, + "loss": -0.2896, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -37.98882293701172, + "rewards/margins": 123.41569519042969, + "rewards/rejected": -161.404541015625, + "step": 860 + }, + { + "epoch": 0.17075564278704614, + "grad_norm": 122.74008106568301, + "learning_rate": 4.924328402509833e-07, + "logits/chosen": 0.125508114695549, + "logits/rejected": 0.6455780267715454, + "logps/chosen": -335.5021667480469, + "logps/rejected": -378.15625, + "loss": -0.4273, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -61.71384811401367, + "rewards/margins": 79.3321533203125, + "rewards/rejected": -141.04600524902344, + "step": 870 + }, + { + "epoch": 0.17271835132482827, + "grad_norm": 685.9402691010895, + "learning_rate": 4.920088886240434e-07, + "logits/chosen": 0.07589732110500336, + "logits/rejected": 1.1146031618118286, + "logps/chosen": -292.1110534667969, + "logps/rejected": -382.05706787109375, + "loss": -0.5777, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -7.556271553039551, + "rewards/margins": 114.8202133178711, + "rewards/rejected": -122.3764877319336, + "step": 880 + }, + { + "epoch": 0.1746810598626104, + "grad_norm": 271.1024175100246, + "learning_rate": 4.915735751044045e-07, + "logits/chosen": -0.5838578939437866, + "logits/rejected": 0.1533234566450119, + "logps/chosen": -347.8789367675781, + "logps/rejected": -408.5445251464844, + "loss": -0.3337, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -59.50645065307617, + "rewards/margins": 102.5400161743164, + "rewards/rejected": -162.0464630126953, + "step": 890 + }, + { + "epoch": 0.17664376840039253, + "grad_norm": 235.51558694045585, + "learning_rate": 4.911269201292724e-07, + "logits/chosen": -0.6416040658950806, + "logits/rejected": -0.19059182703495026, + "logps/chosen": -354.4915466308594, + "logps/rejected": -370.1239013671875, + "loss": -0.1454, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -60.1268310546875, + "rewards/margins": 51.82355499267578, + "rewards/rejected": -111.95039367675781, + "step": 900 + }, + { + "epoch": 0.1786064769381747, + "grad_norm": 145.00758891751065, + "learning_rate": 4.906689446683146e-07, + "logits/chosen": -1.3398195505142212, + "logits/rejected": -1.218482255935669, + "logps/chosen": -255.09414672851562, + "logps/rejected": -401.0498046875, + "loss": -0.3631, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -17.96017074584961, + "rewards/margins": 65.85710144042969, + "rewards/rejected": -83.81727600097656, + "step": 910 + }, + { + "epoch": 0.18056918547595682, + "grad_norm": 167.58121540373645, + "learning_rate": 4.901996702226755e-07, + "logits/chosen": -1.3835618495941162, + "logits/rejected": -1.1535968780517578, + "logps/chosen": -317.39752197265625, + "logps/rejected": -416.4386291503906, + "loss": -0.5316, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -16.77207374572754, + "rewards/margins": 61.89057159423828, + "rewards/rejected": -78.66264343261719, + "step": 920 + }, + { + "epoch": 0.18253189401373895, + "grad_norm": 231.2481969777746, + "learning_rate": 4.897191188239667e-07, + "logits/chosen": -1.480469822883606, + "logits/rejected": -0.03306392580270767, + "logps/chosen": -375.44720458984375, + "logps/rejected": -326.720458984375, + "loss": -0.3226, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -51.41065216064453, + "rewards/margins": 64.13352966308594, + "rewards/rejected": -115.54417419433594, + "step": 930 + }, + { + "epoch": 0.1844946025515211, + "grad_norm": 95.89451364951198, + "learning_rate": 4.892273130332334e-07, + "logits/chosen": -1.559515357017517, + "logits/rejected": -1.0601645708084106, + "logps/chosen": -366.4732360839844, + "logps/rejected": -480.1495666503906, + "loss": -0.3171, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -38.44028854370117, + "rewards/margins": 82.14427185058594, + "rewards/rejected": -120.58455657958984, + "step": 940 + }, + { + "epoch": 0.18645731108930325, + "grad_norm": 165.28594620525925, + "learning_rate": 4.887242759398945e-07, + "logits/chosen": -1.6824363470077515, + "logits/rejected": -0.6552220582962036, + "logps/chosen": -223.1175537109375, + "logps/rejected": -305.62994384765625, + "loss": -0.5034, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.53265380859375, + "rewards/margins": 86.1128158569336, + "rewards/rejected": -111.64546966552734, + "step": 950 + }, + { + "epoch": 0.18842001962708538, + "grad_norm": 352.8798167060443, + "learning_rate": 4.88210031160659e-07, + "logits/chosen": -1.1741770505905151, + "logits/rejected": -0.8634139895439148, + "logps/chosen": -291.512939453125, + "logps/rejected": -352.5907897949219, + "loss": -0.4225, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -24.746952056884766, + "rewards/margins": 75.15422821044922, + "rewards/rejected": -99.90117645263672, + "step": 960 + }, + { + "epoch": 0.1903827281648675, + "grad_norm": 560.8621196580634, + "learning_rate": 4.876846028384169e-07, + "logits/chosen": -1.093379259109497, + "logits/rejected": -0.6906472444534302, + "logps/chosen": -271.27606201171875, + "logps/rejected": -361.41082763671875, + "loss": -0.3826, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -67.39205169677734, + "rewards/margins": 58.437278747558594, + "rewards/rejected": -125.82933044433594, + "step": 970 + }, + { + "epoch": 0.19234543670264967, + "grad_norm": 159.55606280882097, + "learning_rate": 4.87148015641106e-07, + "logits/chosen": -1.4224119186401367, + "logits/rejected": -1.2690508365631104, + "logps/chosen": -292.9847717285156, + "logps/rejected": -399.9471740722656, + "loss": -0.427, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -41.93357849121094, + "rewards/margins": 83.13137817382812, + "rewards/rejected": -125.06495666503906, + "step": 980 + }, + { + "epoch": 0.1943081452404318, + "grad_norm": 641.5121807143399, + "learning_rate": 4.866002947605539e-07, + "logits/chosen": -1.591728925704956, + "logits/rejected": -0.5198885202407837, + "logps/chosen": -258.06512451171875, + "logps/rejected": -357.8591613769531, + "loss": -0.4375, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -21.172761917114258, + "rewards/margins": 94.05410766601562, + "rewards/rejected": -115.22686767578125, + "step": 990 + }, + { + "epoch": 0.19627085377821393, + "grad_norm": 294.8038593268707, + "learning_rate": 4.860414659112948e-07, + "logits/chosen": -1.466378927230835, + "logits/rejected": -0.14748302102088928, + "logps/chosen": -287.17327880859375, + "logps/rejected": -319.5938415527344, + "loss": -0.2781, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -49.524131774902344, + "rewards/margins": 77.31150817871094, + "rewards/rejected": -126.83563232421875, + "step": 1000 + }, + { + "epoch": 0.19823356231599606, + "grad_norm": 156.36694012458494, + "learning_rate": 4.854715553293627e-07, + "logits/chosen": -2.3596818447113037, + "logits/rejected": -1.8426835536956787, + "logps/chosen": -309.9227600097656, + "logps/rejected": -283.349609375, + "loss": -0.3961, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 4.278321266174316, + "rewards/margins": 85.08187103271484, + "rewards/rejected": -80.80355834960938, + "step": 1010 + }, + { + "epoch": 0.20019627085377822, + "grad_norm": 381.72564192876956, + "learning_rate": 4.848905897710595e-07, + "logits/chosen": -1.2412532567977905, + "logits/rejected": -0.6532396078109741, + "logps/chosen": -364.1349182128906, + "logps/rejected": -307.4398498535156, + "loss": -0.3027, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -53.071319580078125, + "rewards/margins": 37.50835418701172, + "rewards/rejected": -90.57967376708984, + "step": 1020 + }, + { + "epoch": 0.20215897939156036, + "grad_norm": 78.98141096690985, + "learning_rate": 4.842985965116987e-07, + "logits/chosen": -1.6987800598144531, + "logits/rejected": -1.1645739078521729, + "logps/chosen": -369.7084655761719, + "logps/rejected": -363.26177978515625, + "loss": -0.2136, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -28.673961639404297, + "rewards/margins": 52.8286018371582, + "rewards/rejected": -81.50257110595703, + "step": 1030 + }, + { + "epoch": 0.2041216879293425, + "grad_norm": 346.925274350411, + "learning_rate": 4.836956033443253e-07, + "logits/chosen": -1.106367826461792, + "logits/rejected": -0.31603357195854187, + "logps/chosen": -356.8329162597656, + "logps/rejected": -444.78179931640625, + "loss": -0.3064, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": 10.349096298217773, + "rewards/margins": 73.9643783569336, + "rewards/rejected": -63.61528396606445, + "step": 1040 + }, + { + "epoch": 0.20608439646712462, + "grad_norm": 438.4198715642591, + "learning_rate": 4.830816385784104e-07, + "logits/chosen": -0.49576035141944885, + "logits/rejected": 0.03637387603521347, + "logps/chosen": -348.09161376953125, + "logps/rejected": -368.6332092285156, + "loss": -0.3907, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -40.26350021362305, + "rewards/margins": 49.70841598510742, + "rewards/rejected": -89.97191619873047, + "step": 1050 + }, + { + "epoch": 0.20804710500490678, + "grad_norm": 79.5420107330979, + "learning_rate": 4.824567310385226e-07, + "logits/chosen": -1.0224738121032715, + "logits/rejected": -0.25595730543136597, + "logps/chosen": -327.381103515625, + "logps/rejected": -342.5819396972656, + "loss": -0.5246, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -10.011220932006836, + "rewards/margins": 70.51642608642578, + "rewards/rejected": -80.52764892578125, + "step": 1060 + }, + { + "epoch": 0.2100098135426889, + "grad_norm": 253.531339773475, + "learning_rate": 4.818209100629744e-07, + "logits/chosen": -0.9710432291030884, + "logits/rejected": -0.8155088424682617, + "logps/chosen": -245.9547119140625, + "logps/rejected": -328.13055419921875, + "loss": -0.5165, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 2.715376377105713, + "rewards/margins": 59.916221618652344, + "rewards/rejected": -57.20084762573242, + "step": 1070 + }, + { + "epoch": 0.21197252208047104, + "grad_norm": 146.43974810880522, + "learning_rate": 4.81174205502445e-07, + "logits/chosen": -0.2731146216392517, + "logits/rejected": -0.33066827058792114, + "logps/chosen": -261.10858154296875, + "logps/rejected": -319.43499755859375, + "loss": -0.4185, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -39.90928268432617, + "rewards/margins": 67.99629974365234, + "rewards/rejected": -107.90559387207031, + "step": 1080 + }, + { + "epoch": 0.2139352306182532, + "grad_norm": 165.87682456896025, + "learning_rate": 4.80516647718579e-07, + "logits/chosen": 0.37342050671577454, + "logits/rejected": 0.8275140523910522, + "logps/chosen": -284.4266357421875, + "logps/rejected": -429.19451904296875, + "loss": -0.4425, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -44.65375518798828, + "rewards/margins": 130.34715270996094, + "rewards/rejected": -175.00088500976562, + "step": 1090 + }, + { + "epoch": 0.21589793915603533, + "grad_norm": 237.45769793544008, + "learning_rate": 4.798482675825602e-07, + "logits/chosen": -0.37835073471069336, + "logits/rejected": -0.46622657775878906, + "logps/chosen": -219.25900268554688, + "logps/rejected": -347.93817138671875, + "loss": -0.2896, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -11.062115669250488, + "rewards/margins": 88.63566589355469, + "rewards/rejected": -99.69776916503906, + "step": 1100 + }, + { + "epoch": 0.21786064769381747, + "grad_norm": 91.18145998015851, + "learning_rate": 4.791690964736636e-07, + "logits/chosen": -0.28714293241500854, + "logits/rejected": 0.12103681266307831, + "logps/chosen": -320.0545959472656, + "logps/rejected": -316.3763122558594, + "loss": -0.2683, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -8.284218788146973, + "rewards/margins": 93.08128356933594, + "rewards/rejected": -101.3655014038086, + "step": 1110 + }, + { + "epoch": 0.2198233562315996, + "grad_norm": 121.42603440193854, + "learning_rate": 4.78479166277781e-07, + "logits/chosen": -0.0407433919608593, + "logits/rejected": 0.8031107187271118, + "logps/chosen": -345.73785400390625, + "logps/rejected": -389.3685302734375, + "loss": -0.6513, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.530865669250488, + "rewards/margins": 89.83076477050781, + "rewards/rejected": -104.36163330078125, + "step": 1120 + }, + { + "epoch": 0.22178606476938176, + "grad_norm": 178.85598494929042, + "learning_rate": 4.777785093859247e-07, + "logits/chosen": -0.24294237792491913, + "logits/rejected": 1.3924249410629272, + "logps/chosen": -319.64617919921875, + "logps/rejected": -478.07318115234375, + "loss": -0.4545, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -34.97998046875, + "rewards/margins": 150.94078063964844, + "rewards/rejected": -185.92076110839844, + "step": 1130 + }, + { + "epoch": 0.2237487733071639, + "grad_norm": 203.87624153803125, + "learning_rate": 4.770671586927063e-07, + "logits/chosen": -1.1225866079330444, + "logits/rejected": -0.2693057954311371, + "logps/chosen": -378.0223693847656, + "logps/rejected": -405.68939208984375, + "loss": -0.4597, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -21.15289878845215, + "rewards/margins": 55.96300506591797, + "rewards/rejected": -77.11591339111328, + "step": 1140 + }, + { + "epoch": 0.22571148184494602, + "grad_norm": 151.87784988687667, + "learning_rate": 4.7634514759479275e-07, + "logits/chosen": -0.641355574131012, + "logits/rejected": 0.8348206281661987, + "logps/chosen": -320.2981262207031, + "logps/rejected": -383.2963562011719, + "loss": -0.5063, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -34.637332916259766, + "rewards/margins": 128.01498413085938, + "rewards/rejected": -162.65231323242188, + "step": 1150 + }, + { + "epoch": 0.22767419038272815, + "grad_norm": 163.72370258460307, + "learning_rate": 4.7561250998933835e-07, + "logits/chosen": -0.26535776257514954, + "logits/rejected": 0.9920859336853027, + "logps/chosen": -341.9827575683594, + "logps/rejected": -310.3675231933594, + "loss": -0.5729, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": 2.338268756866455, + "rewards/margins": 98.30659484863281, + "rewards/rejected": -95.96833038330078, + "step": 1160 + }, + { + "epoch": 0.2296368989205103, + "grad_norm": 238.124688408163, + "learning_rate": 4.7486928027239304e-07, + "logits/chosen": 0.680788516998291, + "logits/rejected": 1.1130828857421875, + "logps/chosen": -212.67501831054688, + "logps/rejected": -307.6126403808594, + "loss": -0.2993, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.026186466217041, + "rewards/margins": 88.6893539428711, + "rewards/rejected": -95.71553039550781, + "step": 1170 + }, + { + "epoch": 0.23159960745829244, + "grad_norm": 805.0980357395986, + "learning_rate": 4.7411549333728807e-07, + "logits/chosen": 0.15952681005001068, + "logits/rejected": -0.0020960806868970394, + "logps/chosen": -371.7433166503906, + "logps/rejected": -403.44757080078125, + "loss": -0.2357, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": -93.2693862915039, + "rewards/margins": 25.92207908630371, + "rewards/rejected": -119.19146728515625, + "step": 1180 + }, + { + "epoch": 0.23356231599607458, + "grad_norm": 258.9523686769121, + "learning_rate": 4.7335118457299756e-07, + "logits/chosen": 0.36494073271751404, + "logits/rejected": 0.283443421125412, + "logps/chosen": -318.1351318359375, + "logps/rejected": -366.8081359863281, + "loss": -0.3282, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -30.163848876953125, + "rewards/margins": 75.2159423828125, + "rewards/rejected": -105.3797836303711, + "step": 1190 + }, + { + "epoch": 0.23552502453385674, + "grad_norm": 133.20915104852426, + "learning_rate": 4.7257638986247684e-07, + "logits/chosen": -0.5547437071800232, + "logits/rejected": -0.1863251030445099, + "logps/chosen": -341.30316162109375, + "logps/rejected": -451.8502502441406, + "loss": -0.3821, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -44.10496520996094, + "rewards/margins": 62.69295120239258, + "rewards/rejected": -106.79791259765625, + "step": 1200 + }, + { + "epoch": 0.23748773307163887, + "grad_norm": 106.00598993940797, + "learning_rate": 4.7179114558097814e-07, + "logits/chosen": -0.6369959115982056, + "logits/rejected": 0.20513415336608887, + "logps/chosen": -265.24688720703125, + "logps/rejected": -304.51287841796875, + "loss": -0.5352, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.943883419036865, + "rewards/margins": 93.66683959960938, + "rewards/rejected": -99.6107177734375, + "step": 1210 + }, + { + "epoch": 0.239450441609421, + "grad_norm": 267.91703907799683, + "learning_rate": 4.709954885943428e-07, + "logits/chosen": -0.5717731714248657, + "logits/rejected": -0.352356493473053, + "logps/chosen": -311.2847900390625, + "logps/rejected": -283.5633544921875, + "loss": -0.166, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": -38.53837203979492, + "rewards/margins": 27.605804443359375, + "rewards/rejected": -66.14418029785156, + "step": 1220 + }, + { + "epoch": 0.24141315014720313, + "grad_norm": 134.52001393314518, + "learning_rate": 4.7018945625727026e-07, + "logits/chosen": -1.4019057750701904, + "logits/rejected": -0.9080744981765747, + "logps/chosen": -313.23150634765625, + "logps/rejected": -345.84368896484375, + "loss": -0.3121, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -35.53964614868164, + "rewards/margins": 41.04530334472656, + "rewards/rejected": -76.58494567871094, + "step": 1230 + }, + { + "epoch": 0.2433758586849853, + "grad_norm": 114.79403816792266, + "learning_rate": 4.6937308641156447e-07, + "logits/chosen": -0.7664046883583069, + "logits/rejected": -0.3152869939804077, + "logps/chosen": -207.9265594482422, + "logps/rejected": -280.22601318359375, + "loss": -0.3627, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": 2.171916961669922, + "rewards/margins": 70.08987426757812, + "rewards/rejected": -67.9179458618164, + "step": 1240 + }, + { + "epoch": 0.24533856722276742, + "grad_norm": 650.163880963966, + "learning_rate": 4.685464173843574e-07, + "logits/chosen": -0.9291526079177856, + "logits/rejected": -0.560367226600647, + "logps/chosen": -236.35678100585938, + "logps/rejected": -341.97314453125, + "loss": -0.4531, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -2.365212917327881, + "rewards/margins": 84.2957763671875, + "rewards/rejected": -86.6609878540039, + "step": 1250 + }, + { + "epoch": 0.24730127576054955, + "grad_norm": 141.00731130720393, + "learning_rate": 4.677094879863093e-07, + "logits/chosen": -0.3296302556991577, + "logits/rejected": 0.6696518063545227, + "logps/chosen": -274.39739990234375, + "logps/rejected": -319.5539245605469, + "loss": -0.2632, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -25.98756980895996, + "rewards/margins": 76.83675384521484, + "rewards/rejected": -102.82432556152344, + "step": 1260 + }, + { + "epoch": 0.2492639842983317, + "grad_norm": 257.82707549351966, + "learning_rate": 4.66862337509787e-07, + "logits/chosen": -0.4417852461338043, + "logits/rejected": 0.5099334716796875, + "logps/chosen": -332.7542724609375, + "logps/rejected": -348.40850830078125, + "loss": -0.2051, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -11.930026054382324, + "rewards/margins": 116.28517150878906, + "rewards/rejected": -128.21517944335938, + "step": 1270 + }, + { + "epoch": 0.2512266928361138, + "grad_norm": 187.1587822045199, + "learning_rate": 4.660050057270191e-07, + "logits/chosen": -0.007944846525788307, + "logits/rejected": 1.442101240158081, + "logps/chosen": -262.4355163574219, + "logps/rejected": -349.79510498046875, + "loss": -0.4732, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.347912788391113, + "rewards/margins": 129.41015625, + "rewards/rejected": -135.758056640625, + "step": 1280 + }, + { + "epoch": 0.25318940137389595, + "grad_norm": 666.1245726262127, + "learning_rate": 4.6513753288822833e-07, + "logits/chosen": 0.024620437994599342, + "logits/rejected": 0.3650778830051422, + "logps/chosen": -207.1399383544922, + "logps/rejected": -286.62640380859375, + "loss": -0.337, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -47.83836364746094, + "rewards/margins": 66.05262756347656, + "rewards/rejected": -113.8909912109375, + "step": 1290 + }, + { + "epoch": 0.25515210991167814, + "grad_norm": 99.35924285009612, + "learning_rate": 4.6425995971974265e-07, + "logits/chosen": 0.296586811542511, + "logits/rejected": 0.6352599263191223, + "logps/chosen": -343.1158447265625, + "logps/rejected": -315.1025390625, + "loss": 0.0791, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -59.8315315246582, + "rewards/margins": 38.25086212158203, + "rewards/rejected": -98.08238220214844, + "step": 1300 + }, + { + "epoch": 0.25711481844946027, + "grad_norm": 92.30107684645584, + "learning_rate": 4.633723274220824e-07, + "logits/chosen": -0.5543586611747742, + "logits/rejected": -0.5389699935913086, + "logps/chosen": -332.3842468261719, + "logps/rejected": -396.8943176269531, + "loss": -0.329, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -35.449684143066406, + "rewards/margins": 37.95604705810547, + "rewards/rejected": -73.40573120117188, + "step": 1310 + }, + { + "epoch": 0.2590775269872424, + "grad_norm": 361.873429330949, + "learning_rate": 4.624746776680267e-07, + "logits/chosen": -0.004599392414093018, + "logits/rejected": 0.009301548823714256, + "logps/chosen": -268.728515625, + "logps/rejected": -327.2678527832031, + "loss": -0.3963, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -20.148090362548828, + "rewards/margins": 60.392845153808594, + "rewards/rejected": -80.54093933105469, + "step": 1320 + }, + { + "epoch": 0.26104023552502453, + "grad_norm": 109.14778662061352, + "learning_rate": 4.6156705260065634e-07, + "logits/chosen": -0.5415644645690918, + "logits/rejected": -0.6133627891540527, + "logps/chosen": -220.20889282226562, + "logps/rejected": -311.1971130371094, + "loss": -0.3628, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -11.035032272338867, + "rewards/margins": 75.02830505371094, + "rewards/rejected": -86.0633316040039, + "step": 1330 + }, + { + "epoch": 0.26300294406280667, + "grad_norm": 97.3276437552577, + "learning_rate": 4.606494948313758e-07, + "logits/chosen": -0.067128024995327, + "logits/rejected": 0.13759474456310272, + "logps/chosen": -287.49169921875, + "logps/rejected": -361.89947509765625, + "loss": -0.417, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -45.38007736206055, + "rewards/margins": 60.861473083496094, + "rewards/rejected": -106.24156188964844, + "step": 1340 + }, + { + "epoch": 0.2649656526005888, + "grad_norm": 214.23452407810288, + "learning_rate": 4.597220474379125e-07, + "logits/chosen": -0.6385024785995483, + "logits/rejected": -0.8114584684371948, + "logps/chosen": -374.2613830566406, + "logps/rejected": -419.7872619628906, + "loss": -0.1842, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -46.98633575439453, + "rewards/margins": 49.645164489746094, + "rewards/rejected": -96.63150024414062, + "step": 1350 + }, + { + "epoch": 0.26692836113837093, + "grad_norm": 161.01599172509583, + "learning_rate": 4.587847539622942e-07, + "logits/chosen": -0.8062373995780945, + "logits/rejected": -0.5399857759475708, + "logps/chosen": -404.7890625, + "logps/rejected": -394.0738220214844, + "loss": -0.5806, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -26.630985260009766, + "rewards/margins": 44.17183303833008, + "rewards/rejected": -70.80281829833984, + "step": 1360 + }, + { + "epoch": 0.2688910696761531, + "grad_norm": 311.7019525663922, + "learning_rate": 4.5783765840880505e-07, + "logits/chosen": -0.8065996170043945, + "logits/rejected": -0.20719997584819794, + "logps/chosen": -352.0925598144531, + "logps/rejected": -424.54632568359375, + "loss": -0.3118, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -24.066116333007812, + "rewards/margins": 93.7226333618164, + "rewards/rejected": -117.78874206542969, + "step": 1370 + }, + { + "epoch": 0.27085377821393525, + "grad_norm": 242.40925811797663, + "learning_rate": 4.568808052419196e-07, + "logits/chosen": -0.4659969210624695, + "logits/rejected": -0.025981564074754715, + "logps/chosen": -250.2906036376953, + "logps/rejected": -320.0798645019531, + "loss": -0.5674, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.68593406677246, + "rewards/margins": 76.80311584472656, + "rewards/rejected": -102.48905181884766, + "step": 1380 + }, + { + "epoch": 0.2728164867517174, + "grad_norm": 150.67743335056548, + "learning_rate": 4.5591423938421513e-07, + "logits/chosen": -0.09468124061822891, + "logits/rejected": 0.4379270672798157, + "logps/chosen": -341.0621643066406, + "logps/rejected": -393.36834716796875, + "loss": -0.2812, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -57.19630813598633, + "rewards/margins": 77.68519592285156, + "rewards/rejected": -134.88150024414062, + "step": 1390 + }, + { + "epoch": 0.2747791952894995, + "grad_norm": 189.2180154396537, + "learning_rate": 4.549380062142627e-07, + "logits/chosen": -0.7744480967521667, + "logits/rejected": -0.47629514336586, + "logps/chosen": -297.24383544921875, + "logps/rejected": -367.6190490722656, + "loss": -0.2708, + "rewards/accuracies": 0.6333333849906921, + "rewards/chosen": -36.73484420776367, + "rewards/margins": 37.12657928466797, + "rewards/rejected": -73.8614273071289, + "step": 1400 + }, + { + "epoch": 0.27674190382728164, + "grad_norm": 398.1433179588958, + "learning_rate": 4.5395215156449683e-07, + "logits/chosen": -0.35732418298721313, + "logits/rejected": -0.7713747024536133, + "logps/chosen": -320.3706970214844, + "logps/rejected": -427.70147705078125, + "loss": -0.5049, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -17.494029998779297, + "rewards/margins": 61.1119384765625, + "rewards/rejected": -78.60597229003906, + "step": 1410 + }, + { + "epoch": 0.2787046123650638, + "grad_norm": 176.93165495809393, + "learning_rate": 4.5295672171906365e-07, + "logits/chosen": -0.8126991391181946, + "logits/rejected": -0.05773216485977173, + "logps/chosen": -278.0550231933594, + "logps/rejected": -299.2767639160156, + "loss": -0.4139, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -12.590118408203125, + "rewards/margins": 58.349082946777344, + "rewards/rejected": -70.93920135498047, + "step": 1420 + }, + { + "epoch": 0.2806673209028459, + "grad_norm": 143.0438988447445, + "learning_rate": 4.5195176341164765e-07, + "logits/chosen": -0.9170870780944824, + "logits/rejected": -0.9549194574356079, + "logps/chosen": -302.0723876953125, + "logps/rejected": -421.36395263671875, + "loss": -0.2934, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -33.58391571044922, + "rewards/margins": 92.94100952148438, + "rewards/rejected": -126.5249252319336, + "step": 1430 + }, + { + "epoch": 0.2826300294406281, + "grad_norm": 173.87703703799082, + "learning_rate": 4.509373238232782e-07, + "logits/chosen": -0.795932412147522, + "logits/rejected": -0.2218722403049469, + "logps/chosen": -323.84259033203125, + "logps/rejected": -302.75732421875, + "loss": -0.4701, + "rewards/accuracies": 0.533333420753479, + "rewards/chosen": -25.0303955078125, + "rewards/margins": 43.92810821533203, + "rewards/rejected": -68.95849609375, + "step": 1440 + }, + { + "epoch": 0.2845927379784102, + "grad_norm": 121.8127323301411, + "learning_rate": 4.499134505801141e-07, + "logits/chosen": -0.306710422039032, + "logits/rejected": -0.17278121411800385, + "logps/chosen": -237.3304443359375, + "logps/rejected": -352.43231201171875, + "loss": -0.4569, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -6.329686164855957, + "rewards/margins": 93.90690612792969, + "rewards/rejected": -100.23661041259766, + "step": 1450 + }, + { + "epoch": 0.28655544651619236, + "grad_norm": 149.5512301219652, + "learning_rate": 4.488801917512076e-07, + "logits/chosen": -0.38846054673194885, + "logits/rejected": -0.6771696209907532, + "logps/chosen": -319.62164306640625, + "logps/rejected": -426.58294677734375, + "loss": -0.2105, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -30.19916343688965, + "rewards/margins": 59.85419845581055, + "rewards/rejected": -90.0533676147461, + "step": 1460 + }, + { + "epoch": 0.2885181550539745, + "grad_norm": 254.60225906008407, + "learning_rate": 4.478375958462479e-07, + "logits/chosen": -0.31459730863571167, + "logits/rejected": 0.6876497268676758, + "logps/chosen": -352.12396240234375, + "logps/rejected": -366.0809631347656, + "loss": -0.2637, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -53.21332931518555, + "rewards/margins": 87.39404296875, + "rewards/rejected": -140.6073760986328, + "step": 1470 + }, + { + "epoch": 0.2904808635917566, + "grad_norm": 220.4961511024023, + "learning_rate": 4.467857118132833e-07, + "logits/chosen": -0.6099443435668945, + "logits/rejected": -0.48984870314598083, + "logps/chosen": -280.97027587890625, + "logps/rejected": -304.026611328125, + "loss": -0.237, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": -34.057838439941406, + "rewards/margins": 25.877832412719727, + "rewards/rejected": -59.93566131591797, + "step": 1480 + }, + { + "epoch": 0.29244357212953875, + "grad_norm": 271.888383810607, + "learning_rate": 4.457245890364235e-07, + "logits/chosen": -0.9527867436408997, + "logits/rejected": 0.04681504890322685, + "logps/chosen": -348.16107177734375, + "logps/rejected": -335.77044677734375, + "loss": -0.3423, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": 1.4055097103118896, + "rewards/margins": 73.13142395019531, + "rewards/rejected": -71.72590637207031, + "step": 1490 + }, + { + "epoch": 0.2944062806673209, + "grad_norm": 630.4514646091122, + "learning_rate": 4.4465427733352124e-07, + "logits/chosen": -0.7063072919845581, + "logits/rejected": -0.4565967917442322, + "logps/chosen": -295.3616943359375, + "logps/rejected": -383.42950439453125, + "loss": -0.4628, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -12.71705436706543, + "rewards/margins": 98.17766571044922, + "rewards/rejected": -110.89472961425781, + "step": 1500 + }, + { + "epoch": 0.296368989205103, + "grad_norm": 190.15994178997343, + "learning_rate": 4.43574826953833e-07, + "logits/chosen": -0.5849586725234985, + "logits/rejected": 0.1167064756155014, + "logps/chosen": -318.72027587890625, + "logps/rejected": -442.87738037109375, + "loss": -0.3713, + "rewards/accuracies": 0.8333331942558289, + "rewards/chosen": -7.379508018493652, + "rewards/margins": 109.5814437866211, + "rewards/rejected": -116.96095275878906, + "step": 1510 + }, + { + "epoch": 0.2983316977428852, + "grad_norm": 129.60364723768552, + "learning_rate": 4.4248628857565997e-07, + "logits/chosen": -0.01909918151795864, + "logits/rejected": 0.701531171798706, + "logps/chosen": -355.89434814453125, + "logps/rejected": -316.97027587890625, + "loss": -0.461, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -19.56877899169922, + "rewards/margins": 84.98934173583984, + "rewards/rejected": -104.55812072753906, + "step": 1520 + }, + { + "epoch": 0.30029440628066734, + "grad_norm": 273.44359518179436, + "learning_rate": 4.413887133039692e-07, + "logits/chosen": -0.8855515718460083, + "logits/rejected": 0.502372682094574, + "logps/chosen": -429.783203125, + "logps/rejected": -412.05401611328125, + "loss": -0.4716, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.15018081665039, + "rewards/margins": 117.2563247680664, + "rewards/rejected": -137.40650939941406, + "step": 1530 + }, + { + "epoch": 0.30225711481844947, + "grad_norm": 131.2149314160654, + "learning_rate": 4.4028215266799395e-07, + "logits/chosen": -0.3892694115638733, + "logits/rejected": 0.39369386434555054, + "logps/chosen": -282.8835754394531, + "logps/rejected": -336.5223693847656, + "loss": -0.4009, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -9.758750915527344, + "rewards/margins": 105.216064453125, + "rewards/rejected": -114.9748306274414, + "step": 1540 + }, + { + "epoch": 0.3042198233562316, + "grad_norm": 577.338101015651, + "learning_rate": 4.391666586188145e-07, + "logits/chosen": 0.2062421292066574, + "logits/rejected": 0.7659724950790405, + "logps/chosen": -243.0502166748047, + "logps/rejected": -323.6511535644531, + "loss": -0.4572, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.352008819580078, + "rewards/margins": 67.0520248413086, + "rewards/rejected": -97.4040298461914, + "step": 1550 + }, + { + "epoch": 0.30618253189401373, + "grad_norm": 146.4717861452119, + "learning_rate": 4.380422835269193e-07, + "logits/chosen": -0.4887501299381256, + "logits/rejected": 0.06172027066349983, + "logps/chosen": -322.8172607421875, + "logps/rejected": -423.1553649902344, + "loss": -0.367, + "rewards/accuracies": 0.7333332300186157, + "rewards/chosen": -68.84202575683594, + "rewards/margins": 88.07933044433594, + "rewards/rejected": -156.92135620117188, + "step": 1560 + }, + { + "epoch": 0.30814524043179586, + "grad_norm": 138.86670589752217, + "learning_rate": 4.3690908017974596e-07, + "logits/chosen": -0.2648061215877533, + "logits/rejected": -0.24680499732494354, + "logps/chosen": -245.3034210205078, + "logps/rejected": -377.53033447265625, + "loss": -0.3897, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.56615447998047, + "rewards/margins": 84.67030334472656, + "rewards/rejected": -104.2364501953125, + "step": 1570 + }, + { + "epoch": 0.310107948969578, + "grad_norm": 361.99759490114053, + "learning_rate": 4.3576710177920356e-07, + "logits/chosen": -0.7863548398017883, + "logits/rejected": -0.24762578308582306, + "logps/chosen": -265.63458251953125, + "logps/rejected": -330.4261779785156, + "loss": -0.4518, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -29.226421356201172, + "rewards/margins": 70.5469970703125, + "rewards/rejected": -99.77342224121094, + "step": 1580 + }, + { + "epoch": 0.3120706575073602, + "grad_norm": 110.26161190271637, + "learning_rate": 4.346164019391742e-07, + "logits/chosen": -0.8181453943252563, + "logits/rejected": -0.41510826349258423, + "logps/chosen": -379.28326416015625, + "logps/rejected": -444.27020263671875, + "loss": -0.4288, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -17.949371337890625, + "rewards/margins": 93.24482727050781, + "rewards/rejected": -111.1942138671875, + "step": 1590 + }, + { + "epoch": 0.3140333660451423, + "grad_norm": 418.14545445046167, + "learning_rate": 4.3345703468299634e-07, + "logits/chosen": -0.04152932018041611, + "logits/rejected": -0.17715571820735931, + "logps/chosen": -317.2388610839844, + "logps/rejected": -370.34503173828125, + "loss": -0.077, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -39.51045608520508, + "rewards/margins": 59.95743942260742, + "rewards/rejected": -99.4678955078125, + "step": 1600 + }, + { + "epoch": 0.31599607458292445, + "grad_norm": 341.5697755314112, + "learning_rate": 4.322890544409286e-07, + "logits/chosen": -0.933850109577179, + "logits/rejected": 0.7957839369773865, + "logps/chosen": -325.163818359375, + "logps/rejected": -417.4605407714844, + "loss": -0.8099, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.714933395385742, + "rewards/margins": 148.0409698486328, + "rewards/rejected": -158.7559051513672, + "step": 1610 + }, + { + "epoch": 0.3179587831207066, + "grad_norm": 1172.1117342934133, + "learning_rate": 4.311125160475938e-07, + "logits/chosen": 0.015446802601218224, + "logits/rejected": 0.40264415740966797, + "logps/chosen": -313.5318603515625, + "logps/rejected": -489.9974670410156, + "loss": -0.5845, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -41.430572509765625, + "rewards/margins": 96.05662536621094, + "rewards/rejected": -137.48721313476562, + "step": 1620 + }, + { + "epoch": 0.3199214916584887, + "grad_norm": 292.0814140958923, + "learning_rate": 4.299274747394055e-07, + "logits/chosen": -0.01684349775314331, + "logits/rejected": 0.5265442728996277, + "logps/chosen": -288.3865966796875, + "logps/rejected": -414.86944580078125, + "loss": -0.6777, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -22.02135467529297, + "rewards/margins": 150.4295654296875, + "rewards/rejected": -172.45091247558594, + "step": 1630 + }, + { + "epoch": 0.32188420019627084, + "grad_norm": 303.0218494139286, + "learning_rate": 4.287339861519737e-07, + "logits/chosen": -0.4546899199485779, + "logits/rejected": 0.34309619665145874, + "logps/chosen": -338.341064453125, + "logps/rejected": -413.78448486328125, + "loss": -0.3411, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -38.64856719970703, + "rewards/margins": 92.2686996459961, + "rewards/rejected": -130.91726684570312, + "step": 1640 + }, + { + "epoch": 0.323846908734053, + "grad_norm": 406.3278604007249, + "learning_rate": 4.275321063174936e-07, + "logits/chosen": -1.1363162994384766, + "logits/rejected": -0.6217517852783203, + "logps/chosen": -352.1303405761719, + "logps/rejected": -346.18896484375, + "loss": -0.4748, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -15.482063293457031, + "rewards/margins": 75.24620056152344, + "rewards/rejected": -90.728271484375, + "step": 1650 + }, + { + "epoch": 0.3258096172718351, + "grad_norm": 266.73343262711734, + "learning_rate": 4.2632189166211454e-07, + "logits/chosen": -0.22011294960975647, + "logits/rejected": -0.6874805688858032, + "logps/chosen": -276.23443603515625, + "logps/rejected": -361.9880065917969, + "loss": -0.514, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -38.76226806640625, + "rewards/margins": 59.25056076049805, + "rewards/rejected": -98.01282501220703, + "step": 1660 + }, + { + "epoch": 0.3277723258096173, + "grad_norm": 186.8053048885168, + "learning_rate": 4.251033990032912e-07, + "logits/chosen": -0.20579977333545685, + "logits/rejected": 0.3252968490123749, + "logps/chosen": -328.60992431640625, + "logps/rejected": -461.1065979003906, + "loss": -0.4945, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -31.5130558013916, + "rewards/margins": 125.23563385009766, + "rewards/rejected": -156.74867248535156, + "step": 1670 + }, + { + "epoch": 0.3297350343473994, + "grad_norm": 146.48868065371127, + "learning_rate": 4.238766855471161e-07, + "logits/chosen": -0.5992350578308105, + "logits/rejected": 0.19298234581947327, + "logps/chosen": -372.07769775390625, + "logps/rejected": -344.19244384765625, + "loss": -0.5042, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -25.195465087890625, + "rewards/margins": 83.94126892089844, + "rewards/rejected": -109.13673400878906, + "step": 1680 + }, + { + "epoch": 0.33169774288518156, + "grad_norm": 174.31540598051274, + "learning_rate": 4.226418088856335e-07, + "logits/chosen": -0.520244836807251, + "logits/rejected": -0.006069634575396776, + "logps/chosen": -268.96282958984375, + "logps/rejected": -441.2184143066406, + "loss": -0.4037, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -7.5589494705200195, + "rewards/margins": 101.68757629394531, + "rewards/rejected": -109.24654388427734, + "step": 1690 + }, + { + "epoch": 0.3336604514229637, + "grad_norm": 125.96035649492566, + "learning_rate": 4.2139882699413613e-07, + "logits/chosen": -0.971671462059021, + "logits/rejected": 0.798663318157196, + "logps/chosen": -299.2193603515625, + "logps/rejected": -302.34381103515625, + "loss": -0.5299, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -27.212772369384766, + "rewards/margins": 84.49937438964844, + "rewards/rejected": -111.712158203125, + "step": 1700 + }, + { + "epoch": 0.3356231599607458, + "grad_norm": 309.9836127821377, + "learning_rate": 4.2014779822844274e-07, + "logits/chosen": -0.3972320556640625, + "logits/rejected": 0.03052547574043274, + "logps/chosen": -267.49505615234375, + "logps/rejected": -406.4791259765625, + "loss": -0.4363, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -37.45825958251953, + "rewards/margins": 112.9147720336914, + "rewards/rejected": -150.373046875, + "step": 1710 + }, + { + "epoch": 0.33758586849852795, + "grad_norm": 246.7349155956216, + "learning_rate": 4.18888781322159e-07, + "logits/chosen": -0.15653717517852783, + "logits/rejected": 0.8107942342758179, + "logps/chosen": -289.45263671875, + "logps/rejected": -403.25396728515625, + "loss": -0.6618, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -42.41587829589844, + "rewards/margins": 103.66729736328125, + "rewards/rejected": -146.08316040039062, + "step": 1720 + }, + { + "epoch": 0.3395485770363101, + "grad_norm": 144.78021131836027, + "learning_rate": 4.176218353839195e-07, + "logits/chosen": -0.9188981056213379, + "logits/rejected": -0.6861734986305237, + "logps/chosen": -301.84503173828125, + "logps/rejected": -298.54827880859375, + "loss": -0.2465, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -39.52213668823242, + "rewards/margins": 48.964603424072266, + "rewards/rejected": -88.48674011230469, + "step": 1730 + }, + { + "epoch": 0.34151128557409227, + "grad_norm": 1346.2376860191207, + "learning_rate": 4.1634701989461325e-07, + "logits/chosen": -0.4513590931892395, + "logits/rejected": -0.4555346965789795, + "logps/chosen": -302.19073486328125, + "logps/rejected": -408.97137451171875, + "loss": -0.5585, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -33.58020782470703, + "rewards/margins": 97.23760986328125, + "rewards/rejected": -130.81781005859375, + "step": 1740 + }, + { + "epoch": 0.3434739941118744, + "grad_norm": 150.10453473273904, + "learning_rate": 4.1506439470459056e-07, + "logits/chosen": 0.05149317905306816, + "logits/rejected": 0.23694534599781036, + "logps/chosen": -286.8460388183594, + "logps/rejected": -367.44732666015625, + "loss": -0.4108, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -52.61028289794922, + "rewards/margins": 93.46243286132812, + "rewards/rejected": -146.07272338867188, + "step": 1750 + }, + { + "epoch": 0.34543670264965654, + "grad_norm": 113.95944393481646, + "learning_rate": 4.137740200308537e-07, + "logits/chosen": -0.5160384178161621, + "logits/rejected": -0.018883686512708664, + "logps/chosen": -301.40863037109375, + "logps/rejected": -341.28314208984375, + "loss": -0.3418, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -11.017160415649414, + "rewards/margins": 62.98115158081055, + "rewards/rejected": -73.99830627441406, + "step": 1760 + }, + { + "epoch": 0.34739941118743867, + "grad_norm": 285.0767935629754, + "learning_rate": 4.124759564542295e-07, + "logits/chosen": -0.6595760583877563, + "logits/rejected": -0.058022283017635345, + "logps/chosen": -338.2293701171875, + "logps/rejected": -303.3883056640625, + "loss": -0.3546, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -42.089073181152344, + "rewards/margins": 48.59239959716797, + "rewards/rejected": -90.68148040771484, + "step": 1770 + }, + { + "epoch": 0.3493621197252208, + "grad_norm": 101.03236183328043, + "learning_rate": 4.111702649165255e-07, + "logits/chosen": -0.9161526560783386, + "logits/rejected": 0.10425040870904922, + "logps/chosen": -283.76446533203125, + "logps/rejected": -302.06591796875, + "loss": -0.2768, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -32.11731719970703, + "rewards/margins": 54.53614044189453, + "rewards/rejected": -86.65345764160156, + "step": 1780 + }, + { + "epoch": 0.35132482826300293, + "grad_norm": 263.8868367231896, + "learning_rate": 4.0985700671766834e-07, + "logits/chosen": -0.9468552470207214, + "logits/rejected": 0.9110676646232605, + "logps/chosen": -364.1922302246094, + "logps/rejected": -412.3247985839844, + "loss": -0.605, + "rewards/accuracies": 0.8333331942558289, + "rewards/chosen": -9.130239486694336, + "rewards/margins": 121.54182434082031, + "rewards/rejected": -130.6720733642578, + "step": 1790 + }, + { + "epoch": 0.35328753680078506, + "grad_norm": 130.85053749132823, + "learning_rate": 4.085362435128262e-07, + "logits/chosen": -0.405953586101532, + "logits/rejected": 1.244800329208374, + "logps/chosen": -306.9328308105469, + "logps/rejected": -437.663818359375, + "loss": -0.6196, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -19.635013580322266, + "rewards/margins": 167.39654541015625, + "rewards/rejected": -187.0315704345703, + "step": 1800 + }, + { + "epoch": 0.35525024533856725, + "grad_norm": 223.96094505752646, + "learning_rate": 4.0720803730951423e-07, + "logits/chosen": -1.1470909118652344, + "logits/rejected": 0.7912918925285339, + "logps/chosen": -343.0729675292969, + "logps/rejected": -324.56890869140625, + "loss": -0.5042, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -32.91019058227539, + "rewards/margins": 98.570068359375, + "rewards/rejected": -131.48025512695312, + "step": 1810 + }, + { + "epoch": 0.3572129538763494, + "grad_norm": 162.33950498911926, + "learning_rate": 4.058724504646834e-07, + "logits/chosen": -0.558147668838501, + "logits/rejected": 0.019284352660179138, + "logps/chosen": -248.22433471679688, + "logps/rejected": -336.770751953125, + "loss": -0.5787, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -19.4455623626709, + "rewards/margins": 90.05865478515625, + "rewards/rejected": -109.50422668457031, + "step": 1820 + }, + { + "epoch": 0.3591756624141315, + "grad_norm": 278.1738166038469, + "learning_rate": 4.045295456817924e-07, + "logits/chosen": -0.25461921095848083, + "logits/rejected": 0.1365271359682083, + "logps/chosen": -326.7425231933594, + "logps/rejected": -372.71990966796875, + "loss": -0.4594, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -49.863792419433594, + "rewards/margins": 45.01863479614258, + "rewards/rejected": -94.88243103027344, + "step": 1830 + }, + { + "epoch": 0.36113837095191365, + "grad_norm": 108.83183167859887, + "learning_rate": 4.0317938600786484e-07, + "logits/chosen": -0.7870214581489563, + "logits/rejected": -0.34110045433044434, + "logps/chosen": -341.3463134765625, + "logps/rejected": -372.3836364746094, + "loss": -0.3748, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -46.37664794921875, + "rewards/margins": 50.43975830078125, + "rewards/rejected": -96.81640625, + "step": 1840 + }, + { + "epoch": 0.3631010794896958, + "grad_norm": 206.2312367532649, + "learning_rate": 4.0182203483052825e-07, + "logits/chosen": -0.487540066242218, + "logits/rejected": 0.014774179086089134, + "logps/chosen": -356.26446533203125, + "logps/rejected": -345.57916259765625, + "loss": -0.5075, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -75.3117904663086, + "rewards/margins": 64.49422454833984, + "rewards/rejected": -139.80601501464844, + "step": 1850 + }, + { + "epoch": 0.3650637880274779, + "grad_norm": 183.16497932938577, + "learning_rate": 4.004575558750389e-07, + "logits/chosen": -1.6905851364135742, + "logits/rejected": -0.9367521405220032, + "logps/chosen": -396.78436279296875, + "logps/rejected": -417.2666015625, + "loss": -0.4071, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -46.17219543457031, + "rewards/margins": 69.63795471191406, + "rewards/rejected": -115.81014251708984, + "step": 1860 + }, + { + "epoch": 0.36702649656526004, + "grad_norm": 160.43816794717725, + "learning_rate": 3.9908601320128976e-07, + "logits/chosen": -0.1876428872346878, + "logits/rejected": -0.25627821683883667, + "logps/chosen": -285.857177734375, + "logps/rejected": -381.7496337890625, + "loss": -0.2739, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -48.417816162109375, + "rewards/margins": 91.4716567993164, + "rewards/rejected": -139.88946533203125, + "step": 1870 + }, + { + "epoch": 0.3689892051030422, + "grad_norm": 86.10127515629074, + "learning_rate": 3.9770747120080284e-07, + "logits/chosen": -1.1608283519744873, + "logits/rejected": -0.7629313468933105, + "logps/chosen": -252.29898071289062, + "logps/rejected": -324.89410400390625, + "loss": -0.4254, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -26.761028289794922, + "rewards/margins": 88.71576690673828, + "rewards/rejected": -115.4767837524414, + "step": 1880 + }, + { + "epoch": 0.37095191364082436, + "grad_norm": 128.40419421883988, + "learning_rate": 3.963219945937063e-07, + "logits/chosen": -1.359933614730835, + "logits/rejected": -1.0561379194259644, + "logps/chosen": -242.55032348632812, + "logps/rejected": -280.73297119140625, + "loss": -0.3566, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -9.378472328186035, + "rewards/margins": 45.709903717041016, + "rewards/rejected": -55.0883674621582, + "step": 1890 + }, + { + "epoch": 0.3729146221786065, + "grad_norm": 263.02338259002346, + "learning_rate": 3.949296484256959e-07, + "logits/chosen": -1.4414275884628296, + "logits/rejected": -0.8082484006881714, + "logps/chosen": -282.81085205078125, + "logps/rejected": -333.346435546875, + "loss": -0.4909, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -27.606435775756836, + "rewards/margins": 62.13264846801758, + "rewards/rejected": -89.73908996582031, + "step": 1900 + }, + { + "epoch": 0.3748773307163886, + "grad_norm": 172.0701384014494, + "learning_rate": 3.935304980649813e-07, + "logits/chosen": -1.2286367416381836, + "logits/rejected": 0.15929050743579865, + "logps/chosen": -325.7023620605469, + "logps/rejected": -418.928466796875, + "loss": -0.3225, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -25.623632431030273, + "rewards/margins": 114.02363586425781, + "rewards/rejected": -139.64724731445312, + "step": 1910 + }, + { + "epoch": 0.37684003925417076, + "grad_norm": 157.22760578014413, + "learning_rate": 3.92124609199217e-07, + "logits/chosen": -1.2718513011932373, + "logits/rejected": -0.8121629953384399, + "logps/chosen": -223.34976196289062, + "logps/rejected": -311.38446044921875, + "loss": -0.6499, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -19.075206756591797, + "rewards/margins": 75.05207824707031, + "rewards/rejected": -94.1272964477539, + "step": 1920 + }, + { + "epoch": 0.3788027477919529, + "grad_norm": 116.06395870716385, + "learning_rate": 3.907120478324185e-07, + "logits/chosen": -1.3779296875, + "logits/rejected": -0.646757960319519, + "logps/chosen": -333.4664611816406, + "logps/rejected": -378.027099609375, + "loss": -0.5177, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -43.617271423339844, + "rewards/margins": 63.46489334106445, + "rewards/rejected": -107.08216857910156, + "step": 1930 + }, + { + "epoch": 0.380765456329735, + "grad_norm": 164.47337878624106, + "learning_rate": 3.8929288028186364e-07, + "logits/chosen": -1.277091383934021, + "logits/rejected": 0.1221558004617691, + "logps/chosen": -237.57803344726562, + "logps/rejected": -303.90631103515625, + "loss": -0.6118, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -18.589313507080078, + "rewards/margins": 88.36052703857422, + "rewards/rejected": -106.9498291015625, + "step": 1940 + }, + { + "epoch": 0.38272816486751715, + "grad_norm": 255.51215973354533, + "learning_rate": 3.8786717317497875e-07, + "logits/chosen": -0.8515610694885254, + "logits/rejected": -0.052010197192430496, + "logps/chosen": -350.1195373535156, + "logps/rejected": -392.09185791015625, + "loss": -0.5892, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -36.454444885253906, + "rewards/margins": 85.26441955566406, + "rewards/rejected": -121.71885681152344, + "step": 1950 + }, + { + "epoch": 0.38469087340529934, + "grad_norm": 264.2380100054303, + "learning_rate": 3.864349934462111e-07, + "logits/chosen": -1.441253662109375, + "logits/rejected": -0.5022369623184204, + "logps/chosen": -285.413818359375, + "logps/rejected": -365.9031677246094, + "loss": -0.6713, + "rewards/accuracies": 0.7333332300186157, + "rewards/chosen": -22.97023582458496, + "rewards/margins": 97.99842834472656, + "rewards/rejected": -120.96867370605469, + "step": 1960 + }, + { + "epoch": 0.38665358194308147, + "grad_norm": 406.9420345826437, + "learning_rate": 3.84996408333886e-07, + "logits/chosen": -1.703598976135254, + "logits/rejected": -0.07386224716901779, + "logps/chosen": -329.97857666015625, + "logps/rejected": -355.3179016113281, + "loss": -0.6019, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.744609832763672, + "rewards/margins": 110.19657135009766, + "rewards/rejected": -133.941162109375, + "step": 1970 + }, + { + "epoch": 0.3886162904808636, + "grad_norm": 213.0231162676536, + "learning_rate": 3.8355148537705047e-07, + "logits/chosen": -1.3146978616714478, + "logits/rejected": -0.5318170785903931, + "logps/chosen": -246.01046752929688, + "logps/rejected": -320.2939147949219, + "loss": -0.5452, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -15.361353874206543, + "rewards/margins": 100.29632568359375, + "rewards/rejected": -115.6576919555664, + "step": 1980 + }, + { + "epoch": 0.39057899901864573, + "grad_norm": 363.43679686200045, + "learning_rate": 3.8210029241230204e-07, + "logits/chosen": -1.2556743621826172, + "logits/rejected": -0.2555163502693176, + "logps/chosen": -381.35833740234375, + "logps/rejected": -415.9195251464844, + "loss": -0.4048, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": -33.422019958496094, + "rewards/margins": 98.73605346679688, + "rewards/rejected": -132.15806579589844, + "step": 1990 + }, + { + "epoch": 0.39254170755642787, + "grad_norm": 496.3924980435426, + "learning_rate": 3.806428975706042e-07, + "logits/chosen": -0.2570883631706238, + "logits/rejected": 0.7770005464553833, + "logps/chosen": -236.70166015625, + "logps/rejected": -321.8598937988281, + "loss": -0.5504, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2519516944885254, + "rewards/margins": 89.99324798583984, + "rewards/rejected": -92.24520111083984, + "step": 2000 + }, + { + "epoch": 0.39450441609421, + "grad_norm": 166.07660009760488, + "learning_rate": 3.791793692740876e-07, + "logits/chosen": -0.29146766662597656, + "logits/rejected": 1.1259669065475464, + "logps/chosen": -241.04556274414062, + "logps/rejected": -278.5406188964844, + "loss": -0.4738, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -8.584646224975586, + "rewards/margins": 101.43263244628906, + "rewards/rejected": -110.01727294921875, + "step": 2010 + }, + { + "epoch": 0.39646712463199213, + "grad_norm": 151.7333757685711, + "learning_rate": 3.777097762328381e-07, + "logits/chosen": -0.20288319885730743, + "logits/rejected": 1.1260716915130615, + "logps/chosen": -303.77459716796875, + "logps/rejected": -375.0235595703125, + "loss": -0.4607, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -7.3858842849731445, + "rewards/margins": 106.99356842041016, + "rewards/rejected": -114.37945556640625, + "step": 2020 + }, + { + "epoch": 0.39842983316977426, + "grad_norm": 202.86625448009454, + "learning_rate": 3.762341874416702e-07, + "logits/chosen": -0.1994774490594864, + "logits/rejected": 1.9289014339447021, + "logps/chosen": -249.23831176757812, + "logps/rejected": -315.8790283203125, + "loss": -0.6322, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -21.778350830078125, + "rewards/margins": 130.3926544189453, + "rewards/rejected": -152.1710205078125, + "step": 2030 + }, + { + "epoch": 0.40039254170755645, + "grad_norm": 119.7268601709802, + "learning_rate": 3.7475267217688896e-07, + "logits/chosen": -0.04818441718816757, + "logits/rejected": -0.041399337351322174, + "logps/chosen": -212.044921875, + "logps/rejected": -358.59783935546875, + "loss": -0.5411, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -11.626317024230957, + "rewards/margins": 94.14241790771484, + "rewards/rejected": -105.76873779296875, + "step": 2040 + }, + { + "epoch": 0.4023552502453386, + "grad_norm": 292.7655773162498, + "learning_rate": 3.7326529999303633e-07, + "logits/chosen": 0.3846183717250824, + "logits/rejected": 0.9397007822990417, + "logps/chosen": -231.75955200195312, + "logps/rejected": -397.31243896484375, + "loss": -0.421, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -16.065950393676758, + "rewards/margins": 123.8441162109375, + "rewards/rejected": -139.91006469726562, + "step": 2050 + }, + { + "epoch": 0.4043179587831207, + "grad_norm": 1041.89610513327, + "learning_rate": 3.7177214071962684e-07, + "logits/chosen": 1.1130757331848145, + "logits/rejected": 1.1956154108047485, + "logps/chosen": -282.42291259765625, + "logps/rejected": -441.60626220703125, + "loss": -0.2097, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -30.17438316345215, + "rewards/margins": 96.41376495361328, + "rewards/rejected": -126.58815002441406, + "step": 2060 + }, + { + "epoch": 0.40628066732090284, + "grad_norm": 399.61740628981335, + "learning_rate": 3.7027326445786835e-07, + "logits/chosen": 0.5155268907546997, + "logits/rejected": 1.5318434238433838, + "logps/chosen": -312.35400390625, + "logps/rejected": -379.3057861328125, + "loss": -0.3747, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -55.69513702392578, + "rewards/margins": 85.74750518798828, + "rewards/rejected": -141.44264221191406, + "step": 2070 + }, + { + "epoch": 0.408243375858685, + "grad_norm": 357.5283076916106, + "learning_rate": 3.6876874157737167e-07, + "logits/chosen": 0.27408498525619507, + "logits/rejected": 0.6839532256126404, + "logps/chosen": -341.238037109375, + "logps/rejected": -381.3678283691406, + "loss": -0.3677, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -80.13575744628906, + "rewards/margins": 16.930858612060547, + "rewards/rejected": -97.06661224365234, + "step": 2080 + }, + { + "epoch": 0.4102060843964671, + "grad_norm": 445.980529199806, + "learning_rate": 3.67258642712846e-07, + "logits/chosen": -0.5547662973403931, + "logits/rejected": 0.8245648145675659, + "logps/chosen": -298.9428405761719, + "logps/rejected": -269.8735046386719, + "loss": -0.5025, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -45.47167205810547, + "rewards/margins": 26.456161499023438, + "rewards/rejected": -71.92784118652344, + "step": 2090 + }, + { + "epoch": 0.41216879293424924, + "grad_norm": 481.9606172116737, + "learning_rate": 3.6574303876078366e-07, + "logits/chosen": -0.28301194310188293, + "logits/rejected": 0.8084600567817688, + "logps/chosen": -312.2306823730469, + "logps/rejected": -391.3448486328125, + "loss": -0.6373, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -29.412633895874023, + "rewards/margins": 88.322998046875, + "rewards/rejected": -117.73564147949219, + "step": 2100 + }, + { + "epoch": 0.4141315014720314, + "grad_norm": 649.9205592074017, + "learning_rate": 3.642220008761309e-07, + "logits/chosen": -0.38110026717185974, + "logits/rejected": 1.2338255643844604, + "logps/chosen": -338.86151123046875, + "logps/rejected": -433.78594970703125, + "loss": -0.6796, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -11.297021865844727, + "rewards/margins": 135.54347229003906, + "rewards/rejected": -146.84048461914062, + "step": 2110 + }, + { + "epoch": 0.41609421000981356, + "grad_norm": 1304.34383157871, + "learning_rate": 3.626956004689476e-07, + "logits/chosen": -0.011320591904222965, + "logits/rejected": 1.5619409084320068, + "logps/chosen": -403.05279541015625, + "logps/rejected": -323.76910400390625, + "loss": -0.3211, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -41.494529724121094, + "rewards/margins": 67.07801818847656, + "rewards/rejected": -108.57255554199219, + "step": 2120 + }, + { + "epoch": 0.4180569185475957, + "grad_norm": 398.74765502857923, + "learning_rate": 3.6116390920105474e-07, + "logits/chosen": -0.46276599168777466, + "logits/rejected": 0.021661901846528053, + "logps/chosen": -336.35540771484375, + "logps/rejected": -342.64715576171875, + "loss": -0.3147, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -69.02780151367188, + "rewards/margins": 38.62982940673828, + "rewards/rejected": -107.65763092041016, + "step": 2130 + }, + { + "epoch": 0.4200196270853778, + "grad_norm": 297.17454237795334, + "learning_rate": 3.5962699898266983e-07, + "logits/chosen": -1.0453838109970093, + "logits/rejected": -1.0867758989334106, + "logps/chosen": -272.9595947265625, + "logps/rejected": -308.0858459472656, + "loss": -0.3608, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -28.943002700805664, + "rewards/margins": 62.91179656982422, + "rewards/rejected": -91.85479736328125, + "step": 2140 + }, + { + "epoch": 0.42198233562315995, + "grad_norm": 209.24545493520753, + "learning_rate": 3.5808494196903117e-07, + "logits/chosen": -0.6677152514457703, + "logits/rejected": -0.7276838421821594, + "logps/chosen": -350.9630432128906, + "logps/rejected": -321.83221435546875, + "loss": -0.4785, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -24.715778350830078, + "rewards/margins": 82.55493927001953, + "rewards/rejected": -107.27071380615234, + "step": 2150 + }, + { + "epoch": 0.4239450441609421, + "grad_norm": 219.7334740840983, + "learning_rate": 3.565378105570097e-07, + "logits/chosen": -0.871170699596405, + "logits/rejected": -0.5837348699569702, + "logps/chosen": -325.93682861328125, + "logps/rejected": -333.42138671875, + "loss": -0.4111, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -41.86164093017578, + "rewards/margins": 84.16949462890625, + "rewards/rejected": -126.03114318847656, + "step": 2160 + }, + { + "epoch": 0.4259077526987242, + "grad_norm": 115.94671281018573, + "learning_rate": 3.549856773817107e-07, + "logits/chosen": -0.40060538053512573, + "logits/rejected": -0.1994091272354126, + "logps/chosen": -260.1810607910156, + "logps/rejected": -327.2917785644531, + "loss": -0.5332, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -21.62350845336914, + "rewards/margins": 89.61854553222656, + "rewards/rejected": -111.2420425415039, + "step": 2170 + }, + { + "epoch": 0.4278704612365064, + "grad_norm": 139.96643791845855, + "learning_rate": 3.5342861531306344e-07, + "logits/chosen": -0.9581565856933594, + "logits/rejected": -0.5796898007392883, + "logps/chosen": -255.9517822265625, + "logps/rejected": -313.5719909667969, + "loss": -0.4206, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7591590881347656, + "rewards/margins": 95.80101776123047, + "rewards/rejected": -97.5601806640625, + "step": 2180 + }, + { + "epoch": 0.42983316977428854, + "grad_norm": 265.53739183803077, + "learning_rate": 3.518666974524002e-07, + "logits/chosen": -0.8339200019836426, + "logits/rejected": -0.09470844268798828, + "logps/chosen": -339.45281982421875, + "logps/rejected": -399.4952392578125, + "loss": -0.4712, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -23.41973876953125, + "rewards/margins": 102.7240982055664, + "rewards/rejected": -126.1438217163086, + "step": 2190 + }, + { + "epoch": 0.43179587831207067, + "grad_norm": 146.12788059648364, + "learning_rate": 3.5029999712902387e-07, + "logits/chosen": -1.008504867553711, + "logits/rejected": -0.9239922761917114, + "logps/chosen": -337.4940490722656, + "logps/rejected": -437.9051208496094, + "loss": -0.5673, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -8.190189361572266, + "rewards/margins": 75.09192657470703, + "rewards/rejected": -83.28211975097656, + "step": 2200 + }, + { + "epoch": 0.4337585868498528, + "grad_norm": 166.643715503973, + "learning_rate": 3.4872858789676583e-07, + "logits/chosen": 0.4045542776584625, + "logits/rejected": 0.12168149650096893, + "logps/chosen": -279.0066833496094, + "logps/rejected": -310.8238525390625, + "loss": -0.3537, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -46.079505920410156, + "rewards/margins": 30.016101837158203, + "rewards/rejected": -76.09561157226562, + "step": 2210 + }, + { + "epoch": 0.43572129538763493, + "grad_norm": 149.6903066102506, + "learning_rate": 3.4715254353053236e-07, + "logits/chosen": 0.14150777459144592, + "logits/rejected": 0.6271403431892395, + "logps/chosen": -296.629638671875, + "logps/rejected": -407.3511657714844, + "loss": -0.5133, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -24.697423934936523, + "rewards/margins": 101.6956787109375, + "rewards/rejected": -126.39311218261719, + "step": 2220 + }, + { + "epoch": 0.43768400392541706, + "grad_norm": 246.7681146350035, + "learning_rate": 3.4557193802284123e-07, + "logits/chosen": -0.16007724404335022, + "logits/rejected": 0.5914433598518372, + "logps/chosen": -307.1545104980469, + "logps/rejected": -362.4603271484375, + "loss": -0.1753, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -42.424102783203125, + "rewards/margins": 61.89319610595703, + "rewards/rejected": -104.31729888916016, + "step": 2230 + }, + { + "epoch": 0.4396467124631992, + "grad_norm": 92.52457640772026, + "learning_rate": 3.4398684558034763e-07, + "logits/chosen": 0.8897304534912109, + "logits/rejected": 0.8202276229858398, + "logps/chosen": -281.07421875, + "logps/rejected": -344.7154541015625, + "loss": -0.4262, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -12.59819507598877, + "rewards/margins": 82.18891906738281, + "rewards/rejected": -94.78712463378906, + "step": 2240 + }, + { + "epoch": 0.44160942100098133, + "grad_norm": 235.72772785387758, + "learning_rate": 3.4239734062036067e-07, + "logits/chosen": 0.2785721719264984, + "logits/rejected": 0.6844779849052429, + "logps/chosen": -319.9175109863281, + "logps/rejected": -397.5693664550781, + "loss": -0.4579, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -35.869171142578125, + "rewards/margins": 81.81893157958984, + "rewards/rejected": -117.68809509277344, + "step": 2250 + }, + { + "epoch": 0.4435721295387635, + "grad_norm": 123.2801996558972, + "learning_rate": 3.4080349776734924e-07, + "logits/chosen": 0.30322542786598206, + "logits/rejected": 1.1189171075820923, + "logps/chosen": -308.3179626464844, + "logps/rejected": -398.7733459472656, + "loss": -0.4235, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -9.656079292297363, + "rewards/margins": 111.47917175292969, + "rewards/rejected": -121.13525390625, + "step": 2260 + }, + { + "epoch": 0.44553483807654565, + "grad_norm": 422.7951249805826, + "learning_rate": 3.392053918494389e-07, + "logits/chosen": 0.1557307094335556, + "logits/rejected": 0.32888704538345337, + "logps/chosen": -312.52972412109375, + "logps/rejected": -353.87396240234375, + "loss": -0.3878, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -6.256176948547363, + "rewards/margins": 77.47406005859375, + "rewards/rejected": -83.73023986816406, + "step": 2270 + }, + { + "epoch": 0.4474975466143278, + "grad_norm": 381.7477138574789, + "learning_rate": 3.376030978948983e-07, + "logits/chosen": 0.04796195402741432, + "logits/rejected": 1.2978018522262573, + "logps/chosen": -382.04931640625, + "logps/rejected": -459.36932373046875, + "loss": -0.6204, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -59.7066535949707, + "rewards/margins": 112.34959411621094, + "rewards/rejected": -172.05625915527344, + "step": 2280 + }, + { + "epoch": 0.4494602551521099, + "grad_norm": 182.29284763727256, + "learning_rate": 3.3599669112861756e-07, + "logits/chosen": 0.24235352873802185, + "logits/rejected": 0.5556513071060181, + "logps/chosen": -301.8162841796875, + "logps/rejected": -431.4063415527344, + "loss": -0.3233, + "rewards/accuracies": 0.7000001072883606, + "rewards/chosen": -32.53940963745117, + "rewards/margins": 106.93785095214844, + "rewards/rejected": -139.47727966308594, + "step": 2290 + }, + { + "epoch": 0.45142296368989204, + "grad_norm": 145.6022579835461, + "learning_rate": 3.343862469685755e-07, + "logits/chosen": 0.16065433621406555, + "logits/rejected": 0.9648358225822449, + "logps/chosen": -261.49786376953125, + "logps/rejected": -349.4292907714844, + "loss": -0.5335, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -21.281448364257812, + "rewards/margins": 100.9854965209961, + "rewards/rejected": -122.2669448852539, + "step": 2300 + }, + { + "epoch": 0.4533856722276742, + "grad_norm": 137.10951415660998, + "learning_rate": 3.3277184102230004e-07, + "logits/chosen": -0.3531091511249542, + "logits/rejected": -0.30775076150894165, + "logps/chosen": -281.5614929199219, + "logps/rejected": -357.7174072265625, + "loss": -0.3896, + "rewards/accuracies": 0.6333333849906921, + "rewards/chosen": -11.92430591583252, + "rewards/margins": 71.40213775634766, + "rewards/rejected": -83.32644653320312, + "step": 2310 + }, + { + "epoch": 0.4553483807654563, + "grad_norm": 193.8212343916491, + "learning_rate": 3.311535490833176e-07, + "logits/chosen": 0.26778221130371094, + "logits/rejected": 0.6380144953727722, + "logps/chosen": -269.3844299316406, + "logps/rejected": -392.78302001953125, + "loss": -0.4826, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -20.432802200317383, + "rewards/margins": 85.39807891845703, + "rewards/rejected": -105.83088684082031, + "step": 2320 + }, + { + "epoch": 0.4573110893032385, + "grad_norm": 167.41836148316835, + "learning_rate": 3.2953144712759537e-07, + "logits/chosen": -0.6841514110565186, + "logits/rejected": 0.876046359539032, + "logps/chosen": -359.23260498046875, + "logps/rejected": -413.22320556640625, + "loss": -0.6022, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -16.56559181213379, + "rewards/margins": 147.17495727539062, + "rewards/rejected": -163.74053955078125, + "step": 2330 + }, + { + "epoch": 0.4592737978410206, + "grad_norm": 679.0340078732252, + "learning_rate": 3.279056113099742e-07, + "logits/chosen": -0.7158817052841187, + "logits/rejected": 0.32771921157836914, + "logps/chosen": -294.82647705078125, + "logps/rejected": -476.7598571777344, + "loss": -0.5756, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.300946235656738, + "rewards/margins": 140.09725952148438, + "rewards/rejected": -147.39822387695312, + "step": 2340 + }, + { + "epoch": 0.46123650637880276, + "grad_norm": 224.68757055105792, + "learning_rate": 3.2627611796059283e-07, + "logits/chosen": 0.038073696196079254, + "logits/rejected": 0.6189560294151306, + "logps/chosen": -309.27520751953125, + "logps/rejected": -335.2940368652344, + "loss": -0.4494, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -36.30601501464844, + "rewards/margins": 60.565773010253906, + "rewards/rejected": -96.87178039550781, + "step": 2350 + }, + { + "epoch": 0.4631992149165849, + "grad_norm": 192.03484144768032, + "learning_rate": 3.246430435813051e-07, + "logits/chosen": -0.3521607518196106, + "logits/rejected": 0.6606711745262146, + "logps/chosen": -341.88897705078125, + "logps/rejected": -338.5372314453125, + "loss": -0.4172, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -68.22154235839844, + "rewards/margins": 52.46270751953125, + "rewards/rejected": -120.68424987792969, + "step": 2360 + }, + { + "epoch": 0.465161923454367, + "grad_norm": 285.66527165372344, + "learning_rate": 3.230064648420878e-07, + "logits/chosen": -0.6408053636550903, + "logits/rejected": 0.7174103260040283, + "logps/chosen": -294.173583984375, + "logps/rejected": -284.22052001953125, + "loss": -0.4603, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -17.475337982177734, + "rewards/margins": 73.92915344238281, + "rewards/rejected": -91.40447998046875, + "step": 2370 + }, + { + "epoch": 0.46712463199214915, + "grad_norm": 238.2655596420244, + "learning_rate": 3.2136645857744114e-07, + "logits/chosen": 0.41769298911094666, + "logits/rejected": 0.5791391134262085, + "logps/chosen": -255.065673828125, + "logps/rejected": -362.6512451171875, + "loss": -0.5383, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -22.547447204589844, + "rewards/margins": 63.11621856689453, + "rewards/rejected": -85.66365814208984, + "step": 2380 + }, + { + "epoch": 0.4690873405299313, + "grad_norm": 157.67920365410467, + "learning_rate": 3.197231017827818e-07, + "logits/chosen": -0.4802599549293518, + "logits/rejected": 0.2917357385158539, + "logps/chosen": -271.93975830078125, + "logps/rejected": -321.23828125, + "loss": -0.5056, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 9.564969062805176, + "rewards/margins": 69.64512634277344, + "rewards/rejected": -60.08015823364258, + "step": 2390 + }, + { + "epoch": 0.47105004906771347, + "grad_norm": 148.06614127642922, + "learning_rate": 3.1807647161082797e-07, + "logits/chosen": -0.18546701967716217, + "logits/rejected": 0.9859294891357422, + "logps/chosen": -277.1785888671875, + "logps/rejected": -354.2449951171875, + "loss": -0.5329, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.37544584274292, + "rewards/margins": 94.2538070678711, + "rewards/rejected": -96.62925720214844, + "step": 2400 + }, + { + "epoch": 0.4730127576054956, + "grad_norm": 356.8778111878992, + "learning_rate": 3.1642664536797693e-07, + "logits/chosen": -0.04275861382484436, + "logits/rejected": 1.0751268863677979, + "logps/chosen": -305.48382568359375, + "logps/rejected": -389.33551025390625, + "loss": -0.4543, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -32.48809051513672, + "rewards/margins": 82.50044250488281, + "rewards/rejected": -114.988525390625, + "step": 2410 + }, + { + "epoch": 0.47497546614327774, + "grad_norm": 310.96917376343254, + "learning_rate": 3.147737005106762e-07, + "logits/chosen": 0.6562548875808716, + "logits/rejected": 0.7398786544799805, + "logps/chosen": -342.5672607421875, + "logps/rejected": -414.20123291015625, + "loss": -0.4527, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -43.527381896972656, + "rewards/margins": 96.02046966552734, + "rewards/rejected": -139.54783630371094, + "step": 2420 + }, + { + "epoch": 0.47693817468105987, + "grad_norm": 233.7335407647502, + "learning_rate": 3.1311771464178655e-07, + "logits/chosen": 0.7289456129074097, + "logits/rejected": 1.8622195720672607, + "logps/chosen": -312.67974853515625, + "logps/rejected": -346.43865966796875, + "loss": -0.6878, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -17.431365966796875, + "rewards/margins": 112.78873443603516, + "rewards/rejected": -130.2200927734375, + "step": 2430 + }, + { + "epoch": 0.478900883218842, + "grad_norm": 423.8615337902106, + "learning_rate": 3.1145876550693893e-07, + "logits/chosen": -0.29180586338043213, + "logits/rejected": 1.4814244508743286, + "logps/chosen": -320.2853088378906, + "logps/rejected": -407.13165283203125, + "loss": -0.6833, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -34.53456115722656, + "rewards/margins": 138.62493896484375, + "rewards/rejected": -173.1595001220703, + "step": 2440 + }, + { + "epoch": 0.48086359175662413, + "grad_norm": 744.701122420697, + "learning_rate": 3.097969309908847e-07, + "logits/chosen": 0.9363061189651489, + "logits/rejected": 1.8906930685043335, + "logps/chosen": -251.7770538330078, + "logps/rejected": -304.05963134765625, + "loss": -0.6316, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -12.564455032348633, + "rewards/margins": 84.75445556640625, + "rewards/rejected": -97.31892395019531, + "step": 2450 + }, + { + "epoch": 0.48282630029440626, + "grad_norm": 251.38223075018033, + "learning_rate": 3.081322891138382e-07, + "logits/chosen": 0.20053064823150635, + "logits/rejected": 0.4371492266654968, + "logps/chosen": -320.5032958984375, + "logps/rejected": -346.4121398925781, + "loss": -0.5638, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -15.324429512023926, + "rewards/margins": 69.0589599609375, + "rewards/rejected": -84.38338470458984, + "step": 2460 + }, + { + "epoch": 0.4847890088321884, + "grad_norm": 245.21472704086145, + "learning_rate": 3.0646491802781514e-07, + "logits/chosen": 0.8865596055984497, + "logits/rejected": 2.0313923358917236, + "logps/chosen": -325.08001708984375, + "logps/rejected": -273.4022521972656, + "loss": -0.4297, + "rewards/accuracies": 0.6333333849906921, + "rewards/chosen": -58.433311462402344, + "rewards/margins": 34.4904670715332, + "rewards/rejected": -92.92378234863281, + "step": 2470 + }, + { + "epoch": 0.4867517173699706, + "grad_norm": 175.9266452708588, + "learning_rate": 3.047948960129624e-07, + "logits/chosen": 0.5433771014213562, + "logits/rejected": 0.8757207989692688, + "logps/chosen": -209.1022186279297, + "logps/rejected": -347.94110107421875, + "loss": -0.5127, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": 0.3948183059692383, + "rewards/margins": 130.8015594482422, + "rewards/rejected": -130.40673828125, + "step": 2480 + }, + { + "epoch": 0.4887144259077527, + "grad_norm": 429.45192151115884, + "learning_rate": 3.0312230147388334e-07, + "logits/chosen": -0.4421153962612152, + "logits/rejected": 0.6323944330215454, + "logps/chosen": -348.27679443359375, + "logps/rejected": -481.54931640625, + "loss": -0.603, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -39.05653762817383, + "rewards/margins": 135.2555389404297, + "rewards/rejected": -174.3120880126953, + "step": 2490 + }, + { + "epoch": 0.49067713444553485, + "grad_norm": 377.05759477567375, + "learning_rate": 3.01447212935957e-07, + "logits/chosen": 0.0974908173084259, + "logits/rejected": 0.05548914521932602, + "logps/chosen": -322.7247314453125, + "logps/rejected": -400.3882751464844, + "loss": -0.2574, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": -96.27644348144531, + "rewards/margins": 53.063560485839844, + "rewards/rejected": -149.3400115966797, + "step": 2500 + }, + { + "epoch": 0.492639842983317, + "grad_norm": 314.43327625998256, + "learning_rate": 2.9976970904165104e-07, + "logits/chosen": -0.8013470768928528, + "logits/rejected": 0.47607460618019104, + "logps/chosen": -423.392578125, + "logps/rejected": -418.38525390625, + "loss": -0.4672, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -38.44776916503906, + "rewards/margins": 85.25347137451172, + "rewards/rejected": -123.70123291015625, + "step": 2510 + }, + { + "epoch": 0.4946025515210991, + "grad_norm": 433.9401551770119, + "learning_rate": 2.980898685468301e-07, + "logits/chosen": -0.4024876654148102, + "logits/rejected": 0.22271475195884705, + "logps/chosen": -301.8051452636719, + "logps/rejected": -352.66485595703125, + "loss": -0.5458, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.163656234741211, + "rewards/margins": 117.77667236328125, + "rewards/rejected": -119.9403305053711, + "step": 2520 + }, + { + "epoch": 0.49656526005888124, + "grad_norm": 147.05168354892143, + "learning_rate": 2.96407770317058e-07, + "logits/chosen": 0.1366831511259079, + "logits/rejected": 0.10594828426837921, + "logps/chosen": -226.49386596679688, + "logps/rejected": -310.25079345703125, + "loss": -0.6687, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -18.096805572509766, + "rewards/margins": 83.7448959350586, + "rewards/rejected": -101.84171295166016, + "step": 2530 + }, + { + "epoch": 0.4985279685966634, + "grad_norm": 493.55900973884064, + "learning_rate": 2.9472349332389523e-07, + "logits/chosen": -0.4268958568572998, + "logits/rejected": 1.171649694442749, + "logps/chosen": -324.69378662109375, + "logps/rejected": -340.44158935546875, + "loss": -0.3868, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -31.31919288635254, + "rewards/margins": 109.56556701660156, + "rewards/rejected": -140.88473510742188, + "step": 2540 + }, + { + "epoch": 0.5004906771344455, + "grad_norm": 273.7588000109085, + "learning_rate": 2.930371166411915e-07, + "logits/chosen": -0.6654374003410339, + "logits/rejected": 0.21415922045707703, + "logps/chosen": -335.7213439941406, + "logps/rejected": -407.8766174316406, + "loss": -0.3825, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.95157241821289, + "rewards/margins": 74.48597717285156, + "rewards/rejected": -95.43756103515625, + "step": 2550 + }, + { + "epoch": 0.5024533856722276, + "grad_norm": 259.545847753256, + "learning_rate": 2.913487194413731e-07, + "logits/chosen": -0.5894133448600769, + "logits/rejected": -0.5803017020225525, + "logps/chosen": -288.4310302734375, + "logps/rejected": -443.6865234375, + "loss": -0.5866, + "rewards/accuracies": 0.8999999165534973, + "rewards/chosen": -5.7689409255981445, + "rewards/margins": 158.903564453125, + "rewards/rejected": -164.67250061035156, + "step": 2560 + }, + { + "epoch": 0.5044160942100098, + "grad_norm": 291.4075312745963, + "learning_rate": 2.896583809917262e-07, + "logits/chosen": -0.1378103792667389, + "logits/rejected": -0.324067085981369, + "logps/chosen": -248.91915893554688, + "logps/rejected": -341.13995361328125, + "loss": -0.3583, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -24.418867111206055, + "rewards/margins": 91.70320892333984, + "rewards/rejected": -116.12208557128906, + "step": 2570 + }, + { + "epoch": 0.5063788027477919, + "grad_norm": 135.6645715131718, + "learning_rate": 2.879661806506751e-07, + "logits/chosen": -0.561540424823761, + "logits/rejected": -0.05086873844265938, + "logps/chosen": -315.7080078125, + "logps/rejected": -410.9491271972656, + "loss": -0.364, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -42.76870346069336, + "rewards/margins": 58.515785217285156, + "rewards/rejected": -101.28450012207031, + "step": 2580 + }, + { + "epoch": 0.5083415112855741, + "grad_norm": 315.9674431217383, + "learning_rate": 2.86272197864057e-07, + "logits/chosen": -1.497479796409607, + "logits/rejected": -1.1507604122161865, + "logps/chosen": -356.84002685546875, + "logps/rejected": -362.37152099609375, + "loss": -0.4785, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -36.30466842651367, + "rewards/margins": 71.90602111816406, + "rewards/rejected": -108.21067810058594, + "step": 2590 + }, + { + "epoch": 0.5103042198233563, + "grad_norm": 118.39257618780518, + "learning_rate": 2.845765121613912e-07, + "logits/chosen": -0.8514739274978638, + "logits/rejected": -0.6283332109451294, + "logps/chosen": -309.8434143066406, + "logps/rejected": -350.513671875, + "loss": -0.7185, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -14.283230781555176, + "rewards/margins": 137.38882446289062, + "rewards/rejected": -151.67205810546875, + "step": 2600 + }, + { + "epoch": 0.5122669283611384, + "grad_norm": 285.0573022792299, + "learning_rate": 2.828792031521464e-07, + "logits/chosen": -0.9516509771347046, + "logits/rejected": -0.5323055982589722, + "logps/chosen": -328.3294372558594, + "logps/rejected": -464.4132385253906, + "loss": -0.4725, + "rewards/accuracies": 0.9333333969116211, + "rewards/chosen": -13.5633544921875, + "rewards/margins": 155.688232421875, + "rewards/rejected": -169.25160217285156, + "step": 2610 + }, + { + "epoch": 0.5142296368989205, + "grad_norm": 221.43043700872147, + "learning_rate": 2.811803505220025e-07, + "logits/chosen": -0.9343031644821167, + "logits/rejected": -0.3405666947364807, + "logps/chosen": -282.5321044921875, + "logps/rejected": -337.3126525878906, + "loss": -0.6065, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -19.01625633239746, + "rewards/margins": 110.60337829589844, + "rewards/rejected": -129.61962890625, + "step": 2620 + }, + { + "epoch": 0.5161923454367027, + "grad_norm": 237.370168514313, + "learning_rate": 2.7948003402910975e-07, + "logits/chosen": -0.1904366910457611, + "logits/rejected": 0.3686249554157257, + "logps/chosen": -328.06866455078125, + "logps/rejected": -384.8655700683594, + "loss": -0.3728, + "rewards/accuracies": 0.73333340883255, + "rewards/chosen": -30.02372169494629, + "rewards/margins": 101.01863861083984, + "rewards/rejected": -131.0423583984375, + "step": 2630 + }, + { + "epoch": 0.5181550539744848, + "grad_norm": 139.61263912514792, + "learning_rate": 2.777783335003442e-07, + "logits/chosen": -0.42642202973365784, + "logits/rejected": 0.09071238338947296, + "logps/chosen": -342.4646301269531, + "logps/rejected": -403.5032653808594, + "loss": -0.5165, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -30.88848304748535, + "rewards/margins": 112.633056640625, + "rewards/rejected": -143.52154541015625, + "step": 2640 + }, + { + "epoch": 0.5201177625122669, + "grad_norm": 391.3608215437027, + "learning_rate": 2.760753288275598e-07, + "logits/chosen": -0.25946754217147827, + "logits/rejected": 0.7186147570610046, + "logps/chosen": -279.14361572265625, + "logps/rejected": -379.6006164550781, + "loss": -0.3133, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -22.769941329956055, + "rewards/margins": 103.9280776977539, + "rewards/rejected": -126.6980209350586, + "step": 2650 + }, + { + "epoch": 0.5220804710500491, + "grad_norm": 308.8738180108568, + "learning_rate": 2.7437109996383795e-07, + "logits/chosen": 0.22495320439338684, + "logits/rejected": 1.347975730895996, + "logps/chosen": -303.25616455078125, + "logps/rejected": -349.54010009765625, + "loss": -0.4444, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -47.82208251953125, + "rewards/margins": 91.51597595214844, + "rewards/rejected": -139.33804321289062, + "step": 2660 + }, + { + "epoch": 0.5240431795878312, + "grad_norm": 105.59483552440615, + "learning_rate": 2.7266572691973365e-07, + "logits/chosen": -0.7208471894264221, + "logits/rejected": 0.026754379272460938, + "logps/chosen": -351.32861328125, + "logps/rejected": -366.71551513671875, + "loss": -0.548, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -17.688404083251953, + "rewards/margins": 72.31254577636719, + "rewards/rejected": -90.00094604492188, + "step": 2670 + }, + { + "epoch": 0.5260058881256133, + "grad_norm": 547.2158458872873, + "learning_rate": 2.709592897595191e-07, + "logits/chosen": 0.32650887966156006, + "logits/rejected": 1.0637938976287842, + "logps/chosen": -291.64801025390625, + "logps/rejected": -251.3025360107422, + "loss": -0.5324, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.850461959838867, + "rewards/margins": 49.82951354980469, + "rewards/rejected": -65.67997741699219, + "step": 2680 + }, + { + "epoch": 0.5279685966633955, + "grad_norm": 473.378993250825, + "learning_rate": 2.6925186859742494e-07, + "logits/chosen": -0.1511944681406021, + "logits/rejected": 0.0580764040350914, + "logps/chosen": -273.9756774902344, + "logps/rejected": -322.53436279296875, + "loss": -0.4016, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -16.555021286010742, + "rewards/margins": 78.03257751464844, + "rewards/rejected": -94.58760070800781, + "step": 2690 + }, + { + "epoch": 0.5299313052011776, + "grad_norm": 391.3912175071586, + "learning_rate": 2.675435435938788e-07, + "logits/chosen": -0.3210826516151428, + "logits/rejected": 0.6731927990913391, + "logps/chosen": -344.1539611816406, + "logps/rejected": -376.76116943359375, + "loss": -0.5019, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -28.54323387145996, + "rewards/margins": 92.14056396484375, + "rewards/rejected": -120.68379974365234, + "step": 2700 + }, + { + "epoch": 0.5318940137389597, + "grad_norm": 171.0253907949937, + "learning_rate": 2.6583439495174247e-07, + "logits/chosen": -0.4241692125797272, + "logits/rejected": 1.0711032152175903, + "logps/chosen": -298.0030822753906, + "logps/rejected": -373.0411071777344, + "loss": -0.5928, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.104356288909912, + "rewards/margins": 125.88126373291016, + "rewards/rejected": -132.98562622070312, + "step": 2710 + }, + { + "epoch": 0.5338567222767419, + "grad_norm": 182.16139827707858, + "learning_rate": 2.6412450291254564e-07, + "logits/chosen": 0.5123127102851868, + "logits/rejected": 1.3891392946243286, + "logps/chosen": -307.1592102050781, + "logps/rejected": -346.6839294433594, + "loss": -0.5057, + "rewards/accuracies": 0.7333332300186157, + "rewards/chosen": -15.78852367401123, + "rewards/margins": 89.80287170410156, + "rewards/rejected": -105.59139251708984, + "step": 2720 + }, + { + "epoch": 0.535819430814524, + "grad_norm": 259.8816047792165, + "learning_rate": 2.6241394775271954e-07, + "logits/chosen": 0.8046048283576965, + "logits/rejected": 2.188105583190918, + "logps/chosen": -287.75726318359375, + "logps/rejected": -396.4264831542969, + "loss": -0.382, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -27.231454849243164, + "rewards/margins": 140.114501953125, + "rewards/rejected": -167.34597778320312, + "step": 2730 + }, + { + "epoch": 0.5377821393523062, + "grad_norm": 321.48906686200013, + "learning_rate": 2.607028097798276e-07, + "logits/chosen": 0.6382243037223816, + "logits/rejected": 1.6460765600204468, + "logps/chosen": -322.6622619628906, + "logps/rejected": -424.9124450683594, + "loss": -0.6139, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -36.69150161743164, + "rewards/margins": 95.38984680175781, + "rewards/rejected": -132.0813446044922, + "step": 2740 + }, + { + "epoch": 0.5397448478900884, + "grad_norm": 185.42514226312778, + "learning_rate": 2.5899116932879534e-07, + "logits/chosen": 1.3601300716400146, + "logits/rejected": 2.312749147415161, + "logps/chosen": -243.48727416992188, + "logps/rejected": -364.85150146484375, + "loss": -0.6528, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -43.58977127075195, + "rewards/margins": 112.76011657714844, + "rewards/rejected": -156.34988403320312, + "step": 2750 + }, + { + "epoch": 0.5417075564278705, + "grad_norm": 113.87898599411729, + "learning_rate": 2.5727910675813866e-07, + "logits/chosen": 0.5609289407730103, + "logits/rejected": 0.6823413968086243, + "logps/chosen": -265.54571533203125, + "logps/rejected": -365.4170837402344, + "loss": -0.5823, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -12.634196281433105, + "rewards/margins": 80.48759460449219, + "rewards/rejected": -93.12178802490234, + "step": 2760 + }, + { + "epoch": 0.5436702649656526, + "grad_norm": 130.63615854608577, + "learning_rate": 2.555667024461915e-07, + "logits/chosen": 0.29585856199264526, + "logits/rejected": 0.36376041173934937, + "logps/chosen": -252.49246215820312, + "logps/rejected": -383.80352783203125, + "loss": -0.4498, + "rewards/accuracies": 0.6333333849906921, + "rewards/chosen": -19.849376678466797, + "rewards/margins": 98.01789855957031, + "rewards/rejected": -117.86726379394531, + "step": 2770 + }, + { + "epoch": 0.5456329735034348, + "grad_norm": 170.48391015666377, + "learning_rate": 2.5385403678733157e-07, + "logits/chosen": 0.8951346278190613, + "logits/rejected": 1.0202525854110718, + "logps/chosen": -254.63217163085938, + "logps/rejected": -348.728271484375, + "loss": -0.3319, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -55.43613815307617, + "rewards/margins": 61.74285125732422, + "rewards/rejected": -117.17899322509766, + "step": 2780 + }, + { + "epoch": 0.5475956820412169, + "grad_norm": 209.03297035174046, + "learning_rate": 2.521411901882067e-07, + "logits/chosen": 0.1450597047805786, + "logits/rejected": 1.8982454538345337, + "logps/chosen": -283.434326171875, + "logps/rejected": -355.127197265625, + "loss": -0.4797, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.629661560058594, + "rewards/margins": 128.4715118408203, + "rewards/rejected": -149.10116577148438, + "step": 2790 + }, + { + "epoch": 0.549558390578999, + "grad_norm": 316.4189278339142, + "learning_rate": 2.504282430639594e-07, + "logits/chosen": -0.9220380783081055, + "logits/rejected": -0.2594057023525238, + "logps/chosen": -219.96041870117188, + "logps/rejected": -279.0653076171875, + "loss": -0.5967, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -11.8013334274292, + "rewards/margins": 62.681007385253906, + "rewards/rejected": -74.48234558105469, + "step": 2800 + }, + { + "epoch": 0.5515210991167812, + "grad_norm": 213.5433079797419, + "learning_rate": 2.4871527583445163e-07, + "logits/chosen": -0.31822261214256287, + "logits/rejected": 0.5024303197860718, + "logps/chosen": -317.51275634765625, + "logps/rejected": -335.9136657714844, + "loss": -0.4669, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -8.082704544067383, + "rewards/margins": 54.3105583190918, + "rewards/rejected": -62.39326095581055, + "step": 2810 + }, + { + "epoch": 0.5534838076545633, + "grad_norm": 816.1579619271733, + "learning_rate": 2.470023689204893e-07, + "logits/chosen": 0.23843038082122803, + "logits/rejected": 0.5871859788894653, + "logps/chosen": -309.62445068359375, + "logps/rejected": -391.88250732421875, + "loss": -0.5, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.068517684936523, + "rewards/margins": 96.9255142211914, + "rewards/rejected": -111.9940414428711, + "step": 2820 + }, + { + "epoch": 0.5554465161923454, + "grad_norm": 280.3106207734899, + "learning_rate": 2.452896027400465e-07, + "logits/chosen": -0.3961392939090729, + "logits/rejected": 0.7283745408058167, + "logps/chosen": -339.65435791015625, + "logps/rejected": -421.9934997558594, + "loss": -0.4777, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -59.08606719970703, + "rewards/margins": 80.55363464355469, + "rewards/rejected": -139.63970947265625, + "step": 2830 + }, + { + "epoch": 0.5574092247301276, + "grad_norm": 167.81413165501237, + "learning_rate": 2.4357705770449046e-07, + "logits/chosen": 0.23879416286945343, + "logits/rejected": 0.9762646555900574, + "logps/chosen": -245.05215454101562, + "logps/rejected": -347.5263671875, + "loss": -0.5869, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": 4.809001922607422, + "rewards/margins": 114.6981201171875, + "rewards/rejected": -109.88912200927734, + "step": 2840 + }, + { + "epoch": 0.5593719332679097, + "grad_norm": 410.71247214339223, + "learning_rate": 2.418648142148056e-07, + "logits/chosen": 0.6748315095901489, + "logits/rejected": 1.1633888483047485, + "logps/chosen": -294.86175537109375, + "logps/rejected": -309.39154052734375, + "loss": -0.457, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -11.052109718322754, + "rewards/margins": 81.19559478759766, + "rewards/rejected": -92.2477035522461, + "step": 2850 + }, + { + "epoch": 0.5613346418056918, + "grad_norm": 413.7593541786494, + "learning_rate": 2.4015295265781966e-07, + "logits/chosen": 0.6866556406021118, + "logits/rejected": 1.3604393005371094, + "logps/chosen": -299.4148864746094, + "logps/rejected": -408.46795654296875, + "loss": -0.5172, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.166587233543396, + "rewards/margins": 98.66439819335938, + "rewards/rejected": -99.83097839355469, + "step": 2860 + }, + { + "epoch": 0.563297350343474, + "grad_norm": 181.55258976710394, + "learning_rate": 2.3844155340242893e-07, + "logits/chosen": 0.3880153298377991, + "logits/rejected": 1.2153961658477783, + "logps/chosen": -225.6029052734375, + "logps/rejected": -322.51715087890625, + "loss": -0.4219, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -9.458656311035156, + "rewards/margins": 94.21931457519531, + "rewards/rejected": -103.677978515625, + "step": 2870 + }, + { + "epoch": 0.5652600588812562, + "grad_norm": 220.54795795101919, + "learning_rate": 2.36730696795826e-07, + "logits/chosen": 0.18370838463306427, + "logits/rejected": 0.2858714461326599, + "logps/chosen": -278.33013916015625, + "logps/rejected": -425.53204345703125, + "loss": -0.5321, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": -38.12256622314453, + "rewards/margins": 51.5951042175293, + "rewards/rejected": -89.71766662597656, + "step": 2880 + }, + { + "epoch": 0.5672227674190383, + "grad_norm": 578.8234640908487, + "learning_rate": 2.3502046315972655e-07, + "logits/chosen": 0.17645081877708435, + "logits/rejected": 1.2865142822265625, + "logps/chosen": -331.9459228515625, + "logps/rejected": -421.33282470703125, + "loss": -0.7247, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -13.197916984558105, + "rewards/margins": 114.34892272949219, + "rewards/rejected": -127.54683685302734, + "step": 2890 + }, + { + "epoch": 0.5691854759568205, + "grad_norm": 232.0675435051825, + "learning_rate": 2.3331093278659906e-07, + "logits/chosen": 0.49825865030288696, + "logits/rejected": 1.0472362041473389, + "logps/chosen": -360.3938293457031, + "logps/rejected": -421.24884033203125, + "loss": -0.4672, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -51.801918029785156, + "rewards/margins": 85.29037475585938, + "rewards/rejected": -137.09228515625, + "step": 2900 + }, + { + "epoch": 0.5711481844946026, + "grad_norm": 471.76705387499675, + "learning_rate": 2.31602185935895e-07, + "logits/chosen": 0.01983051374554634, + "logits/rejected": 1.5712801218032837, + "logps/chosen": -293.21490478515625, + "logps/rejected": -355.33935546875, + "loss": -0.5618, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -9.408475875854492, + "rewards/margins": 122.68055725097656, + "rewards/rejected": -132.0890350341797, + "step": 2910 + }, + { + "epoch": 0.5731108930323847, + "grad_norm": 245.42857665560027, + "learning_rate": 2.298943028302811e-07, + "logits/chosen": -0.35872071981430054, + "logits/rejected": 0.5607597827911377, + "logps/chosen": -299.25189208984375, + "logps/rejected": -461.92694091796875, + "loss": -0.6854, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.7750883102417, + "rewards/margins": 140.8957061767578, + "rewards/rejected": -150.67079162597656, + "step": 2920 + }, + { + "epoch": 0.5750736015701668, + "grad_norm": 266.0404820595972, + "learning_rate": 2.2818736365187242e-07, + "logits/chosen": 0.13500066101551056, + "logits/rejected": 1.0924553871154785, + "logps/chosen": -255.1774139404297, + "logps/rejected": -331.12811279296875, + "loss": -0.6754, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -28.0899658203125, + "rewards/margins": 111.04876708984375, + "rewards/rejected": -139.13873291015625, + "step": 2930 + }, + { + "epoch": 0.577036310107949, + "grad_norm": 334.69553613754675, + "learning_rate": 2.2648144853846847e-07, + "logits/chosen": 1.2062290906906128, + "logits/rejected": 2.1320557594299316, + "logps/chosen": -282.756103515625, + "logps/rejected": -425.32763671875, + "loss": -0.8328, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -23.573299407958984, + "rewards/margins": 136.2438201904297, + "rewards/rejected": -159.81710815429688, + "step": 2940 + }, + { + "epoch": 0.5789990186457311, + "grad_norm": 536.1371741380907, + "learning_rate": 2.247766375797906e-07, + "logits/chosen": 1.191646933555603, + "logits/rejected": 1.212264060974121, + "logps/chosen": -222.5431671142578, + "logps/rejected": -283.1639709472656, + "loss": -0.4712, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -31.147907257080078, + "rewards/margins": 47.22834396362305, + "rewards/rejected": -78.37625885009766, + "step": 2950 + }, + { + "epoch": 0.5809617271835132, + "grad_norm": 277.61070176896783, + "learning_rate": 2.2307301081372222e-07, + "logits/chosen": 1.1878509521484375, + "logits/rejected": 1.0988099575042725, + "logps/chosen": -266.31292724609375, + "logps/rejected": -325.9090881347656, + "loss": -0.5037, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.415120124816895, + "rewards/margins": 41.67178726196289, + "rewards/rejected": -50.0869026184082, + "step": 2960 + }, + { + "epoch": 0.5829244357212954, + "grad_norm": 188.6950936096456, + "learning_rate": 2.2137064822255086e-07, + "logits/chosen": 0.7357149720191956, + "logits/rejected": 1.3781557083129883, + "logps/chosen": -268.281005859375, + "logps/rejected": -321.81365966796875, + "loss": -0.2804, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -47.52278137207031, + "rewards/margins": 58.322242736816406, + "rewards/rejected": -105.84503173828125, + "step": 2970 + }, + { + "epoch": 0.5848871442590775, + "grad_norm": 236.25051602619024, + "learning_rate": 2.1966962972921322e-07, + "logits/chosen": 0.6798173785209656, + "logits/rejected": 1.3316466808319092, + "logps/chosen": -246.56906127929688, + "logps/rejected": -374.881103515625, + "loss": -0.4841, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -9.728264808654785, + "rewards/margins": 95.40788269042969, + "rewards/rejected": -105.13615417480469, + "step": 2980 + }, + { + "epoch": 0.5868498527968596, + "grad_norm": 306.5765043786579, + "learning_rate": 2.1797003519354285e-07, + "logits/chosen": 0.7005780935287476, + "logits/rejected": 1.2031795978546143, + "logps/chosen": -283.9145202636719, + "logps/rejected": -374.4034423828125, + "loss": -0.4342, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -22.043981552124023, + "rewards/margins": 89.0151596069336, + "rewards/rejected": -111.05914306640625, + "step": 2990 + }, + { + "epoch": 0.5888125613346418, + "grad_norm": 165.1066746878023, + "learning_rate": 2.1627194440852142e-07, + "logits/chosen": 1.7588584423065186, + "logits/rejected": 2.171719789505005, + "logps/chosen": -332.97210693359375, + "logps/rejected": -390.28961181640625, + "loss": -0.5126, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -39.196678161621094, + "rewards/margins": 92.90324401855469, + "rewards/rejected": -132.09991455078125, + "step": 3000 + }, + { + "epoch": 0.5907752698724239, + "grad_norm": 288.32571922188697, + "learning_rate": 2.1457543709653176e-07, + "logits/chosen": 0.5540057420730591, + "logits/rejected": 1.577968955039978, + "logps/chosen": -303.7123107910156, + "logps/rejected": -401.205078125, + "loss": -0.6076, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -29.9188232421875, + "rewards/margins": 119.54066467285156, + "rewards/rejected": -149.45950317382812, + "step": 3010 + }, + { + "epoch": 0.592737978410206, + "grad_norm": 183.1564103096898, + "learning_rate": 2.128805929056154e-07, + "logits/chosen": 0.8288165330886841, + "logits/rejected": 1.4216700792312622, + "logps/chosen": -207.1270294189453, + "logps/rejected": -338.00469970703125, + "loss": -0.4724, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -26.292505264282227, + "rewards/margins": 117.13128662109375, + "rewards/rejected": -143.42379760742188, + "step": 3020 + }, + { + "epoch": 0.5947006869479883, + "grad_norm": 453.7931357345026, + "learning_rate": 2.1118749140573358e-07, + "logits/chosen": 0.2079722136259079, + "logits/rejected": 1.3552095890045166, + "logps/chosen": -273.36590576171875, + "logps/rejected": -342.1899108886719, + "loss": -0.7633, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -19.20699119567871, + "rewards/margins": 71.76216125488281, + "rewards/rejected": -90.96916198730469, + "step": 3030 + }, + { + "epoch": 0.5966633954857704, + "grad_norm": 361.84275984996304, + "learning_rate": 2.0949621208503092e-07, + "logits/chosen": 1.0352598428726196, + "logits/rejected": 1.4081958532333374, + "logps/chosen": -301.6096496582031, + "logps/rejected": -272.4801025390625, + "loss": -0.2859, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -0.8882265090942383, + "rewards/margins": 50.127777099609375, + "rewards/rejected": -51.0160026550293, + "step": 3040 + }, + { + "epoch": 0.5986261040235525, + "grad_norm": 284.93897537011503, + "learning_rate": 2.0780683434610413e-07, + "logits/chosen": 1.1556251049041748, + "logits/rejected": 1.7304332256317139, + "logps/chosen": -267.9789123535156, + "logps/rejected": -346.31591796875, + "loss": -0.5116, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -27.530445098876953, + "rewards/margins": 64.04904174804688, + "rewards/rejected": -91.57948303222656, + "step": 3050 + }, + { + "epoch": 0.6005888125613347, + "grad_norm": 482.4916907396394, + "learning_rate": 2.0611943750227375e-07, + "logits/chosen": 1.165791630744934, + "logits/rejected": 1.4937446117401123, + "logps/chosen": -295.9093322753906, + "logps/rejected": -373.0982666015625, + "loss": -0.5185, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -33.31257629394531, + "rewards/margins": 96.5605697631836, + "rewards/rejected": -129.87313842773438, + "step": 3060 + }, + { + "epoch": 0.6025515210991168, + "grad_norm": 331.97254237418036, + "learning_rate": 2.044341007738612e-07, + "logits/chosen": 0.7668994069099426, + "logits/rejected": 2.2817959785461426, + "logps/chosen": -363.9306335449219, + "logps/rejected": -378.9432373046875, + "loss": -0.2915, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -13.068647384643555, + "rewards/margins": 68.29893493652344, + "rewards/rejected": -81.36756896972656, + "step": 3070 + }, + { + "epoch": 0.6045142296368989, + "grad_norm": 418.3816094045505, + "learning_rate": 2.027509032844687e-07, + "logits/chosen": 0.9893747568130493, + "logits/rejected": 0.8026935458183289, + "logps/chosen": -384.5431823730469, + "logps/rejected": -456.28131103515625, + "loss": -0.4825, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -64.47868347167969, + "rewards/margins": 40.07658767700195, + "rewards/rejected": -104.5552749633789, + "step": 3080 + }, + { + "epoch": 0.6064769381746811, + "grad_norm": 1041.4192259949457, + "learning_rate": 2.010699240572651e-07, + "logits/chosen": 1.1153537034988403, + "logits/rejected": 0.9373297691345215, + "logps/chosen": -376.4952392578125, + "logps/rejected": -405.5873718261719, + "loss": -0.5225, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -17.543874740600586, + "rewards/margins": 62.97236251831055, + "rewards/rejected": -80.51622772216797, + "step": 3090 + }, + { + "epoch": 0.6084396467124632, + "grad_norm": 195.58219958672998, + "learning_rate": 1.993912420112756e-07, + "logits/chosen": 1.7597665786743164, + "logits/rejected": 1.7504100799560547, + "logps/chosen": -331.1658630371094, + "logps/rejected": -477.91741943359375, + "loss": -0.3962, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -7.719897270202637, + "rewards/margins": 86.05127716064453, + "rewards/rejected": -93.77116394042969, + "step": 3100 + }, + { + "epoch": 0.6104023552502453, + "grad_norm": 1212.7554210507622, + "learning_rate": 1.9771493595767707e-07, + "logits/chosen": 1.1921669244766235, + "logits/rejected": 1.6928989887237549, + "logps/chosen": -301.7481384277344, + "logps/rejected": -412.67529296875, + "loss": -0.4302, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": -16.064899444580078, + "rewards/margins": 69.83645629882812, + "rewards/rejected": -85.90135192871094, + "step": 3110 + }, + { + "epoch": 0.6123650637880275, + "grad_norm": 280.2192733262556, + "learning_rate": 1.9604108459609752e-07, + "logits/chosen": 2.572514295578003, + "logits/rejected": 2.3528380393981934, + "logps/chosen": -357.37030029296875, + "logps/rejected": -440.0013732910156, + "loss": -0.4794, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -37.611942291259766, + "rewards/margins": 84.75715637207031, + "rewards/rejected": -122.36910247802734, + "step": 3120 + }, + { + "epoch": 0.6143277723258096, + "grad_norm": 288.41772244279105, + "learning_rate": 1.9436976651092142e-07, + "logits/chosen": 1.4929567575454712, + "logits/rejected": 2.4030117988586426, + "logps/chosen": -364.6961975097656, + "logps/rejected": -454.45111083984375, + "loss": -0.4981, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -56.003074645996094, + "rewards/margins": 99.0272445678711, + "rewards/rejected": -155.03030395507812, + "step": 3130 + }, + { + "epoch": 0.6162904808635917, + "grad_norm": 223.58800209954262, + "learning_rate": 1.9270106016760035e-07, + "logits/chosen": 0.15036991238594055, + "logits/rejected": 1.5348567962646484, + "logps/chosen": -291.8357238769531, + "logps/rejected": -419.71136474609375, + "loss": -0.5021, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -28.053186416625977, + "rewards/margins": 118.88764953613281, + "rewards/rejected": -146.9408416748047, + "step": 3140 + }, + { + "epoch": 0.6182531894013739, + "grad_norm": 183.3480969124145, + "learning_rate": 1.9103504390896944e-07, + "logits/chosen": 2.1762146949768066, + "logits/rejected": 2.7052228450775146, + "logps/chosen": -280.4345397949219, + "logps/rejected": -441.0689392089844, + "loss": -0.626, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -56.543914794921875, + "rewards/margins": 98.6163101196289, + "rewards/rejected": -155.16024780273438, + "step": 3150 + }, + { + "epoch": 0.620215897939156, + "grad_norm": 172.9244867458964, + "learning_rate": 1.8937179595156876e-07, + "logits/chosen": 0.5130751729011536, + "logits/rejected": 1.79129958152771, + "logps/chosen": -311.43133544921875, + "logps/rejected": -364.58795166015625, + "loss": -0.623, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -32.472686767578125, + "rewards/margins": 123.1352767944336, + "rewards/rejected": -155.6079559326172, + "step": 3160 + }, + { + "epoch": 0.6221786064769381, + "grad_norm": 234.5070864974775, + "learning_rate": 1.8771139438197168e-07, + "logits/chosen": 1.4119322299957275, + "logits/rejected": 2.310541868209839, + "logps/chosen": -310.05572509765625, + "logps/rejected": -445.2001953125, + "loss": -0.5831, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -17.906932830810547, + "rewards/margins": 127.82010650634766, + "rewards/rejected": -145.72705078125, + "step": 3170 + }, + { + "epoch": 0.6241413150147204, + "grad_norm": 249.96181920277226, + "learning_rate": 1.8605391715311846e-07, + "logits/chosen": 1.629399061203003, + "logits/rejected": 3.0770516395568848, + "logps/chosen": -320.28265380859375, + "logps/rejected": -302.6684265136719, + "loss": -0.6057, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -35.234336853027344, + "rewards/margins": 60.21760177612305, + "rewards/rejected": -95.45193481445312, + "step": 3180 + }, + { + "epoch": 0.6261040235525025, + "grad_norm": 148.03831268903735, + "learning_rate": 1.8439944208065704e-07, + "logits/chosen": 1.3731263875961304, + "logits/rejected": 2.3876495361328125, + "logps/chosen": -372.33026123046875, + "logps/rejected": -455.12957763671875, + "loss": -0.4608, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -22.71662712097168, + "rewards/margins": 105.861328125, + "rewards/rejected": -128.5779571533203, + "step": 3190 + }, + { + "epoch": 0.6280667320902846, + "grad_norm": 244.4534245125885, + "learning_rate": 1.8274804683928913e-07, + "logits/chosen": 1.1035573482513428, + "logits/rejected": 2.6440072059631348, + "logps/chosen": -374.5699462890625, + "logps/rejected": -428.7478942871094, + "loss": -0.4164, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -41.70665740966797, + "rewards/margins": 103.33683776855469, + "rewards/rejected": -145.0435028076172, + "step": 3200 + }, + { + "epoch": 0.6300294406280668, + "grad_norm": 109.62242811529732, + "learning_rate": 1.810998089591238e-07, + "logits/chosen": 0.7609840035438538, + "logits/rejected": 1.2874010801315308, + "logps/chosen": -283.74871826171875, + "logps/rejected": -372.7958984375, + "loss": -0.4905, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -30.674938201904297, + "rewards/margins": 83.36442565917969, + "rewards/rejected": -114.03938293457031, + "step": 3210 + }, + { + "epoch": 0.6319921491658489, + "grad_norm": 121.27814585355463, + "learning_rate": 1.7945480582203745e-07, + "logits/chosen": 1.4029531478881836, + "logits/rejected": 1.2399829626083374, + "logps/chosen": -292.9909362792969, + "logps/rejected": -370.7956848144531, + "loss": -0.5169, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -44.148494720458984, + "rewards/margins": 58.81536102294922, + "rewards/rejected": -102.96385192871094, + "step": 3220 + }, + { + "epoch": 0.633954857703631, + "grad_norm": 280.83982168767716, + "learning_rate": 1.7781311465804128e-07, + "logits/chosen": 1.577337622642517, + "logits/rejected": 2.336193561553955, + "logps/chosen": -295.2773132324219, + "logps/rejected": -328.74664306640625, + "loss": -0.4481, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -16.177501678466797, + "rewards/margins": 79.00422668457031, + "rewards/rejected": -95.18171691894531, + "step": 3230 + }, + { + "epoch": 0.6359175662414132, + "grad_norm": 461.85620163691954, + "learning_rate": 1.7617481254165487e-07, + "logits/chosen": 1.1604465246200562, + "logits/rejected": 2.0431575775146484, + "logps/chosen": -278.5603332519531, + "logps/rejected": -354.8936767578125, + "loss": -0.6121, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -6.95641565322876, + "rewards/margins": 125.07762145996094, + "rewards/rejected": -132.03402709960938, + "step": 3240 + }, + { + "epoch": 0.6378802747791953, + "grad_norm": 294.72060086783773, + "learning_rate": 1.745399763882881e-07, + "logits/chosen": 1.236867904663086, + "logits/rejected": 2.3780226707458496, + "logps/chosen": -307.36224365234375, + "logps/rejected": -471.08795166015625, + "loss": -0.6499, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -17.60917091369629, + "rewards/margins": 176.69235229492188, + "rewards/rejected": -194.301513671875, + "step": 3250 + }, + { + "epoch": 0.6398429833169774, + "grad_norm": 316.18236023568033, + "learning_rate": 1.7290868295062983e-07, + "logits/chosen": 1.6850658655166626, + "logits/rejected": 2.023160219192505, + "logps/chosen": -281.6526794433594, + "logps/rejected": -408.58892822265625, + "loss": -0.6055, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": 5.73920202255249, + "rewards/margins": 129.40170288085938, + "rewards/rejected": -123.6624984741211, + "step": 3260 + }, + { + "epoch": 0.6418056918547596, + "grad_norm": 538.2140517846747, + "learning_rate": 1.7128100881504492e-07, + "logits/chosen": 1.0986950397491455, + "logits/rejected": 2.613492727279663, + "logps/chosen": -297.9154357910156, + "logps/rejected": -331.2212829589844, + "loss": -0.569, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -11.341866493225098, + "rewards/margins": 102.48435974121094, + "rewards/rejected": -113.82623291015625, + "step": 3270 + }, + { + "epoch": 0.6437684003925417, + "grad_norm": 296.57329753749735, + "learning_rate": 1.6965703039797808e-07, + "logits/chosen": 0.8851855397224426, + "logits/rejected": 3.257719039916992, + "logps/chosen": -333.4023742675781, + "logps/rejected": -404.5556335449219, + "loss": -0.7286, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.969829559326172, + "rewards/margins": 159.63040161132812, + "rewards/rejected": -177.60020446777344, + "step": 3280 + }, + { + "epoch": 0.6457311089303238, + "grad_norm": 345.10178972612823, + "learning_rate": 1.6803682394236656e-07, + "logits/chosen": 0.7804575562477112, + "logits/rejected": 2.5302646160125732, + "logps/chosen": -335.7467041015625, + "logps/rejected": -391.03607177734375, + "loss": -0.4962, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -3.409665584564209, + "rewards/margins": 118.77891540527344, + "rewards/rejected": -122.18858337402344, + "step": 3290 + }, + { + "epoch": 0.647693817468106, + "grad_norm": 234.51865003515888, + "learning_rate": 1.664204655140607e-07, + "logits/chosen": 1.4312318563461304, + "logits/rejected": 1.6907854080200195, + "logps/chosen": -250.748291015625, + "logps/rejected": -392.40423583984375, + "loss": -0.6363, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -13.458979606628418, + "rewards/margins": 100.32145690917969, + "rewards/rejected": -113.78043365478516, + "step": 3300 + }, + { + "epoch": 0.6496565260058881, + "grad_norm": 281.70637757158755, + "learning_rate": 1.6480803099825277e-07, + "logits/chosen": 1.3997304439544678, + "logits/rejected": 2.446746349334717, + "logps/chosen": -266.99725341796875, + "logps/rejected": -365.4762878417969, + "loss": -0.6901, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -5.368094444274902, + "rewards/margins": 138.910888671875, + "rewards/rejected": -144.27896118164062, + "step": 3310 + }, + { + "epoch": 0.6516192345436702, + "grad_norm": 400.467149919209, + "learning_rate": 1.6319959609591412e-07, + "logits/chosen": 2.444638252258301, + "logits/rejected": 3.3924450874328613, + "logps/chosen": -283.1401062011719, + "logps/rejected": -289.62225341796875, + "loss": -0.5487, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -40.56492233276367, + "rewards/margins": 53.07103729248047, + "rewards/rejected": -93.63597106933594, + "step": 3320 + }, + { + "epoch": 0.6535819430814525, + "grad_norm": 185.375579267095, + "learning_rate": 1.6159523632024126e-07, + "logits/chosen": 1.1899499893188477, + "logits/rejected": 2.3480947017669678, + "logps/chosen": -344.8422546386719, + "logps/rejected": -453.46661376953125, + "loss": -0.456, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -55.438575744628906, + "rewards/margins": 82.45014953613281, + "rewards/rejected": -137.88873291015625, + "step": 3330 + }, + { + "epoch": 0.6555446516192346, + "grad_norm": 136.46339856465588, + "learning_rate": 1.599950269931107e-07, + "logits/chosen": 2.2227530479431152, + "logits/rejected": 2.7438931465148926, + "logps/chosen": -344.6984558105469, + "logps/rejected": -318.1833190917969, + "loss": -0.4392, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -48.4615364074707, + "rewards/margins": 17.468292236328125, + "rewards/rejected": -65.92981719970703, + "step": 3340 + }, + { + "epoch": 0.6575073601570167, + "grad_norm": 230.0271625529546, + "learning_rate": 1.5839904324154273e-07, + "logits/chosen": 1.5149564743041992, + "logits/rejected": 2.8948044776916504, + "logps/chosen": -296.1535949707031, + "logps/rejected": -403.90057373046875, + "loss": -0.6306, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.129514694213867, + "rewards/margins": 97.59844207763672, + "rewards/rejected": -108.72795104980469, + "step": 3350 + }, + { + "epoch": 0.6594700686947988, + "grad_norm": 241.22053261151103, + "learning_rate": 1.568073599941742e-07, + "logits/chosen": 0.9362170100212097, + "logits/rejected": 1.5310771465301514, + "logps/chosen": -313.9458312988281, + "logps/rejected": -397.6568298339844, + "loss": -0.7312, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -15.276570320129395, + "rewards/margins": 108.81392669677734, + "rewards/rejected": -124.09049987792969, + "step": 3360 + }, + { + "epoch": 0.661432777232581, + "grad_norm": 324.03389412978737, + "learning_rate": 1.552200519777408e-07, + "logits/chosen": 1.2166019678115845, + "logits/rejected": 3.130192279815674, + "logps/chosen": -354.7196044921875, + "logps/rejected": -419.1104431152344, + "loss": -0.5795, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -36.037296295166016, + "rewards/margins": 134.6616973876953, + "rewards/rejected": -170.69900512695312, + "step": 3370 + }, + { + "epoch": 0.6633954857703631, + "grad_norm": 190.25835073505576, + "learning_rate": 1.5363719371356882e-07, + "logits/chosen": 0.7802735567092896, + "logits/rejected": 1.6470670700073242, + "logps/chosen": -347.74395751953125, + "logps/rejected": -345.00970458984375, + "loss": -0.6156, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -6.984062194824219, + "rewards/margins": 110.39164733886719, + "rewards/rejected": -117.3757095336914, + "step": 3380 + }, + { + "epoch": 0.6653581943081452, + "grad_norm": 219.1929528790628, + "learning_rate": 1.5205885951407665e-07, + "logits/chosen": 2.0109965801239014, + "logits/rejected": 2.1420645713806152, + "logps/chosen": -299.0583190917969, + "logps/rejected": -444.8160705566406, + "loss": -0.691, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -45.24928283691406, + "rewards/margins": 96.7963638305664, + "rewards/rejected": -142.045654296875, + "step": 3390 + }, + { + "epoch": 0.6673209028459274, + "grad_norm": 365.378455350356, + "learning_rate": 1.5048512347928564e-07, + "logits/chosen": 1.6866347789764404, + "logits/rejected": 4.299131870269775, + "logps/chosen": -358.8126220703125, + "logps/rejected": -418.63671875, + "loss": -0.5959, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -70.52055358886719, + "rewards/margins": 145.75868225097656, + "rewards/rejected": -216.2792205810547, + "step": 3400 + }, + { + "epoch": 0.6692836113837095, + "grad_norm": 278.7569396914758, + "learning_rate": 1.4891605949334133e-07, + "logits/chosen": 0.7028568983078003, + "logits/rejected": 3.0758461952209473, + "logps/chosen": -528.3422241210938, + "logps/rejected": -545.26953125, + "loss": -0.4262, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -55.5322265625, + "rewards/margins": 110.14747619628906, + "rewards/rejected": -165.67970275878906, + "step": 3410 + }, + { + "epoch": 0.6712463199214916, + "grad_norm": 397.2812387711312, + "learning_rate": 1.4735174122104476e-07, + "logits/chosen": 2.603889226913452, + "logits/rejected": 3.3954358100891113, + "logps/chosen": -256.2731018066406, + "logps/rejected": -348.58001708984375, + "loss": -0.4006, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -16.873428344726562, + "rewards/margins": 132.78973388671875, + "rewards/rejected": -149.66317749023438, + "step": 3420 + }, + { + "epoch": 0.6732090284592738, + "grad_norm": 304.2857262870805, + "learning_rate": 1.457922421043943e-07, + "logits/chosen": 0.4312248229980469, + "logits/rejected": 2.1262800693511963, + "logps/chosen": -357.89703369140625, + "logps/rejected": -317.8974914550781, + "loss": -0.4859, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -37.8548698425293, + "rewards/margins": 74.66275787353516, + "rewards/rejected": -112.51763916015625, + "step": 3430 + }, + { + "epoch": 0.6751717369970559, + "grad_norm": 387.62014373355345, + "learning_rate": 1.4423763535913704e-07, + "logits/chosen": 0.6034333109855652, + "logits/rejected": 0.8363513946533203, + "logps/chosen": -275.8504333496094, + "logps/rejected": -388.68939208984375, + "loss": -0.5767, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -8.406243324279785, + "rewards/margins": 112.24727630615234, + "rewards/rejected": -120.65352630615234, + "step": 3440 + }, + { + "epoch": 0.677134445534838, + "grad_norm": 251.17708827453578, + "learning_rate": 1.426879939713322e-07, + "logits/chosen": 1.0980184078216553, + "logits/rejected": 1.070042371749878, + "logps/chosen": -339.46783447265625, + "logps/rejected": -394.92974853515625, + "loss": -0.4703, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -35.621665954589844, + "rewards/margins": 123.67140197753906, + "rewards/rejected": -159.29306030273438, + "step": 3450 + }, + { + "epoch": 0.6790971540726202, + "grad_norm": 265.8926787931652, + "learning_rate": 1.4114339069392374e-07, + "logits/chosen": 1.2519810199737549, + "logits/rejected": 2.74015736579895, + "logps/chosen": -306.773681640625, + "logps/rejected": -342.7041015625, + "loss": -0.6756, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -10.08450698852539, + "rewards/margins": 120.99635314941406, + "rewards/rejected": -131.0808563232422, + "step": 3460 + }, + { + "epoch": 0.6810598626104023, + "grad_norm": 142.08053216952268, + "learning_rate": 1.3960389804332556e-07, + "logits/chosen": 1.8515313863754272, + "logits/rejected": 2.3379006385803223, + "logps/chosen": -269.44189453125, + "logps/rejected": -415.3699645996094, + "loss": -0.4601, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -12.299274444580078, + "rewards/margins": 111.2161636352539, + "rewards/rejected": -123.51544189453125, + "step": 3470 + }, + { + "epoch": 0.6830225711481845, + "grad_norm": 297.68343547141916, + "learning_rate": 1.380695882960165e-07, + "logits/chosen": 2.3755345344543457, + "logits/rejected": 3.324944257736206, + "logps/chosen": -303.84210205078125, + "logps/rejected": -409.0731201171875, + "loss": -0.6388, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -30.8568058013916, + "rewards/margins": 151.22311401367188, + "rewards/rejected": -182.0799102783203, + "step": 3480 + }, + { + "epoch": 0.6849852796859667, + "grad_norm": 279.77973877221467, + "learning_rate": 1.3654053348514702e-07, + "logits/chosen": 2.4140167236328125, + "logits/rejected": 2.442918300628662, + "logps/chosen": -175.77764892578125, + "logps/rejected": -345.3222961425781, + "loss": -0.7239, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -11.252742767333984, + "rewards/margins": 132.3984832763672, + "rewards/rejected": -143.6512451171875, + "step": 3490 + }, + { + "epoch": 0.6869479882237488, + "grad_norm": 460.300392286645, + "learning_rate": 1.350168053971577e-07, + "logits/chosen": 2.53047776222229, + "logits/rejected": 3.1190500259399414, + "logps/chosen": -395.7915954589844, + "logps/rejected": -388.1199645996094, + "loss": -0.3305, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -33.28361511230469, + "rewards/margins": 115.11958312988281, + "rewards/rejected": -148.4031982421875, + "step": 3500 + }, + { + "epoch": 0.6889106967615309, + "grad_norm": 413.92216592989024, + "learning_rate": 1.3349847556840876e-07, + "logits/chosen": 2.0696728229522705, + "logits/rejected": 2.69974946975708, + "logps/chosen": -287.5434875488281, + "logps/rejected": -428.76129150390625, + "loss": -0.5497, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -42.18700408935547, + "rewards/margins": 119.90152740478516, + "rewards/rejected": -162.08853149414062, + "step": 3510 + }, + { + "epoch": 0.6908734052993131, + "grad_norm": 384.7457818102189, + "learning_rate": 1.3198561528182182e-07, + "logits/chosen": 1.647242546081543, + "logits/rejected": 1.7746429443359375, + "logps/chosen": -264.11566162109375, + "logps/rejected": -381.8612060546875, + "loss": -0.3659, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -65.06902313232422, + "rewards/margins": 93.1152114868164, + "rewards/rejected": -158.18423461914062, + "step": 3520 + }, + { + "epoch": 0.6928361138370952, + "grad_norm": 246.02225552085997, + "learning_rate": 1.3047829556353263e-07, + "logits/chosen": 1.6259530782699585, + "logits/rejected": 2.0524356365203857, + "logps/chosen": -267.45831298828125, + "logps/rejected": -353.01702880859375, + "loss": -0.4369, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.631207466125488, + "rewards/margins": 98.35829162597656, + "rewards/rejected": -112.989501953125, + "step": 3530 + }, + { + "epoch": 0.6947988223748773, + "grad_norm": 169.49248459736611, + "learning_rate": 1.2897658717955742e-07, + "logits/chosen": 2.2385401725769043, + "logits/rejected": 2.4062373638153076, + "logps/chosen": -300.55035400390625, + "logps/rejected": -373.66290283203125, + "loss": -0.7203, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -38.223567962646484, + "rewards/margins": 121.9072494506836, + "rewards/rejected": -160.13082885742188, + "step": 3540 + }, + { + "epoch": 0.6967615309126595, + "grad_norm": 305.6211458361156, + "learning_rate": 1.2748056063246994e-07, + "logits/chosen": 1.5881072282791138, + "logits/rejected": 1.7754720449447632, + "logps/chosen": -348.344482421875, + "logps/rejected": -392.8487243652344, + "loss": -0.4053, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -46.63700866699219, + "rewards/margins": 56.822364807128906, + "rewards/rejected": -103.45936584472656, + "step": 3550 + }, + { + "epoch": 0.6987242394504416, + "grad_norm": 244.3085454314864, + "learning_rate": 1.2599028615809183e-07, + "logits/chosen": 1.9024779796600342, + "logits/rejected": 1.784313440322876, + "logps/chosen": -313.4281311035156, + "logps/rejected": -343.64007568359375, + "loss": -0.3804, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -16.648738861083984, + "rewards/margins": 88.41041564941406, + "rewards/rejected": -105.05915832519531, + "step": 3560 + }, + { + "epoch": 0.7006869479882237, + "grad_norm": 258.8389666758088, + "learning_rate": 1.2450583372219458e-07, + "logits/chosen": 2.2555994987487793, + "logits/rejected": 1.8444554805755615, + "logps/chosen": -301.27239990234375, + "logps/rejected": -376.5409851074219, + "loss": -0.581, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": 1.242764949798584, + "rewards/margins": 82.57177734375, + "rewards/rejected": -81.32901763916016, + "step": 3570 + }, + { + "epoch": 0.7026496565260059, + "grad_norm": 259.8600830505897, + "learning_rate": 1.230272730172157e-07, + "logits/chosen": 1.8105671405792236, + "logits/rejected": 2.0636348724365234, + "logps/chosen": -310.08135986328125, + "logps/rejected": -418.76678466796875, + "loss": -0.405, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -13.664457321166992, + "rewards/margins": 97.44984436035156, + "rewards/rejected": -111.11429595947266, + "step": 3580 + }, + { + "epoch": 0.704612365063788, + "grad_norm": 147.04742095509496, + "learning_rate": 1.2155467345898602e-07, + "logits/chosen": 0.6524871587753296, + "logits/rejected": 1.2266894578933716, + "logps/chosen": -302.1370849609375, + "logps/rejected": -405.2377014160156, + "loss": -0.3501, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -34.67855453491211, + "rewards/margins": 57.04478073120117, + "rewards/rejected": -91.72332763671875, + "step": 3590 + }, + { + "epoch": 0.7065750736015701, + "grad_norm": 310.2071885494148, + "learning_rate": 1.2008810418347093e-07, + "logits/chosen": 1.3502354621887207, + "logits/rejected": 1.7374212741851807, + "logps/chosen": -214.0709228515625, + "logps/rejected": -325.71087646484375, + "loss": -0.5805, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -13.01269817352295, + "rewards/margins": 100.90092468261719, + "rewards/rejected": -113.91361999511719, + "step": 3600 + }, + { + "epoch": 0.7085377821393523, + "grad_norm": 264.81278013923355, + "learning_rate": 1.1862763404352483e-07, + "logits/chosen": 0.3365991413593292, + "logits/rejected": 1.8003944158554077, + "logps/chosen": -346.4209289550781, + "logps/rejected": -401.68988037109375, + "loss": -0.6485, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -32.3430061340332, + "rewards/margins": 90.04476165771484, + "rewards/rejected": -122.38777160644531, + "step": 3610 + }, + { + "epoch": 0.7105004906771345, + "grad_norm": 272.57531690224744, + "learning_rate": 1.1717333160565807e-07, + "logits/chosen": 0.9894776344299316, + "logits/rejected": 2.2653136253356934, + "logps/chosen": -389.22003173828125, + "logps/rejected": -423.453369140625, + "loss": -0.5542, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.51241111755371, + "rewards/margins": 109.59358215332031, + "rewards/rejected": -141.10598754882812, + "step": 3620 + }, + { + "epoch": 0.7124631992149166, + "grad_norm": 309.02009455768996, + "learning_rate": 1.1572526514681874e-07, + "logits/chosen": 1.7682292461395264, + "logits/rejected": 1.765679955482483, + "logps/chosen": -324.6111755371094, + "logps/rejected": -452.42608642578125, + "loss": -0.4345, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -26.59149169921875, + "rewards/margins": 97.76274871826172, + "rewards/rejected": -124.3542251586914, + "step": 3630 + }, + { + "epoch": 0.7144259077526988, + "grad_norm": 258.0120855386704, + "learning_rate": 1.1428350265118613e-07, + "logits/chosen": 0.9690055847167969, + "logits/rejected": 2.254654884338379, + "logps/chosen": -352.84295654296875, + "logps/rejected": -410.548583984375, + "loss": -0.4062, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -35.663734436035156, + "rewards/margins": 97.86629486083984, + "rewards/rejected": -133.530029296875, + "step": 3640 + }, + { + "epoch": 0.7163886162904809, + "grad_norm": 163.1368480023823, + "learning_rate": 1.128481118069799e-07, + "logits/chosen": 0.5297033190727234, + "logits/rejected": 1.7237342596054077, + "logps/chosen": -305.6944274902344, + "logps/rejected": -466.70855712890625, + "loss": -0.6406, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -34.66438674926758, + "rewards/margins": 156.79010009765625, + "rewards/rejected": -191.4544677734375, + "step": 3650 + }, + { + "epoch": 0.718351324828263, + "grad_norm": 209.7828282956596, + "learning_rate": 1.114191600032815e-07, + "logits/chosen": 0.5942263603210449, + "logits/rejected": 1.5835633277893066, + "logps/chosen": -353.80645751953125, + "logps/rejected": -406.839111328125, + "loss": -0.566, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -42.19194030761719, + "rewards/margins": 91.42488861083984, + "rewards/rejected": -133.61683654785156, + "step": 3660 + }, + { + "epoch": 0.7203140333660452, + "grad_norm": 165.7360403115477, + "learning_rate": 1.0999671432687099e-07, + "logits/chosen": 0.8667260408401489, + "logits/rejected": 2.171340227127075, + "logps/chosen": -323.3560485839844, + "logps/rejected": -361.23687744140625, + "loss": -0.5047, + "rewards/accuracies": 0.73333340883255, + "rewards/chosen": -24.4227237701416, + "rewards/margins": 100.17520141601562, + "rewards/rejected": -124.59791564941406, + "step": 3670 + }, + { + "epoch": 0.7222767419038273, + "grad_norm": 90.46623641084982, + "learning_rate": 1.085808415590772e-07, + "logits/chosen": -0.16272859275341034, + "logits/rejected": 0.37558555603027344, + "logps/chosen": -296.73712158203125, + "logps/rejected": -376.70184326171875, + "loss": -0.479, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -10.97760009765625, + "rewards/margins": 117.96441650390625, + "rewards/rejected": -128.9420166015625, + "step": 3680 + }, + { + "epoch": 0.7242394504416094, + "grad_norm": 198.77277060510434, + "learning_rate": 1.0717160817264217e-07, + "logits/chosen": 0.6684038043022156, + "logits/rejected": 2.0979228019714355, + "logps/chosen": -320.0564880371094, + "logps/rejected": -464.06793212890625, + "loss": -0.7192, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -47.57789611816406, + "rewards/margins": 147.88043212890625, + "rewards/rejected": -195.4583282470703, + "step": 3690 + }, + { + "epoch": 0.7262021589793916, + "grad_norm": 559.4067605726672, + "learning_rate": 1.0576908032860088e-07, + "logits/chosen": 1.2240560054779053, + "logits/rejected": 2.251919984817505, + "logps/chosen": -290.126220703125, + "logps/rejected": -351.03192138671875, + "loss": -0.5242, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -33.61920928955078, + "rewards/margins": 110.39930725097656, + "rewards/rejected": -144.0185089111328, + "step": 3700 + }, + { + "epoch": 0.7281648675171737, + "grad_norm": 225.36143964638654, + "learning_rate": 1.0437332387317474e-07, + "logits/chosen": 0.10267148166894913, + "logits/rejected": 1.5426862239837646, + "logps/chosen": -275.28887939453125, + "logps/rejected": -324.0962219238281, + "loss": -0.3556, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -29.85512924194336, + "rewards/margins": 82.32190704345703, + "rewards/rejected": -112.17704772949219, + "step": 3710 + }, + { + "epoch": 0.7301275760549558, + "grad_norm": 381.0312986812787, + "learning_rate": 1.0298440433468048e-07, + "logits/chosen": -0.1327010691165924, + "logits/rejected": 1.3545982837677002, + "logps/chosen": -350.61724853515625, + "logps/rejected": -377.9830017089844, + "loss": -0.5429, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -13.095626831054688, + "rewards/margins": 112.74749755859375, + "rewards/rejected": -125.84312438964844, + "step": 3720 + }, + { + "epoch": 0.732090284592738, + "grad_norm": 312.7283497697748, + "learning_rate": 1.0160238692045331e-07, + "logits/chosen": -0.3035415709018707, + "logits/rejected": 1.0891860723495483, + "logps/chosen": -291.15777587890625, + "logps/rejected": -338.7342224121094, + "loss": -0.4859, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -39.76912307739258, + "rewards/margins": 90.14669799804688, + "rewards/rejected": -129.9158172607422, + "step": 3730 + }, + { + "epoch": 0.7340529931305201, + "grad_norm": 323.57379442181485, + "learning_rate": 1.0022733651378606e-07, + "logits/chosen": -0.02376272715628147, + "logits/rejected": 1.6784549951553345, + "logps/chosen": -408.494140625, + "logps/rejected": -411.84375, + "loss": -0.3745, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -35.54701232910156, + "rewards/margins": 106.29045104980469, + "rewards/rejected": -141.83746337890625, + "step": 3740 + }, + { + "epoch": 0.7360157016683022, + "grad_norm": 203.72492080539834, + "learning_rate": 9.88593176708827e-08, + "logits/chosen": 0.7709684371948242, + "logits/rejected": 0.7810913920402527, + "logps/chosen": -302.86419677734375, + "logps/rejected": -366.42962646484375, + "loss": -0.4955, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -27.58626937866211, + "rewards/margins": 53.094276428222656, + "rewards/rejected": -80.68054962158203, + "step": 3750 + }, + { + "epoch": 0.7379784102060843, + "grad_norm": 303.43224661005456, + "learning_rate": 9.749839461782769e-08, + "logits/chosen": 0.011441946029663086, + "logits/rejected": -0.2894682288169861, + "logps/chosen": -271.73199462890625, + "logps/rejected": -441.5648498535156, + "loss": -0.6756, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -22.304996490478516, + "rewards/margins": 126.96807861328125, + "rewards/rejected": -149.27305603027344, + "step": 3760 + }, + { + "epoch": 0.7399411187438666, + "grad_norm": 481.47184929772465, + "learning_rate": 9.614463124757041e-08, + "logits/chosen": 0.39508628845214844, + "logits/rejected": 0.5891133546829224, + "logps/chosen": -250.55795288085938, + "logps/rejected": -323.19146728515625, + "loss": -0.3047, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -14.409457206726074, + "rewards/margins": 75.74601745605469, + "rewards/rejected": -90.15547943115234, + "step": 3770 + }, + { + "epoch": 0.7419038272816487, + "grad_norm": 222.203049531488, + "learning_rate": 9.479809111692586e-08, + "logits/chosen": 0.41384345293045044, + "logits/rejected": -0.08077137172222137, + "logps/chosen": -291.7434997558594, + "logps/rejected": -386.6715393066406, + "loss": -0.5309, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": -56.585548400878906, + "rewards/margins": 48.57170104980469, + "rewards/rejected": -105.1572494506836, + "step": 3780 + }, + { + "epoch": 0.7438665358194309, + "grad_norm": 358.06095742681515, + "learning_rate": 9.345883744359065e-08, + "logits/chosen": 0.6432726979255676, + "logits/rejected": 0.2744694650173187, + "logps/chosen": -305.78045654296875, + "logps/rejected": -428.5538635253906, + "loss": -0.5304, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -35.73126983642578, + "rewards/margins": 56.70947265625, + "rewards/rejected": -92.44075012207031, + "step": 3790 + }, + { + "epoch": 0.745829244357213, + "grad_norm": 257.0185079736378, + "learning_rate": 9.212693310317479e-08, + "logits/chosen": 0.4275744557380676, + "logits/rejected": 0.7536159157752991, + "logps/chosen": -286.36309814453125, + "logps/rejected": -352.2890625, + "loss": -0.3569, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -32.664642333984375, + "rewards/margins": 77.56367492675781, + "rewards/rejected": -110.22830963134766, + "step": 3800 + }, + { + "epoch": 0.7477919528949951, + "grad_norm": 288.5250092689165, + "learning_rate": 9.08024406262503e-08, + "logits/chosen": -0.06111738830804825, + "logits/rejected": 0.008484485559165478, + "logps/chosen": -259.80963134765625, + "logps/rejected": -386.0383605957031, + "loss": -0.6137, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -14.924217224121094, + "rewards/margins": 121.2860336303711, + "rewards/rejected": -136.21023559570312, + "step": 3810 + }, + { + "epoch": 0.7497546614327772, + "grad_norm": 97.67762932716003, + "learning_rate": 8.94854221954148e-08, + "logits/chosen": 0.49058040976524353, + "logits/rejected": 0.7544366121292114, + "logps/chosen": -242.14682006835938, + "logps/rejected": -329.55914306640625, + "loss": -0.5801, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -8.71400260925293, + "rewards/margins": 116.04339599609375, + "rewards/rejected": -124.75740051269531, + "step": 3820 + }, + { + "epoch": 0.7517173699705594, + "grad_norm": 362.06192532755273, + "learning_rate": 8.817593964237316e-08, + "logits/chosen": 0.23223204910755157, + "logits/rejected": 1.0880941152572632, + "logps/chosen": -288.9664001464844, + "logps/rejected": -371.0241394042969, + "loss": -0.7323, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -9.658148765563965, + "rewards/margins": 117.62623596191406, + "rewards/rejected": -127.28438568115234, + "step": 3830 + }, + { + "epoch": 0.7536800785083415, + "grad_norm": 222.45991290146569, + "learning_rate": 8.68740544450334e-08, + "logits/chosen": 0.6538372039794922, + "logits/rejected": 2.5161030292510986, + "logps/chosen": -366.0320739746094, + "logps/rejected": -405.2726745605469, + "loss": -0.5504, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -5.696615695953369, + "rewards/margins": 131.22412109375, + "rewards/rejected": -136.92074584960938, + "step": 3840 + }, + { + "epoch": 0.7556427870461236, + "grad_norm": 717.138465667547, + "learning_rate": 8.557982772462138e-08, + "logits/chosen": 1.8596255779266357, + "logits/rejected": 2.209252119064331, + "logps/chosen": -282.5457458496094, + "logps/rejected": -413.7879333496094, + "loss": -0.8445, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -19.658872604370117, + "rewards/margins": 142.85853576660156, + "rewards/rejected": -162.51742553710938, + "step": 3850 + }, + { + "epoch": 0.7576054955839058, + "grad_norm": 142.2537163753503, + "learning_rate": 8.429332024281088e-08, + "logits/chosen": 1.1737343072891235, + "logits/rejected": 2.674985885620117, + "logps/chosen": -313.0780944824219, + "logps/rejected": -380.13140869140625, + "loss": -0.3671, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -27.172000885009766, + "rewards/margins": 136.07864379882812, + "rewards/rejected": -163.25064086914062, + "step": 3860 + }, + { + "epoch": 0.7595682041216879, + "grad_norm": 194.64636178959483, + "learning_rate": 8.301459239887073e-08, + "logits/chosen": 1.0153484344482422, + "logits/rejected": 3.1004929542541504, + "logps/chosen": -365.90277099609375, + "logps/rejected": -405.7188415527344, + "loss": -0.716, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -26.039794921875, + "rewards/margins": 113.65787506103516, + "rewards/rejected": -139.69766235351562, + "step": 3870 + }, + { + "epoch": 0.76153091265947, + "grad_norm": 292.799253015931, + "learning_rate": 8.17437042268298e-08, + "logits/chosen": 0.9066111445426941, + "logits/rejected": 1.4293513298034668, + "logps/chosen": -349.82415771484375, + "logps/rejected": -410.9677734375, + "loss": -0.4059, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": -61.107322692871094, + "rewards/margins": 43.853843688964844, + "rewards/rejected": -104.96116638183594, + "step": 3880 + }, + { + "epoch": 0.7634936211972522, + "grad_norm": 450.23071256513026, + "learning_rate": 8.048071539265761e-08, + "logits/chosen": 0.8080563545227051, + "logits/rejected": 2.6450655460357666, + "logps/chosen": -344.237548828125, + "logps/rejected": -363.1737365722656, + "loss": -0.523, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -29.7846622467041, + "rewards/margins": 102.0052490234375, + "rewards/rejected": -131.78990173339844, + "step": 3890 + }, + { + "epoch": 0.7654563297350343, + "grad_norm": 442.8918649708774, + "learning_rate": 7.922568519146425e-08, + "logits/chosen": 1.209207534790039, + "logits/rejected": 1.6337013244628906, + "logps/chosen": -236.75015258789062, + "logps/rejected": -388.8421630859375, + "loss": -0.6117, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -9.993242263793945, + "rewards/margins": 173.70559692382812, + "rewards/rejected": -183.69882202148438, + "step": 3900 + }, + { + "epoch": 0.7674190382728164, + "grad_norm": 238.3320698428565, + "learning_rate": 7.79786725447154e-08, + "logits/chosen": 1.3150415420532227, + "logits/rejected": 1.7168185710906982, + "logps/chosen": -289.5064697265625, + "logps/rejected": -378.62432861328125, + "loss": -0.7813, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -29.609729766845703, + "rewards/margins": 133.35765075683594, + "rewards/rejected": -162.96737670898438, + "step": 3910 + }, + { + "epoch": 0.7693817468105987, + "grad_norm": 341.2953124746272, + "learning_rate": 7.6739735997467e-08, + "logits/chosen": 0.30328941345214844, + "logits/rejected": 0.7972557544708252, + "logps/chosen": -305.1976623535156, + "logps/rejected": -374.2314147949219, + "loss": -0.6348, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -7.398439884185791, + "rewards/margins": 106.65047454833984, + "rewards/rejected": -114.04891204833984, + "step": 3920 + }, + { + "epoch": 0.7713444553483808, + "grad_norm": 130.20377953007142, + "learning_rate": 7.550893371561593e-08, + "logits/chosen": 1.5633090734481812, + "logits/rejected": 1.7413053512573242, + "logps/chosen": -265.47125244140625, + "logps/rejected": -367.0103454589844, + "loss": -0.6864, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 2.874222993850708, + "rewards/margins": 125.1466064453125, + "rewards/rejected": -122.27238464355469, + "step": 3930 + }, + { + "epoch": 0.7733071638861629, + "grad_norm": 177.53461331251899, + "learning_rate": 7.428632348317004e-08, + "logits/chosen": 0.896818995475769, + "logits/rejected": 1.8238928318023682, + "logps/chosen": -291.6225280761719, + "logps/rejected": -379.63116455078125, + "loss": -0.6347, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -43.21448516845703, + "rewards/margins": 77.93448638916016, + "rewards/rejected": -121.14897155761719, + "step": 3940 + }, + { + "epoch": 0.7752698724239451, + "grad_norm": 355.2490033380478, + "learning_rate": 7.307196269953444e-08, + "logits/chosen": 0.5623804330825806, + "logits/rejected": 1.8873199224472046, + "logps/chosen": -299.43780517578125, + "logps/rejected": -382.9126281738281, + "loss": -0.5757, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -16.341676712036133, + "rewards/margins": 103.789306640625, + "rewards/rejected": -120.1309814453125, + "step": 3950 + }, + { + "epoch": 0.7772325809617272, + "grad_norm": 333.02791884820846, + "learning_rate": 7.186590837681732e-08, + "logits/chosen": 1.4319672584533691, + "logits/rejected": 2.663729429244995, + "logps/chosen": -284.7814636230469, + "logps/rejected": -335.676513671875, + "loss": -0.6028, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.942358016967773, + "rewards/margins": 106.50833892822266, + "rewards/rejected": -128.45069885253906, + "step": 3960 + }, + { + "epoch": 0.7791952894995093, + "grad_norm": 200.70420257471542, + "learning_rate": 7.066821713715293e-08, + "logits/chosen": 0.5648600459098816, + "logits/rejected": 2.5612308979034424, + "logps/chosen": -322.00958251953125, + "logps/rejected": -450.244873046875, + "loss": -0.8002, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.121159553527832, + "rewards/margins": 157.29432678222656, + "rewards/rejected": -164.41549682617188, + "step": 3970 + }, + { + "epoch": 0.7811579980372915, + "grad_norm": 438.7939154886828, + "learning_rate": 6.947894521004357e-08, + "logits/chosen": 1.2530823945999146, + "logits/rejected": 1.3784297704696655, + "logps/chosen": -309.9731140136719, + "logps/rejected": -383.11029052734375, + "loss": -0.4638, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -27.896488189697266, + "rewards/margins": 63.006988525390625, + "rewards/rejected": -90.90348815917969, + "step": 3980 + }, + { + "epoch": 0.7831207065750736, + "grad_norm": 755.0624920692909, + "learning_rate": 6.829814842971965e-08, + "logits/chosen": 1.7242523431777954, + "logits/rejected": 1.5838003158569336, + "logps/chosen": -265.7640686035156, + "logps/rejected": -372.4088439941406, + "loss": -0.4205, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -39.63862609863281, + "rewards/margins": 85.05244445800781, + "rewards/rejected": -124.6910629272461, + "step": 3990 + }, + { + "epoch": 0.7850834151128557, + "grad_norm": 397.78536604269027, + "learning_rate": 6.712588223251809e-08, + "logits/chosen": 0.15322282910346985, + "logits/rejected": 1.4203684329986572, + "logps/chosen": -383.421142578125, + "logps/rejected": -383.93560791015625, + "loss": -0.3545, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -42.78777313232422, + "rewards/margins": 61.30333709716797, + "rewards/rejected": -104.09112548828125, + "step": 4000 + }, + { + "epoch": 0.7870461236506379, + "grad_norm": 162.3487897649674, + "learning_rate": 6.596220165428002e-08, + "logits/chosen": 0.6272414922714233, + "logits/rejected": 1.2616965770721436, + "logps/chosen": -265.5509338378906, + "logps/rejected": -371.5789794921875, + "loss": -0.5927, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.53268051147461, + "rewards/margins": 106.93350982666016, + "rewards/rejected": -123.46620178222656, + "step": 4010 + }, + { + "epoch": 0.78900883218842, + "grad_norm": 230.88984827692337, + "learning_rate": 6.48071613277669e-08, + "logits/chosen": 1.1987674236297607, + "logits/rejected": 1.6930396556854248, + "logps/chosen": -274.56378173828125, + "logps/rejected": -356.0934753417969, + "loss": -0.44, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -34.567588806152344, + "rewards/margins": 53.38029861450195, + "rewards/rejected": -87.9478988647461, + "step": 4020 + }, + { + "epoch": 0.7909715407262021, + "grad_norm": 187.75679830790082, + "learning_rate": 6.366081548009553e-08, + "logits/chosen": 1.311037302017212, + "logits/rejected": 2.419053316116333, + "logps/chosen": -312.9007568359375, + "logps/rejected": -362.9205017089844, + "loss": -0.4142, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -45.957069396972656, + "rewards/margins": 55.95958709716797, + "rewards/rejected": -101.91665649414062, + "step": 4030 + }, + { + "epoch": 0.7929342492639843, + "grad_norm": 145.60544370516428, + "learning_rate": 6.252321793019192e-08, + "logits/chosen": 1.6651710271835327, + "logits/rejected": 1.679205298423767, + "logps/chosen": -245.623046875, + "logps/rejected": -347.30169677734375, + "loss": -0.56, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -13.886880874633789, + "rewards/margins": 85.67262268066406, + "rewards/rejected": -99.55952453613281, + "step": 4040 + }, + { + "epoch": 0.7948969578017664, + "grad_norm": 439.07124592225847, + "learning_rate": 6.139442208626517e-08, + "logits/chosen": 1.7780866622924805, + "logits/rejected": 1.6243947744369507, + "logps/chosen": -223.4034423828125, + "logps/rejected": -285.50860595703125, + "loss": -0.428, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -31.53204917907715, + "rewards/margins": 94.08558654785156, + "rewards/rejected": -125.61763000488281, + "step": 4050 + }, + { + "epoch": 0.7968596663395485, + "grad_norm": 327.0604486754587, + "learning_rate": 6.027448094329963e-08, + "logits/chosen": 0.4172574579715729, + "logits/rejected": 0.574047863483429, + "logps/chosen": -236.00741577148438, + "logps/rejected": -358.6146545410156, + "loss": -0.5246, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": 7.08117151260376, + "rewards/margins": 93.22477722167969, + "rewards/rejected": -86.14361572265625, + "step": 4060 + }, + { + "epoch": 0.7988223748773308, + "grad_norm": 490.6291439769595, + "learning_rate": 5.916344708056681e-08, + "logits/chosen": 0.927447497844696, + "logits/rejected": 1.9099938869476318, + "logps/chosen": -286.248779296875, + "logps/rejected": -347.1673278808594, + "loss": -0.561, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -17.070194244384766, + "rewards/margins": 97.71923828125, + "rewards/rejected": -114.7894287109375, + "step": 4070 + }, + { + "epoch": 0.8007850834151129, + "grad_norm": 216.6762728168146, + "learning_rate": 5.8061372659157306e-08, + "logits/chosen": 0.7928574681282043, + "logits/rejected": 2.0397815704345703, + "logps/chosen": -347.1395263671875, + "logps/rejected": -426.16302490234375, + "loss": -0.7094, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": 3.2770323753356934, + "rewards/margins": 100.47317504882812, + "rewards/rejected": -97.1961441040039, + "step": 4080 + }, + { + "epoch": 0.802747791952895, + "grad_norm": 351.3322000805113, + "learning_rate": 5.6968309419531376e-08, + "logits/chosen": 0.5025372505187988, + "logits/rejected": 1.037649393081665, + "logps/chosen": -327.008544921875, + "logps/rejected": -356.27886962890625, + "loss": -0.4612, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -37.835044860839844, + "rewards/margins": 66.51990509033203, + "rewards/rejected": -104.35494232177734, + "step": 4090 + }, + { + "epoch": 0.8047105004906772, + "grad_norm": 505.12633519862123, + "learning_rate": 5.5884308679090525e-08, + "logits/chosen": 0.9266678094863892, + "logits/rejected": 3.7431774139404297, + "logps/chosen": -289.0246276855469, + "logps/rejected": -349.05731201171875, + "loss": -0.6185, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -41.9423828125, + "rewards/margins": 123.00175476074219, + "rewards/rejected": -164.94412231445312, + "step": 4100 + }, + { + "epoch": 0.8066732090284593, + "grad_norm": 218.91844844788974, + "learning_rate": 5.480942132976732e-08, + "logits/chosen": 0.7196097373962402, + "logits/rejected": 2.6659862995147705, + "logps/chosen": -321.7400817871094, + "logps/rejected": -329.4438781738281, + "loss": -0.6874, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1335063874721527, + "rewards/margins": 135.5712890625, + "rewards/rejected": -135.70480346679688, + "step": 4110 + }, + { + "epoch": 0.8086359175662414, + "grad_norm": 275.9408643046396, + "learning_rate": 5.374369783563698e-08, + "logits/chosen": 0.6858819127082825, + "logits/rejected": 1.181862473487854, + "logps/chosen": -319.489013671875, + "logps/rejected": -437.139404296875, + "loss": -0.5752, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -40.33269500732422, + "rewards/margins": 117.47265625, + "rewards/rejected": -157.80535888671875, + "step": 4120 + }, + { + "epoch": 0.8105986261040236, + "grad_norm": 259.9904942446092, + "learning_rate": 5.268718823054752e-08, + "logits/chosen": 0.6993392705917358, + "logits/rejected": 1.6426143646240234, + "logps/chosen": -291.2001647949219, + "logps/rejected": -404.6143493652344, + "loss": -0.5519, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -44.98619079589844, + "rewards/margins": 112.06353759765625, + "rewards/rejected": -157.0497283935547, + "step": 4130 + }, + { + "epoch": 0.8125613346418057, + "grad_norm": 369.8978673367864, + "learning_rate": 5.1639942115771384e-08, + "logits/chosen": 1.37197744846344, + "logits/rejected": 0.9585355520248413, + "logps/chosen": -290.52984619140625, + "logps/rejected": -297.7582702636719, + "loss": -0.3268, + "rewards/accuracies": 0.5, + "rewards/chosen": -57.57798385620117, + "rewards/margins": 22.77359962463379, + "rewards/rejected": -80.35157775878906, + "step": 4140 + }, + { + "epoch": 0.8145240431795878, + "grad_norm": 396.7142798917053, + "learning_rate": 5.060200865767605e-08, + "logits/chosen": 1.0594708919525146, + "logits/rejected": 1.8188024759292603, + "logps/chosen": -407.10394287109375, + "logps/rejected": -454.1275939941406, + "loss": -0.6956, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -33.43864822387695, + "rewards/margins": 117.25416564941406, + "rewards/rejected": -150.6927947998047, + "step": 4150 + }, + { + "epoch": 0.81648675171737, + "grad_norm": 346.76432338866283, + "learning_rate": 4.957343658541632e-08, + "logits/chosen": 1.1963056325912476, + "logits/rejected": 1.8911014795303345, + "logps/chosen": -236.36544799804688, + "logps/rejected": -420.71527099609375, + "loss": -0.698, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -5.097572326660156, + "rewards/margins": 145.96742248535156, + "rewards/rejected": -151.06500244140625, + "step": 4160 + }, + { + "epoch": 0.8184494602551521, + "grad_norm": 609.2657880796065, + "learning_rate": 4.8554274188646215e-08, + "logits/chosen": 0.6010076403617859, + "logits/rejected": 2.1071629524230957, + "logps/chosen": -310.2207946777344, + "logps/rejected": -350.9004821777344, + "loss": -0.3548, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -50.050010681152344, + "rewards/margins": 90.51728820800781, + "rewards/rejected": -140.5673065185547, + "step": 4170 + }, + { + "epoch": 0.8204121687929342, + "grad_norm": 299.5026463466133, + "learning_rate": 4.754456931525208e-08, + "logits/chosen": 1.6258262395858765, + "logits/rejected": 1.4584633111953735, + "logps/chosen": -285.9049987792969, + "logps/rejected": -380.3423767089844, + "loss": -0.6576, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -18.34375762939453, + "rewards/margins": 92.11905670166016, + "rewards/rejected": -110.4627914428711, + "step": 4180 + }, + { + "epoch": 0.8223748773307163, + "grad_norm": 206.57628278491475, + "learning_rate": 4.654436936910622e-08, + "logits/chosen": -0.2231508046388626, + "logits/rejected": 1.7128015756607056, + "logps/chosen": -319.3303527832031, + "logps/rejected": -415.04248046875, + "loss": -0.6426, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -14.945584297180176, + "rewards/margins": 155.222900390625, + "rewards/rejected": -170.16847229003906, + "step": 4190 + }, + { + "epoch": 0.8243375858684985, + "grad_norm": 105.2173841997405, + "learning_rate": 4.555372130784102e-08, + "logits/chosen": 0.2965846061706543, + "logits/rejected": 1.2652969360351562, + "logps/chosen": -401.03765869140625, + "logps/rejected": -413.9496154785156, + "loss": -0.7065, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -20.450271606445312, + "rewards/margins": 93.99417114257812, + "rewards/rejected": -114.4444580078125, + "step": 4200 + }, + { + "epoch": 0.8263002944062807, + "grad_norm": 410.71746386465617, + "learning_rate": 4.45726716406449e-08, + "logits/chosen": 0.3647512197494507, + "logits/rejected": 0.4564463198184967, + "logps/chosen": -344.83673095703125, + "logps/rejected": -329.1011657714844, + "loss": -0.3075, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -35.848228454589844, + "rewards/margins": 34.870399475097656, + "rewards/rejected": -70.7186279296875, + "step": 4210 + }, + { + "epoch": 0.8282630029440629, + "grad_norm": 384.917290181706, + "learning_rate": 4.360126642607842e-08, + "logits/chosen": 0.2658080756664276, + "logits/rejected": 2.3667030334472656, + "logps/chosen": -358.5342102050781, + "logps/rejected": -396.6856384277344, + "loss": -0.6292, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -28.129785537719727, + "rewards/margins": 130.65847778320312, + "rewards/rejected": -158.7882537841797, + "step": 4220 + }, + { + "epoch": 0.830225711481845, + "grad_norm": 295.7881487019572, + "learning_rate": 4.2639551269912034e-08, + "logits/chosen": 0.7166069149971008, + "logits/rejected": 1.645583152770996, + "logps/chosen": -218.4636688232422, + "logps/rejected": -284.74822998046875, + "loss": -0.5446, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -9.48962688446045, + "rewards/margins": 84.1449203491211, + "rewards/rejected": -93.63453674316406, + "step": 4230 + }, + { + "epoch": 0.8321884200196271, + "grad_norm": 351.7452805992152, + "learning_rate": 4.168757132298478e-08, + "logits/chosen": 1.1947740316390991, + "logits/rejected": 1.9185377359390259, + "logps/chosen": -316.4347229003906, + "logps/rejected": -383.91925048828125, + "loss": -0.3942, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -57.603797912597656, + "rewards/margins": 49.7264404296875, + "rewards/rejected": -107.33023834228516, + "step": 4240 + }, + { + "epoch": 0.8341511285574092, + "grad_norm": 814.0917068512831, + "learning_rate": 4.0745371279084976e-08, + "logits/chosen": 0.057543229311704636, + "logits/rejected": 1.2063575983047485, + "logps/chosen": -250.46231079101562, + "logps/rejected": -333.850341796875, + "loss": -0.4427, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1185879707336426, + "rewards/margins": 107.40657043457031, + "rewards/rejected": -109.5251693725586, + "step": 4250 + }, + { + "epoch": 0.8361138370951914, + "grad_norm": 258.54518752910525, + "learning_rate": 3.9812995372851544e-08, + "logits/chosen": 1.233195424079895, + "logits/rejected": 1.9567787647247314, + "logps/chosen": -276.5019836425781, + "logps/rejected": -377.66082763671875, + "loss": -0.579, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -29.312414169311523, + "rewards/margins": 123.75312805175781, + "rewards/rejected": -153.06552124023438, + "step": 4260 + }, + { + "epoch": 0.8380765456329735, + "grad_norm": 239.6401093504306, + "learning_rate": 3.8890487377697265e-08, + "logits/chosen": 0.14301064610481262, + "logits/rejected": 0.6716960668563843, + "logps/chosen": -271.7588195800781, + "logps/rejected": -368.71533203125, + "loss": -0.7104, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.645185470581055, + "rewards/margins": 113.19022369384766, + "rewards/rejected": -130.8354034423828, + "step": 4270 + }, + { + "epoch": 0.8400392541707556, + "grad_norm": 290.5098880742371, + "learning_rate": 3.7977890603754e-08, + "logits/chosen": 0.24163508415222168, + "logits/rejected": 1.7025432586669922, + "logps/chosen": -356.93133544921875, + "logps/rejected": -414.83697509765625, + "loss": -0.4283, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -7.9239091873168945, + "rewards/margins": 93.69638061523438, + "rewards/rejected": -101.62028503417969, + "step": 4280 + }, + { + "epoch": 0.8420019627085378, + "grad_norm": 220.99981121958942, + "learning_rate": 3.707524789583891e-08, + "logits/chosen": 0.3767244815826416, + "logits/rejected": 1.6875295639038086, + "logps/chosen": -340.16790771484375, + "logps/rejected": -454.8905334472656, + "loss": -0.5899, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -26.359344482421875, + "rewards/margins": 119.7392578125, + "rewards/rejected": -146.09860229492188, + "step": 4290 + }, + { + "epoch": 0.8439646712463199, + "grad_norm": 302.29119119914856, + "learning_rate": 3.6182601631443596e-08, + "logits/chosen": 0.2487632930278778, + "logits/rejected": 0.7517064809799194, + "logps/chosen": -321.514892578125, + "logps/rejected": -384.12884521484375, + "loss": -0.7531, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": 1.8332033157348633, + "rewards/margins": 118.40811920166016, + "rewards/rejected": -116.57490539550781, + "step": 4300 + }, + { + "epoch": 0.845927379784102, + "grad_norm": 194.97602036985367, + "learning_rate": 3.529999371874381e-08, + "logits/chosen": 1.1088536977767944, + "logits/rejected": 2.119696617126465, + "logps/chosen": -296.379638671875, + "logps/rejected": -368.9100036621094, + "loss": -0.3116, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -14.844305038452148, + "rewards/margins": 91.52473449707031, + "rewards/rejected": -106.3690414428711, + "step": 4310 + }, + { + "epoch": 0.8478900883218842, + "grad_norm": 394.0520603985482, + "learning_rate": 3.4427465594632555e-08, + "logits/chosen": 0.7034338712692261, + "logits/rejected": 2.448606252670288, + "logps/chosen": -210.2288360595703, + "logps/rejected": -337.64605712890625, + "loss": -0.8014, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -9.511738777160645, + "rewards/margins": 150.11659240722656, + "rewards/rejected": -159.62832641601562, + "step": 4320 + }, + { + "epoch": 0.8498527968596663, + "grad_norm": 150.8570059388215, + "learning_rate": 3.356505822277417e-08, + "logits/chosen": 0.8254325985908508, + "logits/rejected": 1.6962730884552002, + "logps/chosen": -305.8452453613281, + "logps/rejected": -367.0594177246094, + "loss": -0.4724, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -35.75792694091797, + "rewards/margins": 68.71248626708984, + "rewards/rejected": -104.47042083740234, + "step": 4330 + }, + { + "epoch": 0.8518155053974484, + "grad_norm": 353.8048196765354, + "learning_rate": 3.271281209168186e-08, + "logits/chosen": 0.6131759881973267, + "logits/rejected": 2.7262377738952637, + "logps/chosen": -309.7735595703125, + "logps/rejected": -435.9186096191406, + "loss": -0.6477, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -33.432395935058594, + "rewards/margins": 186.08941650390625, + "rewards/rejected": -219.5218048095703, + "step": 4340 + }, + { + "epoch": 0.8537782139352306, + "grad_norm": 383.2116725375334, + "learning_rate": 3.187076721281595e-08, + "logits/chosen": -0.23545953631401062, + "logits/rejected": 0.920207679271698, + "logps/chosen": -248.1889190673828, + "logps/rejected": -330.3977966308594, + "loss": -0.5731, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.5350608825683594, + "rewards/margins": 88.65583038330078, + "rewards/rejected": -91.1908950805664, + "step": 4350 + }, + { + "epoch": 0.8557409224730128, + "grad_norm": 341.9486490830222, + "learning_rate": 3.1038963118706244e-08, + "logits/chosen": 1.2642277479171753, + "logits/rejected": 2.6628224849700928, + "logps/chosen": -305.7078552246094, + "logps/rejected": -417.4297790527344, + "loss": -0.6431, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -28.319299697875977, + "rewards/margins": 156.50613403320312, + "rewards/rejected": -184.82545471191406, + "step": 4360 + }, + { + "epoch": 0.8577036310107949, + "grad_norm": 273.98996550540824, + "learning_rate": 3.0217438861095315e-08, + "logits/chosen": 0.22947652637958527, + "logits/rejected": 0.4285706579685211, + "logps/chosen": -212.7989501953125, + "logps/rejected": -338.49627685546875, + "loss": -0.4624, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -6.231478691101074, + "rewards/margins": 110.1903305053711, + "rewards/rejected": -116.42181396484375, + "step": 4370 + }, + { + "epoch": 0.8596663395485771, + "grad_norm": 185.10845009673469, + "learning_rate": 2.940623300910572e-08, + "logits/chosen": -0.6255972385406494, + "logits/rejected": 3.2109932899475098, + "logps/chosen": -319.77105712890625, + "logps/rejected": -424.9895935058594, + "loss": -0.795, + "rewards/accuracies": 0.9333332777023315, + "rewards/chosen": 0.09875945746898651, + "rewards/margins": 223.38894653320312, + "rewards/rejected": -223.29019165039062, + "step": 4380 + }, + { + "epoch": 0.8616290480863592, + "grad_norm": 323.2968043969208, + "learning_rate": 2.860538364742898e-08, + "logits/chosen": -0.2271687537431717, + "logits/rejected": 1.6766523122787476, + "logps/chosen": -372.1040954589844, + "logps/rejected": -314.25537109375, + "loss": -0.5056, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.2704625129699707, + "rewards/margins": 91.76817321777344, + "rewards/rejected": -94.03864288330078, + "step": 4390 + }, + { + "epoch": 0.8635917566241413, + "grad_norm": 277.72244186597015, + "learning_rate": 2.7814928374537334e-08, + "logits/chosen": 0.20215849578380585, + "logits/rejected": 1.1914180517196655, + "logps/chosen": -246.9439239501953, + "logps/rejected": -311.5885009765625, + "loss": -0.471, + "rewards/accuracies": 0.6333333849906921, + "rewards/chosen": -20.785297393798828, + "rewards/margins": 87.80458068847656, + "rewards/rejected": -108.58988189697266, + "step": 4400 + }, + { + "epoch": 0.8655544651619235, + "grad_norm": 420.6807121605461, + "learning_rate": 2.7034904300918982e-08, + "logits/chosen": 1.1174386739730835, + "logits/rejected": 1.056130051612854, + "logps/chosen": -253.21585083007812, + "logps/rejected": -417.9261169433594, + "loss": -0.5231, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -31.55215072631836, + "rewards/margins": 123.5484619140625, + "rewards/rejected": -155.10061645507812, + "step": 4410 + }, + { + "epoch": 0.8675171736997056, + "grad_norm": 250.21456798870338, + "learning_rate": 2.62653480473356e-08, + "logits/chosen": -0.5521323680877686, + "logits/rejected": 0.5378388166427612, + "logps/chosen": -293.2830505371094, + "logps/rejected": -350.2652893066406, + "loss": -0.4848, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -29.301555633544922, + "rewards/margins": 73.45783996582031, + "rewards/rejected": -102.75938415527344, + "step": 4420 + }, + { + "epoch": 0.8694798822374877, + "grad_norm": 415.18101305656563, + "learning_rate": 2.550629574310309e-08, + "logits/chosen": 1.223892331123352, + "logits/rejected": 1.8861782550811768, + "logps/chosen": -275.0828857421875, + "logps/rejected": -405.38543701171875, + "loss": -0.6553, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -35.61486053466797, + "rewards/margins": 86.40784454345703, + "rewards/rejected": -122.022705078125, + "step": 4430 + }, + { + "epoch": 0.8714425907752699, + "grad_norm": 208.67365775539722, + "learning_rate": 2.475778302439524e-08, + "logits/chosen": -0.16906538605690002, + "logits/rejected": 1.70730721950531, + "logps/chosen": -348.5227355957031, + "logps/rejected": -451.517822265625, + "loss": -0.7052, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -31.86871910095215, + "rewards/margins": 170.1761016845703, + "rewards/rejected": -202.04483032226562, + "step": 4440 + }, + { + "epoch": 0.873405299313052, + "grad_norm": 349.6349361120186, + "learning_rate": 2.4019845032570875e-08, + "logits/chosen": 0.5358771085739136, + "logits/rejected": 1.282859444618225, + "logps/chosen": -285.733154296875, + "logps/rejected": -432.317626953125, + "loss": -0.7335, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -13.590060234069824, + "rewards/margins": 141.32090759277344, + "rewards/rejected": -154.91098022460938, + "step": 4450 + }, + { + "epoch": 0.8753680078508341, + "grad_norm": 283.2495966315424, + "learning_rate": 2.3292516412524054e-08, + "logits/chosen": -0.04940909147262573, + "logits/rejected": 1.5180885791778564, + "logps/chosen": -352.1752014160156, + "logps/rejected": -361.323974609375, + "loss": -0.5562, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -57.4222297668457, + "rewards/margins": 83.98561096191406, + "rewards/rejected": -141.4078369140625, + "step": 4460 + }, + { + "epoch": 0.8773307163886163, + "grad_norm": 590.2024998291921, + "learning_rate": 2.2575831311057225e-08, + "logits/chosen": 0.27069857716560364, + "logits/rejected": 2.1589179039001465, + "logps/chosen": -250.70681762695312, + "logps/rejected": -436.9722595214844, + "loss": -0.4881, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -17.34808921813965, + "rewards/margins": 185.95175170898438, + "rewards/rejected": -203.2998504638672, + "step": 4470 + }, + { + "epoch": 0.8792934249263984, + "grad_norm": 234.56569368498648, + "learning_rate": 2.1869823375278483e-08, + "logits/chosen": 0.6429430842399597, + "logits/rejected": 2.53413987159729, + "logps/chosen": -273.86456298828125, + "logps/rejected": -452.06494140625, + "loss": -0.7035, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -60.50865936279297, + "rewards/margins": 189.64010620117188, + "rewards/rejected": -250.1487579345703, + "step": 4480 + }, + { + "epoch": 0.8812561334641805, + "grad_norm": 133.18651889639486, + "learning_rate": 2.1174525751021578e-08, + "logits/chosen": 1.0968296527862549, + "logits/rejected": 1.6707656383514404, + "logps/chosen": -286.2945556640625, + "logps/rejected": -429.34136962890625, + "loss": -0.6453, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -14.320622444152832, + "rewards/margins": 143.9236602783203, + "rewards/rejected": -158.24429321289062, + "step": 4490 + }, + { + "epoch": 0.8832188420019627, + "grad_norm": 208.06989814235044, + "learning_rate": 2.0489971081290193e-08, + "logits/chosen": 0.28894519805908203, + "logits/rejected": 1.3963991403579712, + "logps/chosen": -314.00762939453125, + "logps/rejected": -371.5, + "loss": -0.5372, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -15.929452896118164, + "rewards/margins": 115.1006088256836, + "rewards/rejected": -131.03005981445312, + "step": 4500 + }, + { + "epoch": 0.8851815505397449, + "grad_norm": 290.0724683505124, + "learning_rate": 1.9816191504724826e-08, + "logits/chosen": -0.5656202435493469, + "logits/rejected": 0.5285595655441284, + "logps/chosen": -236.2646484375, + "logps/rejected": -363.3451843261719, + "loss": -0.7221, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -10.014857292175293, + "rewards/margins": 138.04251098632812, + "rewards/rejected": -148.05735778808594, + "step": 4510 + }, + { + "epoch": 0.887144259077527, + "grad_norm": 256.04984750637175, + "learning_rate": 1.9153218654094498e-08, + "logits/chosen": 0.3918920159339905, + "logits/rejected": 1.6313140392303467, + "logps/chosen": -289.8053894042969, + "logps/rejected": -390.0482177734375, + "loss": -0.6173, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -10.703710556030273, + "rewards/margins": 137.1112060546875, + "rewards/rejected": -147.81491088867188, + "step": 4520 + }, + { + "epoch": 0.8891069676153092, + "grad_norm": 289.09621609689054, + "learning_rate": 1.8501083654811206e-08, + "logits/chosen": 0.5931957364082336, + "logits/rejected": 0.6276336908340454, + "logps/chosen": -338.95697021484375, + "logps/rejected": -414.01190185546875, + "loss": -0.7391, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -42.674705505371094, + "rewards/margins": 110.20075988769531, + "rewards/rejected": -152.87545776367188, + "step": 4530 + }, + { + "epoch": 0.8910696761530913, + "grad_norm": 351.31332864253557, + "learning_rate": 1.7859817123469068e-08, + "logits/chosen": 1.5052107572555542, + "logits/rejected": 1.5364171266555786, + "logps/chosen": -233.75607299804688, + "logps/rejected": -320.6417236328125, + "loss": -0.388, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -32.718772888183594, + "rewards/margins": 58.3486442565918, + "rewards/rejected": -91.06742858886719, + "step": 4540 + }, + { + "epoch": 0.8930323846908734, + "grad_norm": 222.72125554681512, + "learning_rate": 1.7229449166406477e-08, + "logits/chosen": 0.37584689259529114, + "logits/rejected": 1.6475623846054077, + "logps/chosen": -358.82379150390625, + "logps/rejected": -423.95526123046875, + "loss": -0.6306, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -28.88714027404785, + "rewards/margins": 132.3856201171875, + "rewards/rejected": -161.2727508544922, + "step": 4550 + }, + { + "epoch": 0.8949950932286556, + "grad_norm": 269.894549429393, + "learning_rate": 1.66100093782931e-08, + "logits/chosen": 0.5159591436386108, + "logits/rejected": 1.579828143119812, + "logps/chosen": -282.75341796875, + "logps/rejected": -495.56402587890625, + "loss": -0.8634, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -20.95271110534668, + "rewards/margins": 183.48341369628906, + "rewards/rejected": -204.4361114501953, + "step": 4560 + }, + { + "epoch": 0.8969578017664377, + "grad_norm": 252.9639222655981, + "learning_rate": 1.600152684074005e-08, + "logits/chosen": 0.23654143512248993, + "logits/rejected": 1.355732798576355, + "logps/chosen": -366.46575927734375, + "logps/rejected": -434.37322998046875, + "loss": -0.4786, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -43.1453742980957, + "rewards/margins": 74.18260192871094, + "rewards/rejected": -117.3279800415039, + "step": 4570 + }, + { + "epoch": 0.8989205103042198, + "grad_norm": 258.081907888107, + "learning_rate": 1.540403012093483e-08, + "logits/chosen": 0.13490648567676544, + "logits/rejected": 1.4607415199279785, + "logps/chosen": -297.9033203125, + "logps/rejected": -315.2236633300781, + "loss": -0.7211, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": 10.252466201782227, + "rewards/margins": 86.24171447753906, + "rewards/rejected": -75.98924255371094, + "step": 4580 + }, + { + "epoch": 0.900883218842002, + "grad_norm": 322.93829995750394, + "learning_rate": 1.4817547270300185e-08, + "logits/chosen": 0.5269209146499634, + "logits/rejected": 0.8512266874313354, + "logps/chosen": -317.64691162109375, + "logps/rejected": -471.1598205566406, + "loss": -0.3715, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -49.822837829589844, + "rewards/margins": 70.89936828613281, + "rewards/rejected": -120.72221374511719, + "step": 4590 + }, + { + "epoch": 0.9028459273797841, + "grad_norm": 217.8857894740021, + "learning_rate": 1.4242105823176837e-08, + "logits/chosen": 0.020373066887259483, + "logits/rejected": 2.338036298751831, + "logps/chosen": -308.81036376953125, + "logps/rejected": -413.1044921875, + "loss": -0.7181, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": 11.094378471374512, + "rewards/margins": 169.42678833007812, + "rewards/rejected": -158.33242797851562, + "step": 4600 + }, + { + "epoch": 0.9048086359175662, + "grad_norm": 375.1278537276694, + "learning_rate": 1.3677732795531083e-08, + "logits/chosen": 0.8167325854301453, + "logits/rejected": 1.3168516159057617, + "logps/chosen": -307.87786865234375, + "logps/rejected": -451.4591369628906, + "loss": -0.544, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -43.726715087890625, + "rewards/margins": 93.41045379638672, + "rewards/rejected": -137.1371612548828, + "step": 4610 + }, + { + "epoch": 0.9067713444553483, + "grad_norm": 147.87957779599003, + "learning_rate": 1.3124454683686364e-08, + "logits/chosen": 0.45479816198349, + "logits/rejected": 0.7868109941482544, + "logps/chosen": -317.99322509765625, + "logps/rejected": -419.85888671875, + "loss": -0.5282, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -68.1338119506836, + "rewards/margins": 88.10193634033203, + "rewards/rejected": -156.23574829101562, + "step": 4620 + }, + { + "epoch": 0.9087340529931305, + "grad_norm": 170.49463587556912, + "learning_rate": 1.2582297463079288e-08, + "logits/chosen": 0.4195406436920166, + "logits/rejected": 2.507051706314087, + "logps/chosen": -266.1393127441406, + "logps/rejected": -328.5851135253906, + "loss": -0.5295, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -37.0715446472168, + "rewards/margins": 136.57232666015625, + "rewards/rejected": -173.6438751220703, + "step": 4630 + }, + { + "epoch": 0.9106967615309126, + "grad_norm": 124.71832674996668, + "learning_rate": 1.2051286587040049e-08, + "logits/chosen": 0.41768568754196167, + "logits/rejected": 1.3550302982330322, + "logps/chosen": -302.3821716308594, + "logps/rejected": -406.23291015625, + "loss": -0.6192, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -22.018672943115234, + "rewards/margins": 107.51679992675781, + "rewards/rejected": -129.5354766845703, + "step": 4640 + }, + { + "epoch": 0.9126594700686947, + "grad_norm": 174.39805397430672, + "learning_rate": 1.1531446985597604e-08, + "logits/chosen": 0.2835673391819, + "logits/rejected": 1.0853602886199951, + "logps/chosen": -398.7691345214844, + "logps/rejected": -419.5128479003906, + "loss": -0.5427, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -23.566162109375, + "rewards/margins": 109.63841247558594, + "rewards/rejected": -133.20458984375, + "step": 4650 + }, + { + "epoch": 0.914622178606477, + "grad_norm": 185.3400576110855, + "learning_rate": 1.1022803064309194e-08, + "logits/chosen": 0.5096344351768494, + "logits/rejected": 1.0800716876983643, + "logps/chosen": -350.9262390136719, + "logps/rejected": -509.3897399902344, + "loss": -0.6601, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": -33.37424850463867, + "rewards/margins": 95.1714096069336, + "rewards/rejected": -128.54566955566406, + "step": 4660 + }, + { + "epoch": 0.9165848871442591, + "grad_norm": 162.28773941593772, + "learning_rate": 1.0525378703114401e-08, + "logits/chosen": -1.083046793937683, + "logits/rejected": 0.14212393760681152, + "logps/chosen": -238.0006866455078, + "logps/rejected": -280.20257568359375, + "loss": -0.4034, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -27.042510986328125, + "rewards/margins": 63.006874084472656, + "rewards/rejected": -90.04937744140625, + "step": 4670 + }, + { + "epoch": 0.9185475956820413, + "grad_norm": 272.7718579878306, + "learning_rate": 1.0039197255214238e-08, + "logits/chosen": 0.2609715163707733, + "logits/rejected": 0.8051837086677551, + "logps/chosen": -244.71841430664062, + "logps/rejected": -375.2208251953125, + "loss": -0.5975, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -56.447486877441406, + "rewards/margins": 87.3301010131836, + "rewards/rejected": -143.777587890625, + "step": 4680 + }, + { + "epoch": 0.9205103042198234, + "grad_norm": 257.3650997787213, + "learning_rate": 9.564281545974661e-09, + "logits/chosen": 0.007027420215308666, + "logits/rejected": 0.8426044583320618, + "logps/chosen": -259.75238037109375, + "logps/rejected": -389.84857177734375, + "loss": -0.5895, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.872671127319336, + "rewards/margins": 138.10362243652344, + "rewards/rejected": -139.97628784179688, + "step": 4690 + }, + { + "epoch": 0.9224730127576055, + "grad_norm": 371.0909879912845, + "learning_rate": 9.100653871854963e-09, + "logits/chosen": -0.013437772169709206, + "logits/rejected": 0.2783178687095642, + "logps/chosen": -333.48175048828125, + "logps/rejected": -392.1842346191406, + "loss": -0.6072, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -37.490272521972656, + "rewards/margins": 65.58151245117188, + "rewards/rejected": -103.07179260253906, + "step": 4700 + }, + { + "epoch": 0.9244357212953876, + "grad_norm": 281.8196916198271, + "learning_rate": 8.648335999360934e-09, + "logits/chosen": 0.1921078860759735, + "logits/rejected": 1.6628574132919312, + "logps/chosen": -286.959716796875, + "logps/rejected": -335.0383605957031, + "loss": -0.6888, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -25.70328140258789, + "rewards/margins": 99.55555725097656, + "rewards/rejected": -125.25882720947266, + "step": 4710 + }, + { + "epoch": 0.9263984298331698, + "grad_norm": 349.0223170273162, + "learning_rate": 8.207349164023047e-09, + "logits/chosen": 1.5743603706359863, + "logits/rejected": 1.799330711364746, + "logps/chosen": -289.4337463378906, + "logps/rejected": -403.5376892089844, + "loss": -0.4601, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -24.794387817382812, + "rewards/margins": 117.5296630859375, + "rewards/rejected": -142.32403564453125, + "step": 4720 + }, + { + "epoch": 0.9283611383709519, + "grad_norm": 475.055288626442, + "learning_rate": 7.777714069399532e-09, + "logits/chosen": 0.0723201259970665, + "logits/rejected": 1.574939489364624, + "logps/chosen": -309.9425048828125, + "logps/rejected": -379.8987121582031, + "loss": -0.606, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -36.66429138183594, + "rewards/margins": 88.56461334228516, + "rewards/rejected": -125.2289047241211, + "step": 4730 + }, + { + "epoch": 0.930323846908734, + "grad_norm": 221.80323352458163, + "learning_rate": 7.359450886104263e-09, + "logits/chosen": 0.6704687476158142, + "logits/rejected": 2.1186952590942383, + "logps/chosen": -345.60015869140625, + "logps/rejected": -446.39013671875, + "loss": -0.5609, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -47.869842529296875, + "rewards/margins": 155.35009765625, + "rewards/rejected": -203.21994018554688, + "step": 4740 + }, + { + "epoch": 0.9322865554465162, + "grad_norm": 275.5212677879844, + "learning_rate": 6.9525792508597634e-09, + "logits/chosen": -0.15146800875663757, + "logits/rejected": -0.13504230976104736, + "logps/chosen": -305.78619384765625, + "logps/rejected": -354.84466552734375, + "loss": -0.6163, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.82454490661621, + "rewards/margins": 51.58939743041992, + "rewards/rejected": -68.41394805908203, + "step": 4750 + }, + { + "epoch": 0.9342492639842983, + "grad_norm": 503.8696271643102, + "learning_rate": 6.557118265575451e-09, + "logits/chosen": 0.7123690843582153, + "logits/rejected": 1.1601670980453491, + "logps/chosen": -345.5361328125, + "logps/rejected": -394.48638916015625, + "loss": -0.7137, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -40.36691665649414, + "rewards/margins": 78.15025329589844, + "rewards/rejected": -118.51716613769531, + "step": 4760 + }, + { + "epoch": 0.9362119725220804, + "grad_norm": 363.9435644530817, + "learning_rate": 6.1730864964507636e-09, + "logits/chosen": -0.12066509574651718, + "logits/rejected": 1.1250253915786743, + "logps/chosen": -306.2756652832031, + "logps/rejected": -320.68316650390625, + "loss": -0.7471, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -9.50562572479248, + "rewards/margins": 72.00847625732422, + "rewards/rejected": -81.51410675048828, + "step": 4770 + }, + { + "epoch": 0.9381746810598626, + "grad_norm": 219.3754347694064, + "learning_rate": 5.8005019731033615e-09, + "logits/chosen": -0.09917403757572174, + "logits/rejected": 1.4547569751739502, + "logps/chosen": -298.7986145019531, + "logps/rejected": -388.39215087890625, + "loss": -0.5021, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -18.909420013427734, + "rewards/margins": 146.1449432373047, + "rewards/rejected": -165.0543670654297, + "step": 4780 + }, + { + "epoch": 0.9401373895976447, + "grad_norm": 391.55353836484426, + "learning_rate": 5.439382187722968e-09, + "logits/chosen": -0.4104865491390228, + "logits/rejected": 1.3586753606796265, + "logps/chosen": -417.8433532714844, + "logps/rejected": -426.7972106933594, + "loss": -0.4973, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -34.476287841796875, + "rewards/margins": 105.99763488769531, + "rewards/rejected": -140.4739227294922, + "step": 4790 + }, + { + "epoch": 0.9421000981354269, + "grad_norm": 266.94716437698855, + "learning_rate": 5.089744094249837e-09, + "logits/chosen": -0.7528950572013855, + "logits/rejected": 2.4677882194519043, + "logps/chosen": -420.2793884277344, + "logps/rejected": -461.388916015625, + "loss": -0.4952, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -50.36171340942383, + "rewards/margins": 146.85055541992188, + "rewards/rejected": -197.2122802734375, + "step": 4800 + }, + { + "epoch": 0.9440628066732091, + "grad_norm": 409.1331980960002, + "learning_rate": 4.751604107579077e-09, + "logits/chosen": -0.7290471792221069, + "logits/rejected": 0.8242856860160828, + "logps/chosen": -294.11724853515625, + "logps/rejected": -413.388427734375, + "loss": -0.6069, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -10.515897750854492, + "rewards/margins": 156.26760864257812, + "rewards/rejected": -166.78350830078125, + "step": 4810 + }, + { + "epoch": 0.9460255152109912, + "grad_norm": 329.83492348886745, + "learning_rate": 4.424978102789661e-09, + "logits/chosen": -0.3126320242881775, + "logits/rejected": 1.815192461013794, + "logps/chosen": -415.0078125, + "logps/rejected": -398.8657531738281, + "loss": -0.5737, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -8.481783866882324, + "rewards/margins": 138.76304626464844, + "rewards/rejected": -147.2448272705078, + "step": 4820 + }, + { + "epoch": 0.9479882237487733, + "grad_norm": 146.79394961172602, + "learning_rate": 4.109881414399524e-09, + "logits/chosen": 0.2610158324241638, + "logits/rejected": 1.550054907798767, + "logps/chosen": -296.30718994140625, + "logps/rejected": -419.08721923828125, + "loss": -0.5933, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -17.670312881469727, + "rewards/margins": 120.4037857055664, + "rewards/rejected": -138.0740966796875, + "step": 4830 + }, + { + "epoch": 0.9499509322865555, + "grad_norm": 354.72111363647065, + "learning_rate": 3.806328835645272e-09, + "logits/chosen": 1.0623828172683716, + "logits/rejected": 1.783556580543518, + "logps/chosen": -293.881591796875, + "logps/rejected": -383.89410400390625, + "loss": -0.3878, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -52.4915771484375, + "rewards/margins": 102.1007308959961, + "rewards/rejected": -154.59231567382812, + "step": 4840 + }, + { + "epoch": 0.9519136408243376, + "grad_norm": 328.23163632228284, + "learning_rate": 3.5143346177878565e-09, + "logits/chosen": -0.5292250514030457, + "logits/rejected": 0.4050324559211731, + "logps/chosen": -361.75628662109375, + "logps/rejected": -405.6460266113281, + "loss": -0.5574, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 7.2587385177612305, + "rewards/margins": 143.06585693359375, + "rewards/rejected": -135.80711364746094, + "step": 4850 + }, + { + "epoch": 0.9538763493621197, + "grad_norm": 523.8293870872152, + "learning_rate": 3.233912469443545e-09, + "logits/chosen": 0.2311282902956009, + "logits/rejected": 2.5509095191955566, + "logps/chosen": -357.62530517578125, + "logps/rejected": -387.13946533203125, + "loss": -0.4784, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -29.86185646057129, + "rewards/margins": 137.68093872070312, + "rewards/rejected": -167.5428009033203, + "step": 4860 + }, + { + "epoch": 0.9558390578999019, + "grad_norm": 761.3471269440555, + "learning_rate": 2.9650755559401388e-09, + "logits/chosen": 0.3595990538597107, + "logits/rejected": 1.5829707384109497, + "logps/chosen": -351.6505432128906, + "logps/rejected": -460.19842529296875, + "loss": -0.4697, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -4.7347412109375, + "rewards/margins": 141.64712524414062, + "rewards/rejected": -146.38186645507812, + "step": 4870 + }, + { + "epoch": 0.957801766437684, + "grad_norm": 373.4846896173969, + "learning_rate": 2.7078364986990175e-09, + "logits/chosen": 1.0008618831634521, + "logits/rejected": 1.9765441417694092, + "logps/chosen": -432.39422607421875, + "logps/rejected": -442.600341796875, + "loss": -0.6348, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -36.65483474731445, + "rewards/margins": 96.07183837890625, + "rewards/rejected": -132.7266845703125, + "step": 4880 + }, + { + "epoch": 0.9597644749754661, + "grad_norm": 559.491522237486, + "learning_rate": 2.4622073746426165e-09, + "logits/chosen": 0.026527557522058487, + "logits/rejected": 0.9185472726821899, + "logps/chosen": -299.48248291015625, + "logps/rejected": -402.83929443359375, + "loss": -0.4808, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -25.574132919311523, + "rewards/margins": 152.12991333007812, + "rewards/rejected": -177.70404052734375, + "step": 4890 + }, + { + "epoch": 0.9617271835132483, + "grad_norm": 555.2586103731929, + "learning_rate": 2.2281997156273213e-09, + "logits/chosen": 0.6086570024490356, + "logits/rejected": 1.4897328615188599, + "logps/chosen": -335.24920654296875, + "logps/rejected": -352.65753173828125, + "loss": -0.3788, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -22.903039932250977, + "rewards/margins": 71.0635757446289, + "rewards/rejected": -93.96661376953125, + "step": 4900 + }, + { + "epoch": 0.9636898920510304, + "grad_norm": 156.6382222757462, + "learning_rate": 2.0058245079021265e-09, + "logits/chosen": -0.09430718421936035, + "logits/rejected": 1.1005871295928955, + "logps/chosen": -274.4385986328125, + "logps/rejected": -320.4398193359375, + "loss": -0.7222, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.693513870239258, + "rewards/margins": 103.73075103759766, + "rewards/rejected": -118.42427825927734, + "step": 4910 + }, + { + "epoch": 0.9656526005888125, + "grad_norm": 151.0863886461919, + "learning_rate": 1.7950921915928784e-09, + "logits/chosen": 1.0376745462417603, + "logits/rejected": 2.0094356536865234, + "logps/chosen": -289.60052490234375, + "logps/rejected": -330.909912109375, + "loss": -0.5896, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -24.24953269958496, + "rewards/margins": 95.75595092773438, + "rewards/rejected": -120.00547790527344, + "step": 4920 + }, + { + "epoch": 0.9676153091265947, + "grad_norm": 166.83864869826812, + "learning_rate": 1.596012660212087e-09, + "logits/chosen": 0.8260341882705688, + "logits/rejected": 2.2165873050689697, + "logps/chosen": -346.9764709472656, + "logps/rejected": -354.48797607421875, + "loss": -0.6885, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.28257179260254, + "rewards/margins": 106.68492126464844, + "rewards/rejected": -134.96749877929688, + "step": 4930 + }, + { + "epoch": 0.9695780176643768, + "grad_norm": 448.83254470903984, + "learning_rate": 1.408595260194434e-09, + "logits/chosen": 0.4163898527622223, + "logits/rejected": 2.3370816707611084, + "logps/chosen": -365.4043884277344, + "logps/rejected": -368.92205810546875, + "loss": -0.4861, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -50.28551483154297, + "rewards/margins": 117.5450668334961, + "rewards/rejected": -167.83059692382812, + "step": 4940 + }, + { + "epoch": 0.971540726202159, + "grad_norm": 384.94449721636045, + "learning_rate": 1.2328487904580131e-09, + "logits/chosen": 0.29007938504219055, + "logits/rejected": 1.312359094619751, + "logps/chosen": -285.5650939941406, + "logps/rejected": -396.947021484375, + "loss": -0.594, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -71.74400329589844, + "rewards/margins": 105.836669921875, + "rewards/rejected": -177.58067321777344, + "step": 4950 + }, + { + "epoch": 0.9735034347399412, + "grad_norm": 179.93188002657797, + "learning_rate": 1.0687815019912173e-09, + "logits/chosen": 1.147339940071106, + "logits/rejected": 1.9018316268920898, + "logps/chosen": -297.005126953125, + "logps/rejected": -486.56402587890625, + "loss": -0.8956, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.530776977539062, + "rewards/margins": 138.8377685546875, + "rewards/rejected": -155.36854553222656, + "step": 4960 + }, + { + "epoch": 0.9754661432777233, + "grad_norm": 171.77469049926168, + "learning_rate": 9.164010974653802e-10, + "logits/chosen": 0.19017255306243896, + "logits/rejected": 0.98823082447052, + "logps/chosen": -268.375732421875, + "logps/rejected": -376.2149353027344, + "loss": -0.5995, + "rewards/accuracies": 0.9000000953674316, + "rewards/chosen": -0.4377630352973938, + "rewards/margins": 103.22297668457031, + "rewards/rejected": -103.6607437133789, + "step": 4970 + }, + { + "epoch": 0.9774288518155054, + "grad_norm": 315.626629910148, + "learning_rate": 7.757147308731504e-10, + "logits/chosen": -0.06003303453326225, + "logits/rejected": 1.8267772197723389, + "logps/chosen": -317.6167907714844, + "logps/rejected": -414.8966369628906, + "loss": -0.5966, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -9.810222625732422, + "rewards/margins": 135.85130310058594, + "rewards/rejected": -145.66152954101562, + "step": 4980 + }, + { + "epoch": 0.9793915603532876, + "grad_norm": 209.33760832517214, + "learning_rate": 6.467290071925646e-10, + "logits/chosen": 0.8949087858200073, + "logits/rejected": 0.8340100049972534, + "logps/chosen": -269.3585205078125, + "logps/rejected": -375.28399658203125, + "loss": -0.4299, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -55.363548278808594, + "rewards/margins": 97.962646484375, + "rewards/rejected": -153.32620239257812, + "step": 4990 + }, + { + "epoch": 0.9813542688910697, + "grad_norm": 197.62618569733775, + "learning_rate": 5.29449982077046e-10, + "logits/chosen": 0.3139314651489258, + "logits/rejected": 1.3526438474655151, + "logps/chosen": -324.19903564453125, + "logps/rejected": -368.36492919921875, + "loss": -0.6841, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -39.67112350463867, + "rewards/margins": 114.58052062988281, + "rewards/rejected": -154.25164794921875, + "step": 5000 + }, + { + "epoch": 0.9833169774288518, + "grad_norm": 226.4959541478614, + "learning_rate": 4.2388316157104806e-10, + "logits/chosen": 0.5193920731544495, + "logits/rejected": 2.0736169815063477, + "logps/chosen": -278.8150634765625, + "logps/rejected": -372.239990234375, + "loss": -0.7287, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -10.922646522521973, + "rewards/margins": 139.96482849121094, + "rewards/rejected": -150.88748168945312, + "step": 5010 + }, + { + "epoch": 0.985279685966634, + "grad_norm": 237.75346125543064, + "learning_rate": 3.300335018515676e-10, + "logits/chosen": -0.4149685800075531, + "logits/rejected": 1.0835611820220947, + "logps/chosen": -233.6608428955078, + "logps/rejected": -297.69451904296875, + "loss": -0.6249, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -19.882762908935547, + "rewards/margins": 113.03089904785156, + "rewards/rejected": -132.9136505126953, + "step": 5020 + }, + { + "epoch": 0.9872423945044161, + "grad_norm": 162.94130412364044, + "learning_rate": 2.4790540899546907e-10, + "logits/chosen": 0.4125348627567291, + "logits/rejected": 1.145112156867981, + "logps/chosen": -248.94131469726562, + "logps/rejected": -460.9149475097656, + "loss": -0.6704, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.890644073486328, + "rewards/margins": 158.34779357910156, + "rewards/rejected": -185.23843383789062, + "step": 5030 + }, + { + "epoch": 0.9892051030421982, + "grad_norm": 364.84350185093984, + "learning_rate": 1.7750273877262244e-10, + "logits/chosen": 0.2277032881975174, + "logits/rejected": 1.6701894998550415, + "logps/chosen": -314.9948425292969, + "logps/rejected": -410.34228515625, + "loss": -0.704, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -26.898975372314453, + "rewards/margins": 122.16634368896484, + "rewards/rejected": -149.06533813476562, + "step": 5040 + }, + { + "epoch": 0.9911678115799804, + "grad_norm": 279.13631035832304, + "learning_rate": 1.1882879646485379e-10, + "logits/chosen": 0.7624462842941284, + "logits/rejected": 2.067471504211426, + "logps/chosen": -255.6316375732422, + "logps/rejected": -410.8294372558594, + "loss": -0.9275, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -30.428985595703125, + "rewards/margins": 164.67681884765625, + "rewards/rejected": -195.10580444335938, + "step": 5050 + }, + { + "epoch": 0.9931305201177625, + "grad_norm": 248.91485646622564, + "learning_rate": 7.188633671079136e-11, + "logits/chosen": 0.11063379049301147, + "logits/rejected": 1.7005183696746826, + "logps/chosen": -313.30157470703125, + "logps/rejected": -358.89654541015625, + "loss": -0.404, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -25.09392738342285, + "rewards/margins": 124.01396179199219, + "rewards/rejected": -149.10787963867188, + "step": 5060 + }, + { + "epoch": 0.9950932286555446, + "grad_norm": 184.30903455395242, + "learning_rate": 3.6677563376580344e-11, + "logits/chosen": -0.1692899465560913, + "logits/rejected": 1.1702044010162354, + "logps/chosen": -288.71173095703125, + "logps/rejected": -450.014404296875, + "loss": -0.4779, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -23.89110565185547, + "rewards/margins": 115.5859146118164, + "rewards/rejected": -139.47702026367188, + "step": 5070 + }, + { + "epoch": 0.9970559371933267, + "grad_norm": 356.10024359954843, + "learning_rate": 1.3204129452354385e-11, + "logits/chosen": 0.4353364109992981, + "logits/rejected": 1.390868067741394, + "logps/chosen": -288.8840637207031, + "logps/rejected": -421.11041259765625, + "loss": -0.6044, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.802814483642578, + "rewards/margins": 118.05680847167969, + "rewards/rejected": -135.859619140625, + "step": 5080 + }, + { + "epoch": 0.9990186457311089, + "grad_norm": 342.9402544399462, + "learning_rate": 1.467136974631078e-12, + "logits/chosen": 0.0052385092712938786, + "logits/rejected": 1.605820655822754, + "logps/chosen": -289.46038818359375, + "logps/rejected": -320.14251708984375, + "loss": -0.8449, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": -17.538249969482422, + "rewards/margins": 100.76835632324219, + "rewards/rejected": -118.3066177368164, + "step": 5090 + }, + { + "epoch": 1.0, + "step": 5095, + "total_flos": 0.0, + "train_loss": -0.4546862483018428, + "train_runtime": 14609.9717, + "train_samples_per_second": 4.184, + "train_steps_per_second": 0.349 + } + ], + "logging_steps": 10, + "max_steps": 5095, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}