diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3336113427856547, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004170141784820684, + "grad_norm": 17.774518966674805, + "learning_rate": 1.1582117211026176e-09, + "logits/chosen": -0.3961139917373657, + "logits/rejected": -1.0504034757614136, + "logps/chosen": -963.864013671875, + "logps/rejected": -515.4060668945312, + "loss": 0.6931, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.00543212890625e-05, + "rewards/margins": 2.746581958490424e-05, + "rewards/rejected": 1.2588501704158261e-05, + "step": 1 + }, + { + "epoch": 0.0008340283569641367, + "grad_norm": 19.28428840637207, + "learning_rate": 2.316423442205235e-09, + "logits/chosen": -1.182056188583374, + "logits/rejected": -0.8520368337631226, + "logps/chosen": -1038.871826171875, + "logps/rejected": -546.56103515625, + "loss": 0.6931, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0817263652570546e-05, + "rewards/margins": 2.5558472771081142e-05, + "rewards/rejected": 1.52587890625e-05, + "step": 2 + }, + { + "epoch": 0.0012510425354462051, + "grad_norm": 83.27648162841797, + "learning_rate": 3.474635163307853e-09, + "logits/chosen": -1.3219187259674072, + "logits/rejected": -0.8898472785949707, + "logps/chosen": -1010.59765625, + "logps/rejected": -381.141845703125, + "loss": 0.6931, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.730224463855848e-05, + "rewards/margins": 3.5476681659929454e-05, + "rewards/rejected": 1.182556115963962e-05, + "step": 3 + }, + { + "epoch": 0.0016680567139282735, + "grad_norm": 26.990997314453125, + "learning_rate": 4.63284688441047e-09, + "logits/chosen": -0.7544832825660706, + "logits/rejected": -0.8729987740516663, + "logps/chosen": -1195.995849609375, + "logps/rejected": -555.9637451171875, + "loss": 0.6901, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.006339645013213158, + "rewards/margins": 0.006165075581520796, + "rewards/rejected": 0.0001745700865285471, + "step": 4 + }, + { + "epoch": 0.002085070892410342, + "grad_norm": 15.815837860107422, + "learning_rate": 5.7910586055130885e-09, + "logits/chosen": -0.8043867349624634, + "logits/rejected": -0.9521368741989136, + "logps/chosen": -999.9270629882812, + "logps/rejected": -512.778564453125, + "loss": 0.6988, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.005383300594985485, + "rewards/margins": -0.011165238916873932, + "rewards/rejected": 0.005781936924904585, + "step": 5 + }, + { + "epoch": 0.0025020850708924102, + "grad_norm": 25.579988479614258, + "learning_rate": 6.949270326615706e-09, + "logits/chosen": -0.2441806197166443, + "logits/rejected": -0.844770073890686, + "logps/chosen": -1177.0252685546875, + "logps/rejected": -594.0948486328125, + "loss": 0.6806, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0012603756040334702, + "rewards/margins": 0.02732086181640625, + "rewards/rejected": -0.02858123555779457, + "step": 6 + }, + { + "epoch": 0.0029190992493744786, + "grad_norm": 21.802959442138672, + "learning_rate": 8.107482047718323e-09, + "logits/chosen": -1.2792237997055054, + "logits/rejected": -0.6900994777679443, + "logps/chosen": -975.126708984375, + "logps/rejected": -589.58056640625, + "loss": 0.7079, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0015514381229877472, + "rewards/margins": -0.026293184608221054, + "rewards/rejected": 0.027844620868563652, + "step": 7 + }, + { + "epoch": 0.003336113427856547, + "grad_norm": 24.057151794433594, + "learning_rate": 9.26569376882094e-09, + "logits/chosen": -1.200152039527893, + "logits/rejected": NaN, + "logps/chosen": -857.7166748046875, + "logps/rejected": -248.6055908203125, + "loss": 0.721, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04731331020593643, + "rewards/margins": -0.052849769592285156, + "rewards/rejected": 0.005536460783332586, + "step": 8 + }, + { + "epoch": 0.0037531276063386154, + "grad_norm": 17.40530014038086, + "learning_rate": 1.0423905489923558e-08, + "logits/chosen": -0.9301769137382507, + "logits/rejected": -0.9678241014480591, + "logps/chosen": -955.018798828125, + "logps/rejected": -530.345947265625, + "loss": 0.6874, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.013547897338867188, + "rewards/margins": 0.017159275710582733, + "rewards/rejected": -0.030707169324159622, + "step": 9 + }, + { + "epoch": 0.004170141784820684, + "grad_norm": 13.608105659484863, + "learning_rate": 1.1582117211026177e-08, + "logits/chosen": -0.927754282951355, + "logits/rejected": -1.273963451385498, + "logps/chosen": -798.5603637695312, + "logps/rejected": -426.7510986328125, + "loss": 0.671, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.007159805856645107, + "rewards/margins": 0.04976654052734375, + "rewards/rejected": -0.04260673373937607, + "step": 10 + }, + { + "epoch": 0.0045871559633027525, + "grad_norm": 19.300207138061523, + "learning_rate": 1.2740328932128794e-08, + "logits/chosen": -0.7852665185928345, + "logits/rejected": -0.7461918592453003, + "logps/chosen": -1011.228759765625, + "logps/rejected": -597.04248046875, + "loss": 0.6776, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.033849336206912994, + "rewards/margins": 0.03994865342974663, + "rewards/rejected": -0.0060993218794465065, + "step": 11 + }, + { + "epoch": 0.0050041701417848205, + "grad_norm": 17.744590759277344, + "learning_rate": 1.3898540653231412e-08, + "logits/chosen": -1.1916190385818481, + "logits/rejected": NaN, + "logps/chosen": -913.2021484375, + "logps/rejected": -358.2269592285156, + "loss": 0.6901, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0025638602674007416, + "rewards/margins": 0.012960247695446014, + "rewards/rejected": -0.010396385565400124, + "step": 12 + }, + { + "epoch": 0.005421184320266889, + "grad_norm": 25.25324821472168, + "learning_rate": 1.505675237433403e-08, + "logits/chosen": -0.6415953636169434, + "logits/rejected": -0.9880213737487793, + "logps/chosen": -1102.93701171875, + "logps/rejected": -589.987060546875, + "loss": 0.6964, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01995544694364071, + "rewards/margins": -0.003168296068906784, + "rewards/rejected": 0.023123741149902344, + "step": 13 + }, + { + "epoch": 0.005838198498748957, + "grad_norm": 21.058212280273438, + "learning_rate": 1.6214964095436646e-08, + "logits/chosen": -1.3455449342727661, + "logits/rejected": -0.8477530479431152, + "logps/chosen": -1140.7501220703125, + "logps/rejected": -587.849609375, + "loss": 0.6878, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0025131218135356903, + "rewards/margins": 0.011837390251457691, + "rewards/rejected": -0.014350511133670807, + "step": 14 + }, + { + "epoch": 0.006255212677231026, + "grad_norm": 19.774621963500977, + "learning_rate": 1.7373175816539264e-08, + "logits/chosen": -0.16635353863239288, + "logits/rejected": -1.1190940141677856, + "logps/chosen": -922.7181396484375, + "logps/rejected": -469.08843994140625, + "loss": 0.7293, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.09469223022460938, + "rewards/margins": -0.06941758096218109, + "rewards/rejected": -0.02527465671300888, + "step": 15 + }, + { + "epoch": 0.006672226855713094, + "grad_norm": 14.15581226348877, + "learning_rate": 1.853138753764188e-08, + "logits/chosen": -0.9874470233917236, + "logits/rejected": NaN, + "logps/chosen": -899.2044067382812, + "logps/rejected": -362.26171875, + "loss": 0.6714, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.023412322625517845, + "rewards/margins": 0.04711799696087837, + "rewards/rejected": -0.023705672472715378, + "step": 16 + }, + { + "epoch": 0.007089241034195163, + "grad_norm": 32.259742736816406, + "learning_rate": 1.96895992587445e-08, + "logits/chosen": -0.9073110222816467, + "logits/rejected": -0.9444566965103149, + "logps/chosen": -982.8934326171875, + "logps/rejected": -543.4546508789062, + "loss": 0.6972, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.006610107142478228, + "rewards/margins": -0.005303763784468174, + "rewards/rejected": 0.011913875117897987, + "step": 17 + }, + { + "epoch": 0.007506255212677231, + "grad_norm": 14.036819458007812, + "learning_rate": 2.0847810979847116e-08, + "logits/chosen": -0.3268158435821533, + "logits/rejected": -1.094388723373413, + "logps/chosen": -738.9466552734375, + "logps/rejected": -402.3883056640625, + "loss": 0.7165, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04739837720990181, + "rewards/margins": -0.04348164051771164, + "rewards/rejected": -0.003916741348803043, + "step": 18 + }, + { + "epoch": 0.0079232693911593, + "grad_norm": 221.34130859375, + "learning_rate": 2.2006022700949733e-08, + "logits/chosen": -1.225860595703125, + "logits/rejected": -0.5591225028038025, + "logps/chosen": -964.0198364257812, + "logps/rejected": -433.41925048828125, + "loss": 0.7011, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.009828949347138405, + "rewards/margins": -0.01293983869254589, + "rewards/rejected": 0.0031108856201171875, + "step": 19 + }, + { + "epoch": 0.008340283569641367, + "grad_norm": 15.74856185913086, + "learning_rate": 2.3164234422052354e-08, + "logits/chosen": -0.9485300779342651, + "logits/rejected": -0.9104875326156616, + "logps/chosen": -1000.811279296875, + "logps/rejected": -411.12225341796875, + "loss": 0.7157, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04155120626091957, + "rewards/margins": -0.043171025812625885, + "rewards/rejected": 0.0016198167577385902, + "step": 20 + }, + { + "epoch": 0.008757297748123435, + "grad_norm": 50.960758209228516, + "learning_rate": 2.432244614315497e-08, + "logits/chosen": -0.8578770160675049, + "logits/rejected": -0.9583806991577148, + "logps/chosen": -971.6134033203125, + "logps/rejected": -498.82757568359375, + "loss": 0.6902, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024937057867646217, + "rewards/margins": 0.008140943944454193, + "rewards/rejected": 0.016796112060546875, + "step": 21 + }, + { + "epoch": 0.009174311926605505, + "grad_norm": 21.18694305419922, + "learning_rate": 2.548065786425759e-08, + "logits/chosen": -0.25711873173713684, + "logits/rejected": -0.9464637637138367, + "logps/chosen": -1058.75, + "logps/rejected": -625.6365966796875, + "loss": 0.6964, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03243407979607582, + "rewards/margins": -0.003665735013782978, + "rewards/rejected": 0.03609981760382652, + "step": 22 + }, + { + "epoch": 0.009591326105087573, + "grad_norm": 16.65604591369629, + "learning_rate": 2.6638869585360206e-08, + "logits/chosen": -0.6770492196083069, + "logits/rejected": -1.0457276105880737, + "logps/chosen": -956.3612060546875, + "logps/rejected": -636.1737060546875, + "loss": 0.6791, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.031128691509366035, + "rewards/margins": 0.03254127502441406, + "rewards/rejected": -0.0014125816524028778, + "step": 23 + }, + { + "epoch": 0.010008340283569641, + "grad_norm": 20.375097274780273, + "learning_rate": 2.7797081306462823e-08, + "logits/chosen": -0.6488408446311951, + "logits/rejected": -0.8693333864212036, + "logps/chosen": -1105.8828125, + "logps/rejected": -648.3882446289062, + "loss": 0.6915, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.028141021728515625, + "rewards/margins": 0.014588158577680588, + "rewards/rejected": 0.013552858494222164, + "step": 24 + }, + { + "epoch": 0.010425354462051709, + "grad_norm": 36.17469787597656, + "learning_rate": 2.895529302756544e-08, + "logits/chosen": -1.6341822147369385, + "logits/rejected": -0.7528332471847534, + "logps/chosen": -1147.047119140625, + "logps/rejected": -579.8052978515625, + "loss": 0.6607, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0079193115234375, + "rewards/margins": 0.07500076293945312, + "rewards/rejected": -0.08292007446289062, + "step": 25 + }, + { + "epoch": 0.010842368640533779, + "grad_norm": 17.880525588989258, + "learning_rate": 3.011350474866806e-08, + "logits/chosen": -0.42070281505584717, + "logits/rejected": -0.9035682678222656, + "logps/chosen": -897.2158203125, + "logps/rejected": -618.46337890625, + "loss": 0.6858, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07063140720129013, + "rewards/margins": 0.02218933030962944, + "rewards/rejected": 0.04844207689166069, + "step": 26 + }, + { + "epoch": 0.011259382819015847, + "grad_norm": 40.74405288696289, + "learning_rate": 3.127171646977068e-08, + "logits/chosen": -0.7300899028778076, + "logits/rejected": -0.9085140228271484, + "logps/chosen": -1035.1669921875, + "logps/rejected": -549.1453857421875, + "loss": 0.7142, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.030368808656930923, + "rewards/margins": -0.03819580376148224, + "rewards/rejected": 0.007826998829841614, + "step": 27 + }, + { + "epoch": 0.011676396997497914, + "grad_norm": 19.361570358276367, + "learning_rate": 3.242992819087329e-08, + "logits/chosen": -0.47741249203681946, + "logits/rejected": -1.143082618713379, + "logps/chosen": -827.9263916015625, + "logps/rejected": -428.64471435546875, + "loss": 0.6679, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03403663635253906, + "rewards/margins": 0.05536498874425888, + "rewards/rejected": -0.02132835052907467, + "step": 28 + }, + { + "epoch": 0.012093411175979984, + "grad_norm": 18.17047882080078, + "learning_rate": 3.3588139911975914e-08, + "logits/chosen": -0.8834643363952637, + "logits/rejected": -0.7606860995292664, + "logps/chosen": -1108.857177734375, + "logps/rejected": -574.9135131835938, + "loss": 0.6355, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08760452270507812, + "rewards/margins": 0.12379761040210724, + "rewards/rejected": -0.036193087697029114, + "step": 29 + }, + { + "epoch": 0.012510425354462052, + "grad_norm": 16.501026153564453, + "learning_rate": 3.474635163307853e-08, + "logits/chosen": -0.8803046345710754, + "logits/rejected": -0.9054956436157227, + "logps/chosen": -902.9686889648438, + "logps/rejected": -563.65234375, + "loss": 0.6865, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07099953293800354, + "rewards/margins": 0.01577453315258026, + "rewards/rejected": 0.05522499233484268, + "step": 30 + }, + { + "epoch": 0.01292743953294412, + "grad_norm": 16.966527938842773, + "learning_rate": 3.590456335418115e-08, + "logits/chosen": -0.9364233016967773, + "logits/rejected": -0.8004406690597534, + "logps/chosen": -873.32373046875, + "logps/rejected": -378.0670166015625, + "loss": 0.7163, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.013535309582948685, + "rewards/margins": -0.04332313686609268, + "rewards/rejected": 0.05685844644904137, + "step": 31 + }, + { + "epoch": 0.013344453711426188, + "grad_norm": 22.800033569335938, + "learning_rate": 3.706277507528376e-08, + "logits/chosen": -0.7299998998641968, + "logits/rejected": -0.842666506767273, + "logps/chosen": -1166.4217529296875, + "logps/rejected": -672.5575561523438, + "loss": 0.6692, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02117156982421875, + "rewards/margins": 0.06013145670294762, + "rewards/rejected": -0.03895988315343857, + "step": 32 + }, + { + "epoch": 0.013761467889908258, + "grad_norm": 15.049094200134277, + "learning_rate": 3.822098679638638e-08, + "logits/chosen": -0.419299453496933, + "logits/rejected": -1.1496597528457642, + "logps/chosen": -881.0628662109375, + "logps/rejected": -515.5440673828125, + "loss": 0.6681, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03231964260339737, + "rewards/margins": 0.053334809839725494, + "rewards/rejected": -0.021015167236328125, + "step": 33 + }, + { + "epoch": 0.014178482068390326, + "grad_norm": 16.042755126953125, + "learning_rate": 3.9379198517489e-08, + "logits/chosen": -0.8055427670478821, + "logits/rejected": NaN, + "logps/chosen": -1183.39404296875, + "logps/rejected": -528.0593872070312, + "loss": 0.7205, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.05260619893670082, + "rewards/margins": -0.04944305494427681, + "rewards/rejected": -0.0031631477177143097, + "step": 34 + }, + { + "epoch": 0.014595496246872394, + "grad_norm": 31.73788833618164, + "learning_rate": 4.053741023859162e-08, + "logits/chosen": -1.188354730606079, + "logits/rejected": -0.3838965892791748, + "logps/chosen": -1412.1552734375, + "logps/rejected": -824.401123046875, + "loss": 0.7102, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.030733490362763405, + "rewards/margins": -0.03291969373822212, + "rewards/rejected": 0.06365318596363068, + "step": 35 + }, + { + "epoch": 0.015012510425354461, + "grad_norm": 18.99544334411621, + "learning_rate": 4.169562195969423e-08, + "logits/chosen": -0.8527953028678894, + "logits/rejected": -0.9829111099243164, + "logps/chosen": -973.263671875, + "logps/rejected": -492.7947998046875, + "loss": 0.7314, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.10427513718605042, + "rewards/margins": -0.07180366665124893, + "rewards/rejected": -0.03247147053480148, + "step": 36 + }, + { + "epoch": 0.015429524603836531, + "grad_norm": 16.79134750366211, + "learning_rate": 4.285383368079685e-08, + "logits/chosen": -0.8428342342376709, + "logits/rejected": -0.9502476453781128, + "logps/chosen": -1146.2225341796875, + "logps/rejected": -597.0903930664062, + "loss": 0.6796, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.04320678859949112, + "rewards/margins": 0.035399630665779114, + "rewards/rejected": 0.007807157933712006, + "step": 37 + }, + { + "epoch": 0.0158465387823186, + "grad_norm": 19.277873992919922, + "learning_rate": 4.4012045401899467e-08, + "logits/chosen": -0.7517361640930176, + "logits/rejected": -0.6205301284790039, + "logps/chosen": -1177.94580078125, + "logps/rejected": -675.190185546875, + "loss": 0.6813, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01098480261862278, + "rewards/margins": 0.03953704237937927, + "rewards/rejected": -0.02855224534869194, + "step": 38 + }, + { + "epoch": 0.01626355296080067, + "grad_norm": 23.51366424560547, + "learning_rate": 4.517025712300209e-08, + "logits/chosen": -1.009857177734375, + "logits/rejected": -0.8019417524337769, + "logps/chosen": -984.2115478515625, + "logps/rejected": -439.8927307128906, + "loss": 0.7306, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04855651780962944, + "rewards/margins": -0.06541386246681213, + "rewards/rejected": 0.016857337206602097, + "step": 39 + }, + { + "epoch": 0.016680567139282735, + "grad_norm": 38.0661735534668, + "learning_rate": 4.632846884410471e-08, + "logits/chosen": -0.6788904666900635, + "logits/rejected": -0.8065959215164185, + "logps/chosen": -1093.99462890625, + "logps/rejected": -516.011962890625, + "loss": 0.7349, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.061658862978219986, + "rewards/margins": -0.0759098082780838, + "rewards/rejected": 0.014250943437218666, + "step": 40 + }, + { + "epoch": 0.017097581317764805, + "grad_norm": 19.51694107055664, + "learning_rate": 4.748668056520732e-08, + "logits/chosen": -0.7552354335784912, + "logits/rejected": -0.9037665724754333, + "logps/chosen": -969.8609619140625, + "logps/rejected": -562.0914916992188, + "loss": 0.7231, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0014366135001182556, + "rewards/margins": -0.05512581020593643, + "rewards/rejected": 0.05656242370605469, + "step": 41 + }, + { + "epoch": 0.01751459549624687, + "grad_norm": 21.316509246826172, + "learning_rate": 4.864489228630994e-08, + "logits/chosen": -1.6140897274017334, + "logits/rejected": NaN, + "logps/chosen": -1146.27783203125, + "logps/rejected": -636.063720703125, + "loss": 0.7181, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.01391678024083376, + "rewards/margins": -0.044801902025938034, + "rewards/rejected": 0.030885126441717148, + "step": 42 + }, + { + "epoch": 0.01793160967472894, + "grad_norm": 60.35101318359375, + "learning_rate": 4.980310400741256e-08, + "logits/chosen": -1.1478383541107178, + "logits/rejected": -0.6564218997955322, + "logps/chosen": -1247.0919189453125, + "logps/rejected": -700.3190307617188, + "loss": 0.6865, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.040283773094415665, + "rewards/margins": 0.014703751541674137, + "rewards/rejected": 0.0255800262093544, + "step": 43 + }, + { + "epoch": 0.01834862385321101, + "grad_norm": 16.85247230529785, + "learning_rate": 5.096131572851518e-08, + "logits/chosen": -1.2633756399154663, + "logits/rejected": NaN, + "logps/chosen": -941.3444213867188, + "logps/rejected": -351.49249267578125, + "loss": 0.6945, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.014347076416015625, + "rewards/margins": -0.002180100418627262, + "rewards/rejected": -0.012166975997388363, + "step": 44 + }, + { + "epoch": 0.018765638031693076, + "grad_norm": 19.327016830444336, + "learning_rate": 5.211952744961779e-08, + "logits/chosen": -0.6868521571159363, + "logits/rejected": -1.279496192932129, + "logps/chosen": -831.4959716796875, + "logps/rejected": -434.26202392578125, + "loss": 0.6941, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.015859223902225494, + "rewards/margins": 0.0012533185072243214, + "rewards/rejected": 0.014605903998017311, + "step": 45 + }, + { + "epoch": 0.019182652210175146, + "grad_norm": 19.341440200805664, + "learning_rate": 5.327773917072041e-08, + "logits/chosen": -1.1113828420639038, + "logits/rejected": -0.8087471723556519, + "logps/chosen": -1112.5712890625, + "logps/rejected": -557.085205078125, + "loss": 0.6935, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.046791836619377136, + "rewards/margins": 0.002270890399813652, + "rewards/rejected": -0.049062732607126236, + "step": 46 + }, + { + "epoch": 0.019599666388657216, + "grad_norm": 50.97684097290039, + "learning_rate": 5.443595089182303e-08, + "logits/chosen": -1.0403318405151367, + "logits/rejected": -0.9018253087997437, + "logps/chosen": -782.2076416015625, + "logps/rejected": -421.1203308105469, + "loss": 0.6819, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02706451714038849, + "rewards/margins": 0.025698469951748848, + "rewards/rejected": -0.05276298522949219, + "step": 47 + }, + { + "epoch": 0.020016680567139282, + "grad_norm": 29.327924728393555, + "learning_rate": 5.559416261292565e-08, + "logits/chosen": -1.7389593124389648, + "logits/rejected": -0.7991050481796265, + "logps/chosen": -1017.2852783203125, + "logps/rejected": -517.2421264648438, + "loss": 0.6695, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.029515840113162994, + "rewards/margins": 0.05181083828210831, + "rewards/rejected": -0.022294998168945312, + "step": 48 + }, + { + "epoch": 0.02043369474562135, + "grad_norm": 14.915777206420898, + "learning_rate": 5.675237433402827e-08, + "logits/chosen": -0.31539177894592285, + "logits/rejected": -1.0174384117126465, + "logps/chosen": -838.0572509765625, + "logps/rejected": -481.8790588378906, + "loss": 0.7072, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.027943041175603867, + "rewards/margins": -0.026145555078983307, + "rewards/rejected": 0.054088592529296875, + "step": 49 + }, + { + "epoch": 0.020850708924103418, + "grad_norm": 22.452150344848633, + "learning_rate": 5.791058605513088e-08, + "logits/chosen": -0.6134439706802368, + "logits/rejected": -1.111847162246704, + "logps/chosen": -878.875, + "logps/rejected": -473.634033203125, + "loss": 0.6708, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04776840656995773, + "rewards/margins": 0.04841823875904083, + "rewards/rejected": -0.0006498340517282486, + "step": 50 + }, + { + "epoch": 0.021267723102585488, + "grad_norm": 40.824440002441406, + "learning_rate": 5.90687977762335e-08, + "logits/chosen": -0.9156632423400879, + "logits/rejected": -0.8153332471847534, + "logps/chosen": -1104.3662109375, + "logps/rejected": -584.6292114257812, + "loss": 0.6682, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0022750841453671455, + "rewards/margins": 0.05793265998363495, + "rewards/rejected": -0.05565757304430008, + "step": 51 + }, + { + "epoch": 0.021684737281067557, + "grad_norm": 119.54183959960938, + "learning_rate": 6.022700949733612e-08, + "logits/chosen": -0.7227405905723572, + "logits/rejected": -0.6297723054885864, + "logps/chosen": -821.8289184570312, + "logps/rejected": -378.9083251953125, + "loss": 0.695, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.00782089214771986, + "rewards/margins": -0.0024902336299419403, + "rewards/rejected": 0.0103111257776618, + "step": 52 + }, + { + "epoch": 0.022101751459549623, + "grad_norm": 22.807153701782227, + "learning_rate": 6.138522121843873e-08, + "logits/chosen": -0.6195705533027649, + "logits/rejected": -0.7850760221481323, + "logps/chosen": -1142.4130859375, + "logps/rejected": -548.1829833984375, + "loss": 0.684, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.022368624806404114, + "rewards/margins": 0.023282624781131744, + "rewards/rejected": -0.04565124213695526, + "step": 53 + }, + { + "epoch": 0.022518765638031693, + "grad_norm": 24.95684814453125, + "learning_rate": 6.254343293954136e-08, + "logits/chosen": -1.0281580686569214, + "logits/rejected": -0.8457854986190796, + "logps/chosen": -1430.1173095703125, + "logps/rejected": -844.511474609375, + "loss": 0.753, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08716888725757599, + "rewards/margins": -0.10775566101074219, + "rewards/rejected": 0.020586777478456497, + "step": 54 + }, + { + "epoch": 0.022935779816513763, + "grad_norm": 18.823034286499023, + "learning_rate": 6.370164466064397e-08, + "logits/chosen": -0.8098901510238647, + "logits/rejected": -0.9913021326065063, + "logps/chosen": -898.1552734375, + "logps/rejected": -486.6142883300781, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020023727789521217, + "rewards/margins": 0.008649444207549095, + "rewards/rejected": 0.011374283581972122, + "step": 55 + }, + { + "epoch": 0.02335279399499583, + "grad_norm": 21.520174026489258, + "learning_rate": 6.485985638174659e-08, + "logits/chosen": -1.7405438423156738, + "logits/rejected": -0.8756951093673706, + "logps/chosen": -1051.8515625, + "logps/rejected": -562.9490966796875, + "loss": 0.6741, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.040479280054569244, + "rewards/margins": 0.044184111058712006, + "rewards/rejected": -0.0037048333324491978, + "step": 56 + }, + { + "epoch": 0.0237698081734779, + "grad_norm": 40.11115264892578, + "learning_rate": 6.60180681028492e-08, + "logits/chosen": -1.0035383701324463, + "logits/rejected": -1.0949254035949707, + "logps/chosen": -878.2860717773438, + "logps/rejected": -286.619140625, + "loss": 0.7197, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.018975067883729935, + "rewards/margins": -0.04622821509838104, + "rewards/rejected": 0.027253154665231705, + "step": 57 + }, + { + "epoch": 0.02418682235195997, + "grad_norm": 30.443204879760742, + "learning_rate": 6.717627982395183e-08, + "logits/chosen": -0.933469831943512, + "logits/rejected": -0.8008683919906616, + "logps/chosen": -1202.5306396484375, + "logps/rejected": -576.6199951171875, + "loss": 0.6666, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03037872537970543, + "rewards/margins": 0.05893173813819885, + "rewards/rejected": -0.028553010895848274, + "step": 58 + }, + { + "epoch": 0.024603836530442035, + "grad_norm": 18.52507781982422, + "learning_rate": 6.833449154505444e-08, + "logits/chosen": -0.7624117136001587, + "logits/rejected": -0.9757118225097656, + "logps/chosen": -916.1500244140625, + "logps/rejected": -489.7148742675781, + "loss": 0.6937, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0170135498046875, + "rewards/margins": 0.002638434525579214, + "rewards/rejected": 0.014375115744769573, + "step": 59 + }, + { + "epoch": 0.025020850708924104, + "grad_norm": 131.59783935546875, + "learning_rate": 6.949270326615706e-08, + "logits/chosen": -1.2870289087295532, + "logits/rejected": -0.8154935240745544, + "logps/chosen": -1169.3388671875, + "logps/rejected": -566.4340209960938, + "loss": 0.7459, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.06500548869371414, + "rewards/margins": -0.10119171440601349, + "rewards/rejected": 0.03618621826171875, + "step": 60 + }, + { + "epoch": 0.02543786488740617, + "grad_norm": 15.77666187286377, + "learning_rate": 7.065091498725967e-08, + "logits/chosen": -1.2692936658859253, + "logits/rejected": -0.9715656638145447, + "logps/chosen": -865.5953979492188, + "logps/rejected": -439.8361511230469, + "loss": 0.6667, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006563948467373848, + "rewards/margins": 0.058458711951971054, + "rewards/rejected": -0.05189476162195206, + "step": 61 + }, + { + "epoch": 0.02585487906588824, + "grad_norm": 18.79291534423828, + "learning_rate": 7.18091267083623e-08, + "logits/chosen": -1.3332494497299194, + "logits/rejected": NaN, + "logps/chosen": -891.9952392578125, + "logps/rejected": -222.16107177734375, + "loss": 0.7056, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0027534477412700653, + "rewards/margins": -0.023471832275390625, + "rewards/rejected": 0.02622528187930584, + "step": 62 + }, + { + "epoch": 0.02627189324437031, + "grad_norm": 24.714696884155273, + "learning_rate": 7.296733842946491e-08, + "logits/chosen": -0.790179967880249, + "logits/rejected": -1.0911731719970703, + "logps/chosen": -1082.45947265625, + "logps/rejected": -401.8116455078125, + "loss": 0.7321, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04286956787109375, + "rewards/margins": -0.06772708892822266, + "rewards/rejected": 0.024857521057128906, + "step": 63 + }, + { + "epoch": 0.026688907422852376, + "grad_norm": 17.87702178955078, + "learning_rate": 7.412555015056752e-08, + "logits/chosen": -1.3301479816436768, + "logits/rejected": -0.6269656419754028, + "logps/chosen": -1212.1014404296875, + "logps/rejected": -642.0003051757812, + "loss": 0.7153, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04098358377814293, + "rewards/margins": -0.04170722886919975, + "rewards/rejected": 0.0007236474193632603, + "step": 64 + }, + { + "epoch": 0.027105921601334446, + "grad_norm": 51.042030334472656, + "learning_rate": 7.528376187167015e-08, + "logits/chosen": -0.9647114276885986, + "logits/rejected": -0.9835405349731445, + "logps/chosen": -935.2518310546875, + "logps/rejected": -550.3818969726562, + "loss": 0.638, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1552295684814453, + "rewards/margins": 0.12221603095531464, + "rewards/rejected": 0.033013537526130676, + "step": 65 + }, + { + "epoch": 0.027522935779816515, + "grad_norm": 18.89752197265625, + "learning_rate": 7.644197359277277e-08, + "logits/chosen": -0.3916284441947937, + "logits/rejected": -0.7384779453277588, + "logps/chosen": -1268.8743896484375, + "logps/rejected": -597.4725952148438, + "loss": 0.6878, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004918668419122696, + "rewards/margins": 0.01543598435819149, + "rewards/rejected": -0.020354654639959335, + "step": 66 + }, + { + "epoch": 0.02793994995829858, + "grad_norm": 21.641490936279297, + "learning_rate": 7.760018531387538e-08, + "logits/chosen": -0.6784153580665588, + "logits/rejected": -0.8281925320625305, + "logps/chosen": -1177.588623046875, + "logps/rejected": -552.3471069335938, + "loss": 0.6742, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06276093423366547, + "rewards/margins": 0.04310645908117294, + "rewards/rejected": 0.019654463976621628, + "step": 67 + }, + { + "epoch": 0.02835696413678065, + "grad_norm": 21.31064796447754, + "learning_rate": 7.8758397034978e-08, + "logits/chosen": -0.767756998538971, + "logits/rejected": -0.8618663549423218, + "logps/chosen": -1269.63671875, + "logps/rejected": -679.4761962890625, + "loss": 0.677, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04372711479663849, + "rewards/margins": 0.03530922159552574, + "rewards/rejected": 0.008417891338467598, + "step": 68 + }, + { + "epoch": 0.028773978315262717, + "grad_norm": 91.07283020019531, + "learning_rate": 7.991660875608062e-08, + "logits/chosen": -1.3356010913848877, + "logits/rejected": -0.8103543519973755, + "logps/chosen": -1098.5870361328125, + "logps/rejected": -550.8215942382812, + "loss": 0.7058, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.015134810470044613, + "rewards/margins": -0.021274566650390625, + "rewards/rejected": 0.00613975478336215, + "step": 69 + }, + { + "epoch": 0.029190992493744787, + "grad_norm": 13.829802513122559, + "learning_rate": 8.107482047718324e-08, + "logits/chosen": -0.6501033902168274, + "logits/rejected": -1.2505512237548828, + "logps/chosen": -724.3450927734375, + "logps/rejected": -388.98052978515625, + "loss": 0.6467, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08526763319969177, + "rewards/margins": 0.09667377918958664, + "rewards/rejected": -0.011406135745346546, + "step": 70 + }, + { + "epoch": 0.029608006672226857, + "grad_norm": 16.855070114135742, + "learning_rate": 8.223303219828586e-08, + "logits/chosen": -0.1636202335357666, + "logits/rejected": -1.0385526418685913, + "logps/chosen": -853.0540771484375, + "logps/rejected": -453.39678955078125, + "loss": 0.6841, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0026359576731920242, + "rewards/margins": 0.021498393267393112, + "rewards/rejected": -0.024134350940585136, + "step": 71 + }, + { + "epoch": 0.030025020850708923, + "grad_norm": 15.928338050842285, + "learning_rate": 8.339124391938846e-08, + "logits/chosen": -0.779632031917572, + "logits/rejected": -0.9878882169723511, + "logps/chosen": -909.8008422851562, + "logps/rejected": -489.654541015625, + "loss": 0.6724, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.050704196095466614, + "rewards/margins": 0.04584044963121414, + "rewards/rejected": 0.00486373994499445, + "step": 72 + }, + { + "epoch": 0.030442035029190993, + "grad_norm": 60.98784255981445, + "learning_rate": 8.454945564049109e-08, + "logits/chosen": -0.8013566732406616, + "logits/rejected": -1.0198818445205688, + "logps/chosen": -997.0302734375, + "logps/rejected": -456.8519592285156, + "loss": 0.6707, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.041598130017519, + "rewards/margins": 0.04826202988624573, + "rewards/rejected": -0.006663896143436432, + "step": 73 + }, + { + "epoch": 0.030859049207673062, + "grad_norm": 21.400890350341797, + "learning_rate": 8.57076673615937e-08, + "logits/chosen": -0.7362083196640015, + "logits/rejected": -0.6525900363922119, + "logps/chosen": -1088.894775390625, + "logps/rejected": -646.3641967773438, + "loss": 0.6966, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02752990834414959, + "rewards/margins": 0.0026931767351925373, + "rewards/rejected": -0.03022308461368084, + "step": 74 + }, + { + "epoch": 0.03127606338615513, + "grad_norm": 16.7918758392334, + "learning_rate": 8.686587908269633e-08, + "logits/chosen": -0.28360724449157715, + "logits/rejected": -1.1354385614395142, + "logps/chosen": -1128.63330078125, + "logps/rejected": -638.0317993164062, + "loss": 0.6461, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01493454072624445, + "rewards/margins": 0.09858398139476776, + "rewards/rejected": -0.08364944159984589, + "step": 75 + }, + { + "epoch": 0.0316930775646372, + "grad_norm": 108.72737121582031, + "learning_rate": 8.802409080379893e-08, + "logits/chosen": -0.5492386221885681, + "logits/rejected": -0.8831167817115784, + "logps/chosen": -1261.10205078125, + "logps/rejected": -595.3948364257812, + "loss": 0.7613, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.06007843464612961, + "rewards/margins": -0.1255725920200348, + "rewards/rejected": 0.06549415737390518, + "step": 76 + }, + { + "epoch": 0.03211009174311927, + "grad_norm": 17.26358985900879, + "learning_rate": 8.918230252490156e-08, + "logits/chosen": -0.929674506187439, + "logits/rejected": -1.2790333032608032, + "logps/chosen": -835.6614990234375, + "logps/rejected": -423.5019836425781, + "loss": 0.7032, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.015805434435606003, + "rewards/margins": -0.018981169909238815, + "rewards/rejected": 0.003175734542310238, + "step": 77 + }, + { + "epoch": 0.03252710592160134, + "grad_norm": 65.98795318603516, + "learning_rate": 9.034051424600417e-08, + "logits/chosen": -0.8532543182373047, + "logits/rejected": -0.8709993362426758, + "logps/chosen": -994.849853515625, + "logps/rejected": -501.1091613769531, + "loss": 0.6869, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.005176546052098274, + "rewards/margins": 0.014560317620635033, + "rewards/rejected": -0.009383775293827057, + "step": 78 + }, + { + "epoch": 0.0329441201000834, + "grad_norm": 37.07647705078125, + "learning_rate": 9.14987259671068e-08, + "logits/chosen": -1.000131607055664, + "logits/rejected": NaN, + "logps/chosen": -1071.8184814453125, + "logps/rejected": -398.84637451171875, + "loss": 0.7083, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.015410233289003372, + "rewards/margins": -0.027819253504276276, + "rewards/rejected": 0.012409019283950329, + "step": 79 + }, + { + "epoch": 0.03336113427856547, + "grad_norm": 17.242448806762695, + "learning_rate": 9.265693768820942e-08, + "logits/chosen": -0.9561635851860046, + "logits/rejected": -1.195489525794983, + "logps/chosen": -910.5739135742188, + "logps/rejected": -487.1072998046875, + "loss": 0.6226, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12011795490980148, + "rewards/margins": 0.15376168489456177, + "rewards/rejected": -0.03364372253417969, + "step": 80 + }, + { + "epoch": 0.03377814845704754, + "grad_norm": 86.44615173339844, + "learning_rate": 9.381514940931202e-08, + "logits/chosen": -1.3450794219970703, + "logits/rejected": -0.7903304696083069, + "logps/chosen": -1182.44775390625, + "logps/rejected": -495.9226989746094, + "loss": 0.7123, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.064722441136837, + "rewards/margins": -0.034734535962343216, + "rewards/rejected": -0.02998790703713894, + "step": 81 + }, + { + "epoch": 0.03419516263552961, + "grad_norm": 26.98616600036621, + "learning_rate": 9.497336113041464e-08, + "logits/chosen": -1.4685529470443726, + "logits/rejected": -0.7570338249206543, + "logps/chosen": -940.4725952148438, + "logps/rejected": -410.765869140625, + "loss": 0.7179, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07790184020996094, + "rewards/margins": -0.04334831237792969, + "rewards/rejected": -0.03455352783203125, + "step": 82 + }, + { + "epoch": 0.03461217681401168, + "grad_norm": 15.285861015319824, + "learning_rate": 9.613157285151727e-08, + "logits/chosen": -0.9712526798248291, + "logits/rejected": NaN, + "logps/chosen": -868.490478515625, + "logps/rejected": -383.4150085449219, + "loss": 0.6732, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.029228974133729935, + "rewards/margins": 0.04201774671673775, + "rewards/rejected": -0.012788772583007812, + "step": 83 + }, + { + "epoch": 0.03502919099249374, + "grad_norm": 22.275808334350586, + "learning_rate": 9.728978457261989e-08, + "logits/chosen": -0.7350895404815674, + "logits/rejected": -1.1268417835235596, + "logps/chosen": -972.3436279296875, + "logps/rejected": -350.9463195800781, + "loss": 0.7263, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.005471039563417435, + "rewards/margins": -0.06386680901050568, + "rewards/rejected": 0.05839576572179794, + "step": 84 + }, + { + "epoch": 0.03544620517097581, + "grad_norm": 74.2577896118164, + "learning_rate": 9.844799629372251e-08, + "logits/chosen": -0.9534342885017395, + "logits/rejected": -0.8625384569168091, + "logps/chosen": -1110.146240234375, + "logps/rejected": -500.42974853515625, + "loss": 0.702, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0138893136754632, + "rewards/margins": -0.012622452341020107, + "rewards/rejected": -0.0012668594717979431, + "step": 85 + }, + { + "epoch": 0.03586321934945788, + "grad_norm": 19.555416107177734, + "learning_rate": 9.960620801482511e-08, + "logits/chosen": -1.290772557258606, + "logits/rejected": -0.6522356271743774, + "logps/chosen": -1282.969970703125, + "logps/rejected": -676.4635620117188, + "loss": 0.7229, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.02444152720272541, + "rewards/margins": -0.05384979397058487, + "rewards/rejected": 0.02940826676785946, + "step": 86 + }, + { + "epoch": 0.03628023352793995, + "grad_norm": 16.333271026611328, + "learning_rate": 1.0076441973592773e-07, + "logits/chosen": -0.7311081886291504, + "logits/rejected": -0.8354039192199707, + "logps/chosen": -869.994140625, + "logps/rejected": -509.25408935546875, + "loss": 0.7077, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.005273055285215378, + "rewards/margins": -0.02354850433766842, + "rewards/rejected": 0.01827545091509819, + "step": 87 + }, + { + "epoch": 0.03669724770642202, + "grad_norm": 18.380983352661133, + "learning_rate": 1.0192263145703035e-07, + "logits/chosen": -1.1401082277297974, + "logits/rejected": -1.132318377494812, + "logps/chosen": -1008.6221313476562, + "logps/rejected": -477.565185546875, + "loss": 0.6943, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.014961625449359417, + "rewards/margins": 0.0014553042128682137, + "rewards/rejected": 0.013506315648555756, + "step": 88 + }, + { + "epoch": 0.03711426188490409, + "grad_norm": 26.646682739257812, + "learning_rate": 1.0308084317813298e-07, + "logits/chosen": -1.3610093593597412, + "logits/rejected": -0.6138274669647217, + "logps/chosen": -988.4589233398438, + "logps/rejected": -325.4477233886719, + "loss": 0.7059, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.00020523217972368002, + "rewards/margins": -0.02002449333667755, + "rewards/rejected": 0.020229723304510117, + "step": 89 + }, + { + "epoch": 0.03753127606338615, + "grad_norm": 21.537296295166016, + "learning_rate": 1.0423905489923558e-07, + "logits/chosen": -0.3670506477355957, + "logits/rejected": -1.2307283878326416, + "logps/chosen": -882.1390380859375, + "logps/rejected": -452.778564453125, + "loss": 0.6991, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.015892794355750084, + "rewards/margins": -0.007961275056004524, + "rewards/rejected": 0.023854069411754608, + "step": 90 + }, + { + "epoch": 0.03794829024186822, + "grad_norm": 17.17719841003418, + "learning_rate": 1.053972666203382e-07, + "logits/chosen": -0.9104897975921631, + "logits/rejected": NaN, + "logps/chosen": -913.6765747070312, + "logps/rejected": -352.795166015625, + "loss": 0.6878, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0410614013671875, + "rewards/margins": 0.0147705078125, + "rewards/rejected": 0.02629089541733265, + "step": 91 + }, + { + "epoch": 0.03836530442035029, + "grad_norm": 17.480484008789062, + "learning_rate": 1.0655547834144082e-07, + "logits/chosen": -1.4227473735809326, + "logits/rejected": NaN, + "logps/chosen": -1056.820556640625, + "logps/rejected": -269.52423095703125, + "loss": 0.7323, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0858863815665245, + "rewards/margins": -0.07235755771398544, + "rewards/rejected": -0.013528823852539062, + "step": 92 + }, + { + "epoch": 0.03878231859883236, + "grad_norm": 15.149821281433105, + "learning_rate": 1.0771369006254345e-07, + "logits/chosen": -0.47390222549438477, + "logits/rejected": -1.037414789199829, + "logps/chosen": -846.0308837890625, + "logps/rejected": -547.6422119140625, + "loss": 0.6636, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.045998383313417435, + "rewards/margins": 0.06256867200136185, + "rewards/rejected": -0.016570281237363815, + "step": 93 + }, + { + "epoch": 0.03919933277731443, + "grad_norm": 21.305984497070312, + "learning_rate": 1.0887190178364607e-07, + "logits/chosen": -1.0670357942581177, + "logits/rejected": NaN, + "logps/chosen": -1157.65673828125, + "logps/rejected": -511.64569091796875, + "loss": 0.7143, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03162231296300888, + "rewards/margins": -0.04006042331457138, + "rewards/rejected": 0.0084381103515625, + "step": 94 + }, + { + "epoch": 0.039616346955796494, + "grad_norm": 23.730026245117188, + "learning_rate": 1.1003011350474867e-07, + "logits/chosen": -0.3061126470565796, + "logits/rejected": -1.0413663387298584, + "logps/chosen": -1304.73681640625, + "logps/rejected": -592.681396484375, + "loss": 0.6926, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008296966552734375, + "rewards/margins": 0.005216214805841446, + "rewards/rejected": 0.0030807480216026306, + "step": 95 + }, + { + "epoch": 0.040033361134278564, + "grad_norm": 19.358613967895508, + "learning_rate": 1.111883252258513e-07, + "logits/chosen": -0.8130037784576416, + "logits/rejected": -1.1115543842315674, + "logps/chosen": -1018.016357421875, + "logps/rejected": -552.5582275390625, + "loss": 0.673, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0013061519712209702, + "rewards/margins": 0.04369983822107315, + "rewards/rejected": -0.04500599205493927, + "step": 96 + }, + { + "epoch": 0.040450375312760634, + "grad_norm": 15.860090255737305, + "learning_rate": 1.1234653694695391e-07, + "logits/chosen": -1.1262555122375488, + "logits/rejected": -0.8223533630371094, + "logps/chosen": -1017.1997680664062, + "logps/rejected": -603.900390625, + "loss": 0.7039, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.022472381591796875, + "rewards/margins": -0.0180511474609375, + "rewards/rejected": -0.0044212331995368, + "step": 97 + }, + { + "epoch": 0.0408673894912427, + "grad_norm": 17.37933349609375, + "learning_rate": 1.1350474866805654e-07, + "logits/chosen": -0.24534295499324799, + "logits/rejected": -0.5598949790000916, + "logps/chosen": -1198.012939453125, + "logps/rejected": -765.760986328125, + "loss": 0.6503, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06968384236097336, + "rewards/margins": 0.09195518493652344, + "rewards/rejected": -0.022271348163485527, + "step": 98 + }, + { + "epoch": 0.04128440366972477, + "grad_norm": 17.47412109375, + "learning_rate": 1.1466296038915914e-07, + "logits/chosen": -0.3964734971523285, + "logits/rejected": -1.1009342670440674, + "logps/chosen": -840.9531860351562, + "logps/rejected": -473.5782165527344, + "loss": 0.6991, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024275969713926315, + "rewards/margins": -0.010202025063335896, + "rewards/rejected": 0.034477997571229935, + "step": 99 + }, + { + "epoch": 0.041701417848206836, + "grad_norm": 19.261178970336914, + "learning_rate": 1.1582117211026176e-07, + "logits/chosen": -0.23722217977046967, + "logits/rejected": -1.2409905195236206, + "logps/chosen": -921.0654296875, + "logps/rejected": -545.7203979492188, + "loss": 0.7177, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.010101322084665298, + "rewards/margins": -0.0414833128452301, + "rewards/rejected": 0.031381990760564804, + "step": 100 + }, + { + "epoch": 0.042118432026688905, + "grad_norm": 91.9643783569336, + "learning_rate": 1.1697938383136438e-07, + "logits/chosen": -1.9062199592590332, + "logits/rejected": NaN, + "logps/chosen": -1113.515869140625, + "logps/rejected": -446.49554443359375, + "loss": 0.7205, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04626350477337837, + "rewards/margins": -0.04530840739607811, + "rewards/rejected": -0.0009551048278808594, + "step": 101 + }, + { + "epoch": 0.042535446205170975, + "grad_norm": 18.060379028320312, + "learning_rate": 1.18137595552467e-07, + "logits/chosen": -0.4446738660335541, + "logits/rejected": -1.1742877960205078, + "logps/chosen": -691.2208251953125, + "logps/rejected": -435.8133239746094, + "loss": 0.7065, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.025057602673768997, + "rewards/margins": -0.014395520091056824, + "rewards/rejected": 0.03945312649011612, + "step": 102 + }, + { + "epoch": 0.042952460383653045, + "grad_norm": 14.427650451660156, + "learning_rate": 1.1929580727356962e-07, + "logits/chosen": -0.6077316999435425, + "logits/rejected": -1.0485197305679321, + "logps/chosen": -815.0374145507812, + "logps/rejected": -476.0773010253906, + "loss": 0.6294, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09681206196546555, + "rewards/margins": 0.13536511361598969, + "rewards/rejected": -0.03855305165052414, + "step": 103 + }, + { + "epoch": 0.043369474562135114, + "grad_norm": 15.403131484985352, + "learning_rate": 1.2045401899467223e-07, + "logits/chosen": -0.324718713760376, + "logits/rejected": -1.1426432132720947, + "logps/chosen": -838.029296875, + "logps/rejected": -457.87701416015625, + "loss": 0.6631, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08718490600585938, + "rewards/margins": 0.06446647644042969, + "rewards/rejected": 0.022718429565429688, + "step": 104 + }, + { + "epoch": 0.043786488740617184, + "grad_norm": 15.961088180541992, + "learning_rate": 1.2161223071577485e-07, + "logits/chosen": -1.0842293500900269, + "logits/rejected": -1.015629529953003, + "logps/chosen": -982.311767578125, + "logps/rejected": -480.48052978515625, + "loss": 0.7073, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04700622335076332, + "rewards/margins": -0.02630767785012722, + "rewards/rejected": -0.02069854736328125, + "step": 105 + }, + { + "epoch": 0.04420350291909925, + "grad_norm": 18.914470672607422, + "learning_rate": 1.2277044243687746e-07, + "logits/chosen": -0.26083576679229736, + "logits/rejected": -0.737055778503418, + "logps/chosen": -1355.3277587890625, + "logps/rejected": -762.8406372070312, + "loss": 0.6496, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06643372029066086, + "rewards/margins": 0.09570293128490448, + "rewards/rejected": -0.029269222170114517, + "step": 106 + }, + { + "epoch": 0.044620517097581316, + "grad_norm": 13.108752250671387, + "learning_rate": 1.239286541579801e-07, + "logits/chosen": -0.5288923978805542, + "logits/rejected": -1.102329969406128, + "logps/chosen": -940.214599609375, + "logps/rejected": -622.1988525390625, + "loss": 0.7085, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0017730696126818657, + "rewards/margins": -0.027761459350585938, + "rewards/rejected": 0.029534529894590378, + "step": 107 + }, + { + "epoch": 0.045037531276063386, + "grad_norm": 16.061426162719727, + "learning_rate": 1.2508686587908272e-07, + "logits/chosen": -0.32691699266433716, + "logits/rejected": -0.9782832860946655, + "logps/chosen": -933.0322265625, + "logps/rejected": -514.4949951171875, + "loss": 0.7021, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.01657257042825222, + "rewards/margins": -0.014153673313558102, + "rewards/rejected": 0.030726242810487747, + "step": 108 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 15.34627914428711, + "learning_rate": 1.2624507760018533e-07, + "logits/chosen": -0.394502192735672, + "logits/rejected": -1.0906426906585693, + "logps/chosen": -775.928466796875, + "logps/rejected": -478.93780517578125, + "loss": 0.717, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0006156950257718563, + "rewards/margins": -0.04375400394201279, + "rewards/rejected": 0.04436969757080078, + "step": 109 + }, + { + "epoch": 0.045871559633027525, + "grad_norm": 53.971031188964844, + "learning_rate": 1.2740328932128794e-07, + "logits/chosen": -0.5428272485733032, + "logits/rejected": -0.9307764768600464, + "logps/chosen": -1109.373046875, + "logps/rejected": -673.364990234375, + "loss": 0.7024, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.022211456671357155, + "rewards/margins": -0.010321427136659622, + "rewards/rejected": -0.011890028603374958, + "step": 110 + }, + { + "epoch": 0.04628857381150959, + "grad_norm": 20.645055770874023, + "learning_rate": 1.2856150104239056e-07, + "logits/chosen": -0.9106751680374146, + "logits/rejected": -0.9916196465492249, + "logps/chosen": -919.875, + "logps/rejected": -460.3050842285156, + "loss": 0.6782, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004055023193359375, + "rewards/margins": 0.039391327649354935, + "rewards/rejected": -0.04344634711742401, + "step": 111 + }, + { + "epoch": 0.04670558798999166, + "grad_norm": 16.949172973632812, + "learning_rate": 1.2971971276349317e-07, + "logits/chosen": -0.8086934089660645, + "logits/rejected": -0.9770311117172241, + "logps/chosen": -1150.049072265625, + "logps/rejected": -582.0318603515625, + "loss": 0.7082, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0089950580149889, + "rewards/margins": -0.024771500378847122, + "rewards/rejected": 0.015776444226503372, + "step": 112 + }, + { + "epoch": 0.04712260216847373, + "grad_norm": 16.83028221130371, + "learning_rate": 1.3087792448459579e-07, + "logits/chosen": -1.0879194736480713, + "logits/rejected": NaN, + "logps/chosen": -917.09228515625, + "logps/rejected": -445.5961608886719, + "loss": 0.6832, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01571807637810707, + "rewards/margins": 0.0258331298828125, + "rewards/rejected": -0.01011505164206028, + "step": 113 + }, + { + "epoch": 0.0475396163469558, + "grad_norm": 15.415282249450684, + "learning_rate": 1.320361362056984e-07, + "logits/chosen": -0.29015326499938965, + "logits/rejected": -0.9449247717857361, + "logps/chosen": -678.3671264648438, + "logps/rejected": -372.5124206542969, + "loss": 0.6942, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.020849991589784622, + "rewards/margins": 6.885640323162079e-05, + "rewards/rejected": 0.020781135186553, + "step": 114 + }, + { + "epoch": 0.04795663052543787, + "grad_norm": 15.575323104858398, + "learning_rate": 1.3319434792680104e-07, + "logits/chosen": -0.34743446111679077, + "logits/rejected": -0.9842329025268555, + "logps/chosen": -1014.93310546875, + "logps/rejected": -603.7559814453125, + "loss": 0.6905, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05037841945886612, + "rewards/margins": 0.011532217264175415, + "rewards/rejected": 0.038846205919981, + "step": 115 + }, + { + "epoch": 0.04837364470391994, + "grad_norm": 14.344321250915527, + "learning_rate": 1.3435255964790365e-07, + "logits/chosen": -0.9816644191741943, + "logits/rejected": -1.1351289749145508, + "logps/chosen": -723.10400390625, + "logps/rejected": -378.32318115234375, + "loss": 0.6651, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03587837144732475, + "rewards/margins": 0.059589579701423645, + "rewards/rejected": -0.023711202666163445, + "step": 116 + }, + { + "epoch": 0.048790658882402, + "grad_norm": 48.08913803100586, + "learning_rate": 1.3551077136900627e-07, + "logits/chosen": -1.3095242977142334, + "logits/rejected": -0.714737594127655, + "logps/chosen": -1391.7171630859375, + "logps/rejected": -680.27490234375, + "loss": 0.6673, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04538116604089737, + "rewards/margins": 0.05721550062298775, + "rewards/rejected": -0.011834336444735527, + "step": 117 + }, + { + "epoch": 0.04920767306088407, + "grad_norm": 18.102249145507812, + "learning_rate": 1.3666898309010888e-07, + "logits/chosen": -1.0817623138427734, + "logits/rejected": -0.7686195373535156, + "logps/chosen": -1101.66162109375, + "logps/rejected": -625.8189086914062, + "loss": 0.702, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0008323667570948601, + "rewards/margins": -0.014323808252811432, + "rewards/rejected": 0.015156174078583717, + "step": 118 + }, + { + "epoch": 0.04962468723936614, + "grad_norm": 19.966508865356445, + "learning_rate": 1.378271948112115e-07, + "logits/chosen": -0.9210702180862427, + "logits/rejected": -0.9271761178970337, + "logps/chosen": -898.1781616210938, + "logps/rejected": -353.2750244140625, + "loss": 0.7265, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.038298800587654114, + "rewards/margins": -0.06197242811322212, + "rewards/rejected": 0.02367362752556801, + "step": 119 + }, + { + "epoch": 0.05004170141784821, + "grad_norm": 22.214035034179688, + "learning_rate": 1.389854065323141e-07, + "logits/chosen": -1.0294233560562134, + "logits/rejected": -0.7662121057510376, + "logps/chosen": -1220.337890625, + "logps/rejected": -559.8800659179688, + "loss": 0.6591, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07535400241613388, + "rewards/margins": 0.07475834339857101, + "rewards/rejected": 0.0005956660024821758, + "step": 120 + }, + { + "epoch": 0.05045871559633028, + "grad_norm": 15.8386869430542, + "learning_rate": 1.4014361825341675e-07, + "logits/chosen": -0.18675629794597626, + "logits/rejected": -0.9903164505958557, + "logps/chosen": -983.67138671875, + "logps/rejected": -632.8486328125, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.01935577392578125, + "rewards/margins": 0.0043163299560546875, + "rewards/rejected": -0.023672107607126236, + "step": 121 + }, + { + "epoch": 0.05087572977481234, + "grad_norm": 19.275901794433594, + "learning_rate": 1.4130182997451934e-07, + "logits/chosen": -0.22933819890022278, + "logits/rejected": -0.9596302509307861, + "logps/chosen": -1219.72412109375, + "logps/rejected": -658.7528076171875, + "loss": 0.7003, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.024492645636200905, + "rewards/margins": -0.011378096416592598, + "rewards/rejected": -0.013114548288285732, + "step": 122 + }, + { + "epoch": 0.05129274395329441, + "grad_norm": 20.673608779907227, + "learning_rate": 1.4246004169562198e-07, + "logits/chosen": -0.25747668743133545, + "logits/rejected": -1.1251367330551147, + "logps/chosen": -808.6586303710938, + "logps/rejected": -430.3555603027344, + "loss": 0.6906, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.008142853155732155, + "rewards/margins": 0.009696964174509048, + "rewards/rejected": -0.017839815467596054, + "step": 123 + }, + { + "epoch": 0.05170975813177648, + "grad_norm": 15.72778034210205, + "learning_rate": 1.436182534167246e-07, + "logits/chosen": -0.3682924807071686, + "logits/rejected": -1.1229493618011475, + "logps/chosen": -766.9910278320312, + "logps/rejected": -406.4599609375, + "loss": 0.7172, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.040433116257190704, + "rewards/margins": -0.04248886555433273, + "rewards/rejected": 0.002055739052593708, + "step": 124 + }, + { + "epoch": 0.05212677231025855, + "grad_norm": 38.49226760864258, + "learning_rate": 1.447764651378272e-07, + "logits/chosen": -0.968271791934967, + "logits/rejected": -0.8008549213409424, + "logps/chosen": -1273.470947265625, + "logps/rejected": -652.4854736328125, + "loss": 0.7451, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.05941467359662056, + "rewards/margins": -0.09326133131980896, + "rewards/rejected": 0.0338466614484787, + "step": 125 + }, + { + "epoch": 0.05254378648874062, + "grad_norm": 16.2724666595459, + "learning_rate": 1.4593467685892982e-07, + "logits/chosen": -0.8043951988220215, + "logits/rejected": -0.9435769319534302, + "logps/chosen": -884.66015625, + "logps/rejected": -509.9393005371094, + "loss": 0.6957, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.036594390869140625, + "rewards/margins": -0.0023818952031433582, + "rewards/rejected": 0.038976289331912994, + "step": 126 + }, + { + "epoch": 0.05296080066722268, + "grad_norm": 24.529891967773438, + "learning_rate": 1.4709288858003244e-07, + "logits/chosen": -1.4285173416137695, + "logits/rejected": -0.8821724057197571, + "logps/chosen": -1135.6915283203125, + "logps/rejected": -509.512939453125, + "loss": 0.7222, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.043817706406116486, + "rewards/margins": -0.049567222595214844, + "rewards/rejected": 0.005749511998146772, + "step": 127 + }, + { + "epoch": 0.05337781484570475, + "grad_norm": 18.098474502563477, + "learning_rate": 1.4825110030113505e-07, + "logits/chosen": -0.46148234605789185, + "logits/rejected": -1.1430015563964844, + "logps/chosen": -808.203125, + "logps/rejected": -435.9547424316406, + "loss": 0.7017, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.031984712928533554, + "rewards/margins": -0.016345787793397903, + "rewards/rejected": 0.04833050072193146, + "step": 128 + }, + { + "epoch": 0.05379482902418682, + "grad_norm": 13.038899421691895, + "learning_rate": 1.494093120222377e-07, + "logits/chosen": -1.012717843055725, + "logits/rejected": NaN, + "logps/chosen": -620.4797973632812, + "logps/rejected": -298.6163635253906, + "loss": 0.6819, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.045272063463926315, + "rewards/margins": 0.024580955505371094, + "rewards/rejected": 0.02069110982120037, + "step": 129 + }, + { + "epoch": 0.05421184320266889, + "grad_norm": 15.197786331176758, + "learning_rate": 1.505675237433403e-07, + "logits/chosen": -0.30757445096969604, + "logits/rejected": -0.9579191207885742, + "logps/chosen": -804.5476684570312, + "logps/rejected": -430.1116027832031, + "loss": 0.7012, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00217514019459486, + "rewards/margins": -0.012145616114139557, + "rewards/rejected": 0.014320755377411842, + "step": 130 + }, + { + "epoch": 0.05462885738115096, + "grad_norm": 16.18636131286621, + "learning_rate": 1.5172573546444292e-07, + "logits/chosen": -1.1975288391113281, + "logits/rejected": -0.7701647877693176, + "logps/chosen": -810.7947998046875, + "logps/rejected": -431.254150390625, + "loss": 0.7178, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03704643249511719, + "rewards/margins": -0.04701252281665802, + "rewards/rejected": 0.00996608566492796, + "step": 131 + }, + { + "epoch": 0.05504587155963303, + "grad_norm": 20.91913414001465, + "learning_rate": 1.5288394718554553e-07, + "logits/chosen": -0.5308362245559692, + "logits/rejected": -0.8783502578735352, + "logps/chosen": -778.7050170898438, + "logps/rejected": -457.10394287109375, + "loss": 0.7313, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.022925568744540215, + "rewards/margins": -0.07120818644762039, + "rewards/rejected": 0.04828261956572533, + "step": 132 + }, + { + "epoch": 0.05546288573811509, + "grad_norm": 17.371862411499023, + "learning_rate": 1.5404215890664815e-07, + "logits/chosen": -0.9245917797088623, + "logits/rejected": -1.0911917686462402, + "logps/chosen": -906.4658813476562, + "logps/rejected": -497.3775939941406, + "loss": 0.6538, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05284271016716957, + "rewards/margins": 0.08604927361011505, + "rewards/rejected": -0.03320655971765518, + "step": 133 + }, + { + "epoch": 0.05587989991659716, + "grad_norm": 16.90860366821289, + "learning_rate": 1.5520037062775076e-07, + "logits/chosen": -0.9303147792816162, + "logits/rejected": -1.0880553722381592, + "logps/chosen": -895.3506469726562, + "logps/rejected": -471.0828857421875, + "loss": 0.6921, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.009093474596738815, + "rewards/margins": 0.004067800473421812, + "rewards/rejected": 0.005025672726333141, + "step": 134 + }, + { + "epoch": 0.05629691409507923, + "grad_norm": 16.05911636352539, + "learning_rate": 1.5635858234885337e-07, + "logits/chosen": -0.22461439669132233, + "logits/rejected": -1.0137046575546265, + "logps/chosen": -914.1187744140625, + "logps/rejected": -511.53924560546875, + "loss": 0.6997, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.018575288355350494, + "rewards/margins": -0.008988186717033386, + "rewards/rejected": 0.02756347693502903, + "step": 135 + }, + { + "epoch": 0.0567139282735613, + "grad_norm": 16.63825798034668, + "learning_rate": 1.57516794069956e-07, + "logits/chosen": -1.0471822023391724, + "logits/rejected": -0.9069421887397766, + "logps/chosen": -844.0193481445312, + "logps/rejected": -337.95404052734375, + "loss": 0.6942, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03731193393468857, + "rewards/margins": 0.0009937267750501633, + "rewards/rejected": 0.036318209022283554, + "step": 136 + }, + { + "epoch": 0.05713094245204337, + "grad_norm": 16.143508911132812, + "learning_rate": 1.586750057910586e-07, + "logits/chosen": -0.39095550775527954, + "logits/rejected": -1.047715663909912, + "logps/chosen": -926.266845703125, + "logps/rejected": -484.08587646484375, + "loss": 0.6636, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07091446220874786, + "rewards/margins": 0.06880474090576172, + "rewards/rejected": 0.0021097175776958466, + "step": 137 + }, + { + "epoch": 0.057547956630525435, + "grad_norm": 25.503097534179688, + "learning_rate": 1.5983321751216124e-07, + "logits/chosen": -1.3059309720993042, + "logits/rejected": -0.6376598477363586, + "logps/chosen": -1180.940185546875, + "logps/rejected": -786.3671875, + "loss": 0.726, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04057159647345543, + "rewards/margins": -0.05779685825109482, + "rewards/rejected": 0.017225265502929688, + "step": 138 + }, + { + "epoch": 0.057964970809007504, + "grad_norm": 85.17687225341797, + "learning_rate": 1.6099142923326386e-07, + "logits/chosen": -0.9423524737358093, + "logits/rejected": -0.907435417175293, + "logps/chosen": -1002.1343994140625, + "logps/rejected": -561.3392944335938, + "loss": 0.6827, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05428696051239967, + "rewards/margins": 0.0379459410905838, + "rewards/rejected": 0.016341019421815872, + "step": 139 + }, + { + "epoch": 0.058381984987489574, + "grad_norm": 17.96173095703125, + "learning_rate": 1.6214964095436647e-07, + "logits/chosen": -0.24961334466934204, + "logits/rejected": -0.9326527714729309, + "logps/chosen": -1168.9599609375, + "logps/rejected": -644.4119873046875, + "loss": 0.6946, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01823120005428791, + "rewards/margins": 0.0007009468972682953, + "rewards/rejected": -0.018932152539491653, + "step": 140 + }, + { + "epoch": 0.058798999165971644, + "grad_norm": 14.371733665466309, + "learning_rate": 1.633078526754691e-07, + "logits/chosen": -0.31156066060066223, + "logits/rejected": -1.0210416316986084, + "logps/chosen": -878.0667724609375, + "logps/rejected": -473.63580322265625, + "loss": 0.7304, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05401420593261719, + "rewards/margins": -0.07010441273450851, + "rewards/rejected": 0.01609020307660103, + "step": 141 + }, + { + "epoch": 0.05921601334445371, + "grad_norm": 18.71565055847168, + "learning_rate": 1.6446606439657173e-07, + "logits/chosen": -0.2816826105117798, + "logits/rejected": -1.2145004272460938, + "logps/chosen": -983.5244140625, + "logps/rejected": -504.5870361328125, + "loss": 0.66, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.045204926282167435, + "rewards/margins": 0.07460404187440872, + "rewards/rejected": -0.02939911186695099, + "step": 142 + }, + { + "epoch": 0.05963302752293578, + "grad_norm": 16.249265670776367, + "learning_rate": 1.6562427611767431e-07, + "logits/chosen": -0.879584789276123, + "logits/rejected": -0.7893844842910767, + "logps/chosen": -973.1734008789062, + "logps/rejected": -458.14007568359375, + "loss": 0.6846, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.030928805470466614, + "rewards/margins": 0.024716762825846672, + "rewards/rejected": 0.006212042644619942, + "step": 143 + }, + { + "epoch": 0.060050041701417846, + "grad_norm": 2306.220703125, + "learning_rate": 1.6678248783877693e-07, + "logits/chosen": -0.857746958732605, + "logits/rejected": -1.0907026529312134, + "logps/chosen": -963.4865112304688, + "logps/rejected": -534.8985595703125, + "loss": 0.6696, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.024713898077607155, + "rewards/margins": 0.05052681267261505, + "rewards/rejected": -0.025812914595007896, + "step": 144 + }, + { + "epoch": 0.060467055879899916, + "grad_norm": 24.226587295532227, + "learning_rate": 1.6794069955987954e-07, + "logits/chosen": -1.1033821105957031, + "logits/rejected": -0.868928074836731, + "logps/chosen": -1221.96337890625, + "logps/rejected": -659.6525268554688, + "loss": 0.6786, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03473625332117081, + "rewards/margins": 0.0317203551530838, + "rewards/rejected": 0.0030158981680870056, + "step": 145 + }, + { + "epoch": 0.060884070058381985, + "grad_norm": 16.909944534301758, + "learning_rate": 1.6909891128098218e-07, + "logits/chosen": -0.22573509812355042, + "logits/rejected": -0.7512255907058716, + "logps/chosen": -1014.9890747070312, + "logps/rejected": -624.56689453125, + "loss": 0.71, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.050672151148319244, + "rewards/margins": -0.02603187784552574, + "rewards/rejected": -0.024640273302793503, + "step": 146 + }, + { + "epoch": 0.061301084236864055, + "grad_norm": 16.76839828491211, + "learning_rate": 1.702571230020848e-07, + "logits/chosen": -0.6073859333992004, + "logits/rejected": -1.0282937288284302, + "logps/chosen": -1147.544921875, + "logps/rejected": -744.5361938476562, + "loss": 0.6677, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.034712985157966614, + "rewards/margins": 0.059048086404800415, + "rewards/rejected": -0.024335097521543503, + "step": 147 + }, + { + "epoch": 0.061718098415346125, + "grad_norm": 13.075651168823242, + "learning_rate": 1.714153347231874e-07, + "logits/chosen": -0.803361177444458, + "logits/rejected": -1.1310362815856934, + "logps/chosen": -774.4264526367188, + "logps/rejected": -443.89410400390625, + "loss": 0.7231, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.050740811973810196, + "rewards/margins": -0.05279655382037163, + "rewards/rejected": 0.0020557399839162827, + "step": 148 + }, + { + "epoch": 0.06213511259382819, + "grad_norm": 27.12015724182129, + "learning_rate": 1.7257354644429002e-07, + "logits/chosen": -0.904604434967041, + "logits/rejected": -0.7471905946731567, + "logps/chosen": -1159.098388671875, + "logps/rejected": -448.0513610839844, + "loss": 0.7058, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.013756181113421917, + "rewards/margins": -0.015613172203302383, + "rewards/rejected": 0.029369352385401726, + "step": 149 + }, + { + "epoch": 0.06255212677231026, + "grad_norm": 74.9928970336914, + "learning_rate": 1.7373175816539266e-07, + "logits/chosen": -1.4024608135223389, + "logits/rejected": -0.5760458111763, + "logps/chosen": -1014.1785888671875, + "logps/rejected": -512.1514892578125, + "loss": 0.6917, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.022800447419285774, + "rewards/margins": 0.008842086419463158, + "rewards/rejected": -0.03164253383874893, + "step": 150 + }, + { + "epoch": 0.06296914095079233, + "grad_norm": 27.793537139892578, + "learning_rate": 1.7488996988649528e-07, + "logits/chosen": -1.652491569519043, + "logits/rejected": -0.3842669725418091, + "logps/chosen": -1309.1025390625, + "logps/rejected": -674.9772338867188, + "loss": 0.668, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.048065949231386185, + "rewards/margins": 0.05372238904237747, + "rewards/rejected": -0.0056564342230558395, + "step": 151 + }, + { + "epoch": 0.0633861551292744, + "grad_norm": 17.742572784423828, + "learning_rate": 1.7604818160759787e-07, + "logits/chosen": -0.22095084190368652, + "logits/rejected": -0.8693237900733948, + "logps/chosen": -885.8328857421875, + "logps/rejected": -506.28466796875, + "loss": 0.6658, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.104248046875, + "rewards/margins": 0.05906296521425247, + "rewards/rejected": 0.045185089111328125, + "step": 152 + }, + { + "epoch": 0.06380316930775647, + "grad_norm": 15.796443939208984, + "learning_rate": 1.7720639332870048e-07, + "logits/chosen": -1.0774059295654297, + "logits/rejected": -0.8355752825737, + "logps/chosen": -1272.6600341796875, + "logps/rejected": -627.5744018554688, + "loss": 0.6384, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10991058498620987, + "rewards/margins": 0.11489563435316086, + "rewards/rejected": -0.00498504564166069, + "step": 153 + }, + { + "epoch": 0.06422018348623854, + "grad_norm": 17.689197540283203, + "learning_rate": 1.7836460504980312e-07, + "logits/chosen": -0.34612104296684265, + "logits/rejected": -1.2777527570724487, + "logps/chosen": -869.3475952148438, + "logps/rejected": -461.9928894042969, + "loss": 0.7013, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.022393420338630676, + "rewards/margins": -0.014456748962402344, + "rewards/rejected": -0.007936667650938034, + "step": 154 + }, + { + "epoch": 0.0646371976647206, + "grad_norm": 20.81015396118164, + "learning_rate": 1.7952281677090574e-07, + "logits/chosen": -1.2211790084838867, + "logits/rejected": NaN, + "logps/chosen": -1129.420166015625, + "logps/rejected": -451.69451904296875, + "loss": 0.703, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0038398755714297295, + "rewards/margins": -0.014441677369177341, + "rewards/rejected": 0.01828155666589737, + "step": 155 + }, + { + "epoch": 0.06505421184320268, + "grad_norm": 19.941085815429688, + "learning_rate": 1.8068102849200835e-07, + "logits/chosen": -1.1713666915893555, + "logits/rejected": -0.9197109937667847, + "logps/chosen": -1008.890625, + "logps/rejected": -558.8369140625, + "loss": 0.6642, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14192277193069458, + "rewards/margins": 0.06730270385742188, + "rewards/rejected": 0.07462005317211151, + "step": 156 + }, + { + "epoch": 0.06547122602168473, + "grad_norm": 17.39375877380371, + "learning_rate": 1.8183924021311096e-07, + "logits/chosen": -0.1975356638431549, + "logits/rejected": -1.0653579235076904, + "logps/chosen": -992.18115234375, + "logps/rejected": -492.1889343261719, + "loss": 0.6652, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07128220051527023, + "rewards/margins": 0.0697362869977951, + "rewards/rejected": 0.0015459060668945312, + "step": 157 + }, + { + "epoch": 0.0658882402001668, + "grad_norm": 16.774898529052734, + "learning_rate": 1.829974519342136e-07, + "logits/chosen": -0.6784007549285889, + "logits/rejected": -0.6572502851486206, + "logps/chosen": -1108.28955078125, + "logps/rejected": -694.0288696289062, + "loss": 0.6627, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05938720703125, + "rewards/margins": 0.06432534009218216, + "rewards/rejected": -0.004938125144690275, + "step": 158 + }, + { + "epoch": 0.06630525437864887, + "grad_norm": 13.822752952575684, + "learning_rate": 1.8415566365531622e-07, + "logits/chosen": -0.3474418818950653, + "logits/rejected": -1.1134905815124512, + "logps/chosen": -968.8282470703125, + "logps/rejected": -560.3597412109375, + "loss": 0.6339, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14742432534694672, + "rewards/margins": 0.12827034294605255, + "rewards/rejected": 0.01915397308766842, + "step": 159 + }, + { + "epoch": 0.06672226855713094, + "grad_norm": 14.481270790100098, + "learning_rate": 1.8531387537641883e-07, + "logits/chosen": -0.7814346551895142, + "logits/rejected": NaN, + "logps/chosen": -1056.097412109375, + "logps/rejected": -579.8106079101562, + "loss": 0.6636, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07847366482019424, + "rewards/margins": 0.06136131286621094, + "rewards/rejected": 0.017112351953983307, + "step": 160 + }, + { + "epoch": 0.06713928273561301, + "grad_norm": 16.283483505249023, + "learning_rate": 1.8647208709752145e-07, + "logits/chosen": -0.8693715333938599, + "logits/rejected": -0.853403627872467, + "logps/chosen": -881.1563720703125, + "logps/rejected": -502.8149108886719, + "loss": 0.7069, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.002833176404237747, + "rewards/margins": -0.01950283721089363, + "rewards/rejected": 0.02233600616455078, + "step": 161 + }, + { + "epoch": 0.06755629691409508, + "grad_norm": 19.45885467529297, + "learning_rate": 1.8763029881862403e-07, + "logits/chosen": -0.9204495549201965, + "logits/rejected": -1.15439772605896, + "logps/chosen": -873.215576171875, + "logps/rejected": -343.0669860839844, + "loss": 0.6866, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00629806611686945, + "rewards/margins": 0.01939830370247364, + "rewards/rejected": -0.013100242242217064, + "step": 162 + }, + { + "epoch": 0.06797331109257715, + "grad_norm": 76.13990020751953, + "learning_rate": 1.8878851053972667e-07, + "logits/chosen": -0.8452050685882568, + "logits/rejected": -0.8972876071929932, + "logps/chosen": -859.2573852539062, + "logps/rejected": -334.1949157714844, + "loss": 0.6741, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0396517775952816, + "rewards/margins": 0.04129248112440109, + "rewards/rejected": -0.001640702597796917, + "step": 163 + }, + { + "epoch": 0.06839032527105922, + "grad_norm": 84.23938751220703, + "learning_rate": 1.899467222608293e-07, + "logits/chosen": -1.6562488079071045, + "logits/rejected": -0.6285548210144043, + "logps/chosen": -1208.630615234375, + "logps/rejected": -583.557373046875, + "loss": 0.737, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06493759155273438, + "rewards/margins": -0.06869125366210938, + "rewards/rejected": 0.0037536616437137127, + "step": 164 + }, + { + "epoch": 0.06880733944954129, + "grad_norm": 16.13762855529785, + "learning_rate": 1.911049339819319e-07, + "logits/chosen": -0.4287877082824707, + "logits/rejected": -1.2666531801223755, + "logps/chosen": -903.7000122070312, + "logps/rejected": -532.0977783203125, + "loss": 0.6771, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01604156754910946, + "rewards/margins": 0.03988857939839363, + "rewards/rejected": -0.023847008123993874, + "step": 165 + }, + { + "epoch": 0.06922435362802336, + "grad_norm": 147.62796020507812, + "learning_rate": 1.9226314570303454e-07, + "logits/chosen": -1.2340037822723389, + "logits/rejected": -0.7955665588378906, + "logps/chosen": -978.3566284179688, + "logps/rejected": -556.5897216796875, + "loss": 0.6937, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06027641147375107, + "rewards/margins": 0.0027650855481624603, + "rewards/rejected": 0.057511329650878906, + "step": 166 + }, + { + "epoch": 0.06964136780650543, + "grad_norm": 30.039339065551758, + "learning_rate": 1.9342135742413716e-07, + "logits/chosen": -0.9307671189308167, + "logits/rejected": -0.8695867657661438, + "logps/chosen": -1081.7802734375, + "logps/rejected": -411.8323974609375, + "loss": 0.6563, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10660552978515625, + "rewards/margins": 0.08417816460132599, + "rewards/rejected": 0.02242736890912056, + "step": 167 + }, + { + "epoch": 0.07005838198498748, + "grad_norm": 504.4578857421875, + "learning_rate": 1.9457956914523977e-07, + "logits/chosen": -1.076599359512329, + "logits/rejected": -0.6337015628814697, + "logps/chosen": -888.7259521484375, + "logps/rejected": -528.1907958984375, + "loss": 0.746, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03604736551642418, + "rewards/margins": -0.0963992178440094, + "rewards/rejected": 0.06035184860229492, + "step": 168 + }, + { + "epoch": 0.07047539616346955, + "grad_norm": 20.31352996826172, + "learning_rate": 1.9573778086634238e-07, + "logits/chosen": -1.1064772605895996, + "logits/rejected": -0.7056853175163269, + "logps/chosen": -1390.380126953125, + "logps/rejected": -727.2810668945312, + "loss": 0.695, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0022041331976652145, + "rewards/margins": 0.004159928299486637, + "rewards/rejected": -0.006364060565829277, + "step": 169 + }, + { + "epoch": 0.07089241034195162, + "grad_norm": 13.401070594787598, + "learning_rate": 1.9689599258744503e-07, + "logits/chosen": -0.49836522340774536, + "logits/rejected": -1.137736201286316, + "logps/chosen": -741.930419921875, + "logps/rejected": -426.00445556640625, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.005536271724849939, + "rewards/margins": 0.004550553858280182, + "rewards/rejected": 0.0009857174009084702, + "step": 170 + }, + { + "epoch": 0.07130942452043369, + "grad_norm": 13.172532081604004, + "learning_rate": 1.980542043085476e-07, + "logits/chosen": -0.46047860383987427, + "logits/rejected": -1.2281675338745117, + "logps/chosen": -752.8194580078125, + "logps/rejected": -386.3710632324219, + "loss": 0.7149, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.00369186419993639, + "rewards/margins": -0.041150473058223724, + "rewards/rejected": 0.04484234005212784, + "step": 171 + }, + { + "epoch": 0.07172643869891576, + "grad_norm": 17.68221092224121, + "learning_rate": 1.9921241602965023e-07, + "logits/chosen": -1.373253583908081, + "logits/rejected": NaN, + "logps/chosen": -946.45263671875, + "logps/rejected": -186.00146484375, + "loss": 0.7286, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.008769229054450989, + "rewards/margins": -0.0650734007358551, + "rewards/rejected": 0.056304167956113815, + "step": 172 + }, + { + "epoch": 0.07214345287739783, + "grad_norm": 37.89616775512695, + "learning_rate": 2.0037062775075284e-07, + "logits/chosen": -0.9311386942863464, + "logits/rejected": -1.0262730121612549, + "logps/chosen": -1056.0738525390625, + "logps/rejected": -500.7293701171875, + "loss": 0.6634, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05344314128160477, + "rewards/margins": 0.0623599998652935, + "rewards/rejected": -0.008916854858398438, + "step": 173 + }, + { + "epoch": 0.0725604670558799, + "grad_norm": 18.862733840942383, + "learning_rate": 2.0152883947185546e-07, + "logits/chosen": -0.5530217289924622, + "logits/rejected": -1.2045960426330566, + "logps/chosen": -804.709228515625, + "logps/rejected": -468.7178039550781, + "loss": 0.7187, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.043747711926698685, + "rewards/margins": -0.04873047024011612, + "rewards/rejected": 0.004982759244740009, + "step": 174 + }, + { + "epoch": 0.07297748123436197, + "grad_norm": 21.33814239501953, + "learning_rate": 2.026870511929581e-07, + "logits/chosen": -1.0136651992797852, + "logits/rejected": -0.6115394234657288, + "logps/chosen": -1226.987060546875, + "logps/rejected": -765.171875, + "loss": 0.6882, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08806896954774857, + "rewards/margins": 0.015852542594075203, + "rewards/rejected": 0.07221642136573792, + "step": 175 + }, + { + "epoch": 0.07339449541284404, + "grad_norm": 56.11030197143555, + "learning_rate": 2.038452629140607e-07, + "logits/chosen": -0.6898629069328308, + "logits/rejected": -0.8750792145729065, + "logps/chosen": -961.9356689453125, + "logps/rejected": -453.7397155761719, + "loss": 0.6563, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12688980996608734, + "rewards/margins": 0.08137931674718857, + "rewards/rejected": 0.045510485768318176, + "step": 176 + }, + { + "epoch": 0.07381150959132611, + "grad_norm": 16.292837142944336, + "learning_rate": 2.0500347463516332e-07, + "logits/chosen": -0.43237173557281494, + "logits/rejected": -1.054295301437378, + "logps/chosen": -866.3851318359375, + "logps/rejected": -556.2374267578125, + "loss": 0.6189, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15802688896656036, + "rewards/margins": 0.16030731797218323, + "rewards/rejected": -0.00228042621165514, + "step": 177 + }, + { + "epoch": 0.07422852376980818, + "grad_norm": 26.620071411132812, + "learning_rate": 2.0616168635626596e-07, + "logits/chosen": -0.7184302806854248, + "logits/rejected": -0.89468914270401, + "logps/chosen": -989.036865234375, + "logps/rejected": -481.721923828125, + "loss": 0.7456, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.02332010306417942, + "rewards/margins": -0.09907083213329315, + "rewards/rejected": 0.07575073093175888, + "step": 178 + }, + { + "epoch": 0.07464553794829024, + "grad_norm": 60.21271896362305, + "learning_rate": 2.0731989807736858e-07, + "logits/chosen": -1.472786784172058, + "logits/rejected": -0.5800337791442871, + "logps/chosen": -1266.1749267578125, + "logps/rejected": -719.3257446289062, + "loss": 0.6856, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.033321380615234375, + "rewards/margins": 0.021685030311346054, + "rewards/rejected": 0.01163635216653347, + "step": 179 + }, + { + "epoch": 0.0750625521267723, + "grad_norm": 16.64470100402832, + "learning_rate": 2.0847810979847117e-07, + "logits/chosen": -0.752143144607544, + "logits/rejected": NaN, + "logps/chosen": -1019.294921875, + "logps/rejected": -447.37493896484375, + "loss": 0.663, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.061182402074337006, + "rewards/margins": 0.06391449272632599, + "rewards/rejected": -0.0027320869266986847, + "step": 180 + }, + { + "epoch": 0.07547956630525438, + "grad_norm": 14.526205062866211, + "learning_rate": 2.0963632151957378e-07, + "logits/chosen": -0.9155081510543823, + "logits/rejected": NaN, + "logps/chosen": -823.2879638671875, + "logps/rejected": -436.7016906738281, + "loss": 0.684, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06318511813879013, + "rewards/margins": 0.021646497771143913, + "rewards/rejected": 0.04153861850500107, + "step": 181 + }, + { + "epoch": 0.07589658048373644, + "grad_norm": 20.05136489868164, + "learning_rate": 2.107945332406764e-07, + "logits/chosen": -0.7565939426422119, + "logits/rejected": -0.9944447875022888, + "logps/chosen": -1026.6199951171875, + "logps/rejected": -467.4626770019531, + "loss": 0.7077, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.01672973483800888, + "rewards/margins": -0.025464819744229317, + "rewards/rejected": 0.04219455644488335, + "step": 182 + }, + { + "epoch": 0.07631359466221851, + "grad_norm": 13.514947891235352, + "learning_rate": 2.1195274496177903e-07, + "logits/chosen": -0.7588256597518921, + "logits/rejected": -0.8387703895568848, + "logps/chosen": -924.4324951171875, + "logps/rejected": -550.2349243164062, + "loss": 0.6781, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11475753784179688, + "rewards/margins": 0.03420095890760422, + "rewards/rejected": 0.08055658638477325, + "step": 183 + }, + { + "epoch": 0.07673060884070058, + "grad_norm": 16.897945404052734, + "learning_rate": 2.1311095668288165e-07, + "logits/chosen": -0.9325801134109497, + "logits/rejected": -0.8909918069839478, + "logps/chosen": -864.4896240234375, + "logps/rejected": -589.8580932617188, + "loss": 0.6777, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04542693495750427, + "rewards/margins": 0.040610507130622864, + "rewards/rejected": 0.004816437140107155, + "step": 184 + }, + { + "epoch": 0.07714762301918265, + "grad_norm": 96.70034790039062, + "learning_rate": 2.1426916840398426e-07, + "logits/chosen": -1.2628037929534912, + "logits/rejected": -0.8122460246086121, + "logps/chosen": -1064.6302490234375, + "logps/rejected": -459.68597412109375, + "loss": 0.6863, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0023834221065044403, + "rewards/margins": 0.0156312957406044, + "rewards/rejected": -0.01801471784710884, + "step": 185 + }, + { + "epoch": 0.07756463719766472, + "grad_norm": 21.766469955444336, + "learning_rate": 2.154273801250869e-07, + "logits/chosen": -0.6155120134353638, + "logits/rejected": -0.9541844725608826, + "logps/chosen": -1024.6883544921875, + "logps/rejected": -612.878662109375, + "loss": 0.7065, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0889892578125, + "rewards/margins": -0.018499378114938736, + "rewards/rejected": 0.10748863220214844, + "step": 186 + }, + { + "epoch": 0.0779816513761468, + "grad_norm": 41.48780822753906, + "learning_rate": 2.1658559184618952e-07, + "logits/chosen": -1.0657482147216797, + "logits/rejected": -0.7513991594314575, + "logps/chosen": -1020.0751953125, + "logps/rejected": -393.763671875, + "loss": 0.6776, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.071070097386837, + "rewards/margins": 0.037700653076171875, + "rewards/rejected": 0.03336944803595543, + "step": 187 + }, + { + "epoch": 0.07839866555462886, + "grad_norm": 15.420797348022461, + "learning_rate": 2.1774380356729213e-07, + "logits/chosen": -1.210871696472168, + "logits/rejected": NaN, + "logps/chosen": -802.0234375, + "logps/rejected": -333.5543518066406, + "loss": 0.6887, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00756683386862278, + "rewards/margins": 0.017990879714488983, + "rewards/rejected": -0.010424042120575905, + "step": 188 + }, + { + "epoch": 0.07881567973311093, + "grad_norm": 12.952091217041016, + "learning_rate": 2.1890201528839472e-07, + "logits/chosen": -0.5086954832077026, + "logits/rejected": -1.1818231344223022, + "logps/chosen": -954.8768920898438, + "logps/rejected": -532.1190185546875, + "loss": 0.6139, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13691788911819458, + "rewards/margins": 0.16865062713623047, + "rewards/rejected": -0.03173275291919708, + "step": 189 + }, + { + "epoch": 0.07923269391159299, + "grad_norm": 17.616695404052734, + "learning_rate": 2.2006022700949733e-07, + "logits/chosen": -1.4916826486587524, + "logits/rejected": NaN, + "logps/chosen": -907.0608520507812, + "logps/rejected": -211.0211181640625, + "loss": 0.6679, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05354595556855202, + "rewards/margins": 0.054273705929517746, + "rewards/rejected": -0.0007277489639818668, + "step": 190 + }, + { + "epoch": 0.07964970809007506, + "grad_norm": 15.81710433959961, + "learning_rate": 2.2121843873059997e-07, + "logits/chosen": -0.7839399576187134, + "logits/rejected": -1.044818639755249, + "logps/chosen": -807.9933471679688, + "logps/rejected": -405.1636962890625, + "loss": 0.6738, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06609192490577698, + "rewards/margins": 0.04150981828570366, + "rewards/rejected": 0.02458210289478302, + "step": 191 + }, + { + "epoch": 0.08006672226855713, + "grad_norm": 76.59547424316406, + "learning_rate": 2.223766504517026e-07, + "logits/chosen": -1.1459205150604248, + "logits/rejected": NaN, + "logps/chosen": -1300.5111083984375, + "logps/rejected": -554.088623046875, + "loss": 0.6625, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03017120435833931, + "rewards/margins": 0.06824378669261932, + "rewards/rejected": -0.03807258605957031, + "step": 192 + }, + { + "epoch": 0.0804837364470392, + "grad_norm": 137.90182495117188, + "learning_rate": 2.235348621728052e-07, + "logits/chosen": -0.5574963092803955, + "logits/rejected": -1.0382349491119385, + "logps/chosen": -1128.1868896484375, + "logps/rejected": -621.6861572265625, + "loss": 0.6732, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08467484265565872, + "rewards/margins": 0.04345284029841423, + "rewards/rejected": 0.04122200235724449, + "step": 193 + }, + { + "epoch": 0.08090075062552127, + "grad_norm": 19.06510353088379, + "learning_rate": 2.2469307389390782e-07, + "logits/chosen": -1.1945112943649292, + "logits/rejected": NaN, + "logps/chosen": -1108.740234375, + "logps/rejected": -450.93743896484375, + "loss": 0.6483, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17488862574100494, + "rewards/margins": 0.1051097959280014, + "rewards/rejected": 0.06977882236242294, + "step": 194 + }, + { + "epoch": 0.08131776480400334, + "grad_norm": 28.90352439880371, + "learning_rate": 2.2585128561501046e-07, + "logits/chosen": -0.8536111116409302, + "logits/rejected": -0.9626466631889343, + "logps/chosen": -1017.177490234375, + "logps/rejected": -432.2199401855469, + "loss": 0.7399, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.019336704164743423, + "rewards/margins": -0.0668613389134407, + "rewards/rejected": 0.047524645924568176, + "step": 195 + }, + { + "epoch": 0.0817347789824854, + "grad_norm": 18.9742431640625, + "learning_rate": 2.2700949733611307e-07, + "logits/chosen": -0.8728917837142944, + "logits/rejected": -0.902883768081665, + "logps/chosen": -1053.9295654296875, + "logps/rejected": -551.234375, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12730637192726135, + "rewards/margins": 0.0017711622640490532, + "rewards/rejected": 0.12553520500659943, + "step": 196 + }, + { + "epoch": 0.08215179316096748, + "grad_norm": 17.921096801757812, + "learning_rate": 2.2816770905721568e-07, + "logits/chosen": -1.624250888824463, + "logits/rejected": -1.0021578073501587, + "logps/chosen": -1005.0425415039062, + "logps/rejected": -503.0292053222656, + "loss": 0.6863, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0059082042425870895, + "rewards/margins": 0.016225244849920273, + "rewards/rejected": -0.010317039676010609, + "step": 197 + }, + { + "epoch": 0.08256880733944955, + "grad_norm": 16.70212173461914, + "learning_rate": 2.2932592077831827e-07, + "logits/chosen": -0.8595750331878662, + "logits/rejected": -1.1089214086532593, + "logps/chosen": -979.0332641601562, + "logps/rejected": -426.44488525390625, + "loss": 0.68, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06824760884046555, + "rewards/margins": 0.02839489094913006, + "rewards/rejected": 0.03985271602869034, + "step": 198 + }, + { + "epoch": 0.08298582151793162, + "grad_norm": 17.663320541381836, + "learning_rate": 2.304841324994209e-07, + "logits/chosen": -0.7235796451568604, + "logits/rejected": -0.8155123591423035, + "logps/chosen": -868.98779296875, + "logps/rejected": -668.3648681640625, + "loss": 0.6787, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07853241264820099, + "rewards/margins": 0.03560524061322212, + "rewards/rejected": 0.04292716830968857, + "step": 199 + }, + { + "epoch": 0.08340283569641367, + "grad_norm": 45.1196174621582, + "learning_rate": 2.3164234422052353e-07, + "logits/chosen": -1.1544238328933716, + "logits/rejected": -0.6366955637931824, + "logps/chosen": -1244.16552734375, + "logps/rejected": -878.2661743164062, + "loss": 0.7033, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08822021633386612, + "rewards/margins": -0.013228608295321465, + "rewards/rejected": 0.10144881904125214, + "step": 200 + }, + { + "epoch": 0.08381984987489574, + "grad_norm": 19.24353790283203, + "learning_rate": 2.3280055594162614e-07, + "logits/chosen": -1.3004251718521118, + "logits/rejected": NaN, + "logps/chosen": -1041.7159423828125, + "logps/rejected": -410.1809387207031, + "loss": 0.654, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12377548217773438, + "rewards/margins": 0.08727188408374786, + "rewards/rejected": 0.03650360554456711, + "step": 201 + }, + { + "epoch": 0.08423686405337781, + "grad_norm": 41.09825897216797, + "learning_rate": 2.3395876766272875e-07, + "logits/chosen": -0.8094826340675354, + "logits/rejected": -0.8403177261352539, + "logps/chosen": -1198.058837890625, + "logps/rejected": -751.1346435546875, + "loss": 0.6603, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.039412688463926315, + "rewards/margins": 0.06822167336940765, + "rewards/rejected": -0.028808973729610443, + "step": 202 + }, + { + "epoch": 0.08465387823185988, + "grad_norm": 16.8994140625, + "learning_rate": 2.351169793838314e-07, + "logits/chosen": -1.1605072021484375, + "logits/rejected": -0.7713668346405029, + "logps/chosen": -1007.8375244140625, + "logps/rejected": -540.33740234375, + "loss": 0.6507, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11157456040382385, + "rewards/margins": 0.09369125217199326, + "rewards/rejected": 0.01788330078125, + "step": 203 + }, + { + "epoch": 0.08507089241034195, + "grad_norm": 30.917932510375977, + "learning_rate": 2.36275191104934e-07, + "logits/chosen": -1.215205430984497, + "logits/rejected": -0.8357929587364197, + "logps/chosen": -1099.541259765625, + "logps/rejected": -586.235595703125, + "loss": 0.7, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02956085279583931, + "rewards/margins": -0.009917831979691982, + "rewards/rejected": 0.03947868198156357, + "step": 204 + }, + { + "epoch": 0.08548790658882402, + "grad_norm": 17.266525268554688, + "learning_rate": 2.3743340282603662e-07, + "logits/chosen": -1.1936404705047607, + "logits/rejected": -1.2213857173919678, + "logps/chosen": -864.9459228515625, + "logps/rejected": -535.5136108398438, + "loss": 0.6517, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10736389458179474, + "rewards/margins": 0.09199543297290802, + "rewards/rejected": 0.015368463471531868, + "step": 205 + }, + { + "epoch": 0.08590492076730609, + "grad_norm": 17.981657028198242, + "learning_rate": 2.3859161454713924e-07, + "logits/chosen": -1.6697956323623657, + "logits/rejected": -0.8830596208572388, + "logps/chosen": -1095.1900634765625, + "logps/rejected": -680.8612060546875, + "loss": 0.6581, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0902656614780426, + "rewards/margins": 0.08114968240261078, + "rewards/rejected": 0.0091159762814641, + "step": 206 + }, + { + "epoch": 0.08632193494578816, + "grad_norm": 70.7901611328125, + "learning_rate": 2.3974982626824185e-07, + "logits/chosen": -1.0489321947097778, + "logits/rejected": -0.7875484228134155, + "logps/chosen": -1141.376220703125, + "logps/rejected": -709.7373046875, + "loss": 0.6284, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15349578857421875, + "rewards/margins": 0.13711853325366974, + "rewards/rejected": 0.01637725904583931, + "step": 207 + }, + { + "epoch": 0.08673894912427023, + "grad_norm": 20.045400619506836, + "learning_rate": 2.4090803798934447e-07, + "logits/chosen": -0.6123969554901123, + "logits/rejected": -1.1536002159118652, + "logps/chosen": -767.2320556640625, + "logps/rejected": -376.2362060546875, + "loss": 0.6333, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17492905259132385, + "rewards/margins": 0.12806205451488495, + "rewards/rejected": 0.04686698690056801, + "step": 208 + }, + { + "epoch": 0.0871559633027523, + "grad_norm": 24.76424789428711, + "learning_rate": 2.420662497104471e-07, + "logits/chosen": -1.1186541318893433, + "logits/rejected": -0.7759391069412231, + "logps/chosen": -1094.38916015625, + "logps/rejected": -528.775146484375, + "loss": 0.6309, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.129200741648674, + "rewards/margins": 0.13545647263526917, + "rewards/rejected": -0.006255721673369408, + "step": 209 + }, + { + "epoch": 0.08757297748123437, + "grad_norm": 16.30167007446289, + "learning_rate": 2.432244614315497e-07, + "logits/chosen": -0.32253965735435486, + "logits/rejected": -0.9565478563308716, + "logps/chosen": -1033.5594482421875, + "logps/rejected": -484.91424560546875, + "loss": 0.6106, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18883512914180756, + "rewards/margins": 0.18016470968723297, + "rewards/rejected": 0.008670425042510033, + "step": 210 + }, + { + "epoch": 0.08798999165971642, + "grad_norm": 15.63703441619873, + "learning_rate": 2.443826731526523e-07, + "logits/chosen": -0.5749693512916565, + "logits/rejected": -0.8548868298530579, + "logps/chosen": -1232.82958984375, + "logps/rejected": -554.3174438476562, + "loss": 0.6644, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10012207180261612, + "rewards/margins": 0.061289213597774506, + "rewards/rejected": 0.038832858204841614, + "step": 211 + }, + { + "epoch": 0.0884070058381985, + "grad_norm": 18.16373062133789, + "learning_rate": 2.455408848737549e-07, + "logits/chosen": -0.8083324432373047, + "logits/rejected": -0.8440595865249634, + "logps/chosen": -1100.3646240234375, + "logps/rejected": -638.4691162109375, + "loss": 0.6878, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12176360934972763, + "rewards/margins": 0.016787149012088776, + "rewards/rejected": 0.10497646033763885, + "step": 212 + }, + { + "epoch": 0.08882402001668056, + "grad_norm": 16.00562286376953, + "learning_rate": 2.466990965948576e-07, + "logits/chosen": -0.5078340768814087, + "logits/rejected": -1.233609676361084, + "logps/chosen": -773.0233764648438, + "logps/rejected": -475.5267639160156, + "loss": 0.6877, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.060259632766246796, + "rewards/margins": 0.018875695765018463, + "rewards/rejected": 0.04138393700122833, + "step": 213 + }, + { + "epoch": 0.08924103419516263, + "grad_norm": 15.668010711669922, + "learning_rate": 2.478573083159602e-07, + "logits/chosen": -0.2882254123687744, + "logits/rejected": -1.1089779138565063, + "logps/chosen": -739.6641235351562, + "logps/rejected": -428.29864501953125, + "loss": 0.6735, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.091278076171875, + "rewards/margins": 0.04631004482507706, + "rewards/rejected": 0.04496803134679794, + "step": 214 + }, + { + "epoch": 0.0896580483736447, + "grad_norm": 16.126453399658203, + "learning_rate": 2.490155200370628e-07, + "logits/chosen": -0.2680278718471527, + "logits/rejected": -1.0121304988861084, + "logps/chosen": -888.962646484375, + "logps/rejected": -408.6927490234375, + "loss": 0.6383, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1694667786359787, + "rewards/margins": 0.11951082944869995, + "rewards/rejected": 0.04995594173669815, + "step": 215 + }, + { + "epoch": 0.09007506255212677, + "grad_norm": 16.397607803344727, + "learning_rate": 2.5017373175816543e-07, + "logits/chosen": -0.6647254824638367, + "logits/rejected": -0.7998936772346497, + "logps/chosen": -1273.6563720703125, + "logps/rejected": -619.7455444335938, + "loss": 0.6558, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1349029541015625, + "rewards/margins": 0.08326225727796555, + "rewards/rejected": 0.05164070054888725, + "step": 216 + }, + { + "epoch": 0.09049207673060884, + "grad_norm": 16.946949005126953, + "learning_rate": 2.51331943479268e-07, + "logits/chosen": -0.6220259666442871, + "logits/rejected": -1.2059946060180664, + "logps/chosen": -869.1473388671875, + "logps/rejected": -399.66925048828125, + "loss": 0.6017, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21462899446487427, + "rewards/margins": 0.1956939697265625, + "rewards/rejected": 0.018935013562440872, + "step": 217 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 21.773883819580078, + "learning_rate": 2.5249015520037066e-07, + "logits/chosen": -0.9791173934936523, + "logits/rejected": -0.735191822052002, + "logps/chosen": -1170.97802734375, + "logps/rejected": -630.2200317382812, + "loss": 0.6174, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12176971882581711, + "rewards/margins": 0.16338539123535156, + "rewards/rejected": -0.04161567613482475, + "step": 218 + }, + { + "epoch": 0.09132610508757298, + "grad_norm": 19.048603057861328, + "learning_rate": 2.5364836692147327e-07, + "logits/chosen": -0.279596745967865, + "logits/rejected": -0.8057091236114502, + "logps/chosen": -1181.9498291015625, + "logps/rejected": -620.42236328125, + "loss": 0.6654, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16344605386257172, + "rewards/margins": 0.06334056705236435, + "rewards/rejected": 0.10010547935962677, + "step": 219 + }, + { + "epoch": 0.09174311926605505, + "grad_norm": 14.360029220581055, + "learning_rate": 2.548065786425759e-07, + "logits/chosen": -0.3144441843032837, + "logits/rejected": -1.0849757194519043, + "logps/chosen": -836.57861328125, + "logps/rejected": -454.8494873046875, + "loss": 0.601, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2558487057685852, + "rewards/margins": 0.20176354050636292, + "rewards/rejected": 0.05408515781164169, + "step": 220 + }, + { + "epoch": 0.09216013344453712, + "grad_norm": 18.56598663330078, + "learning_rate": 2.559647903636785e-07, + "logits/chosen": -0.9197547435760498, + "logits/rejected": -0.8935282230377197, + "logps/chosen": -1004.6914672851562, + "logps/rejected": -549.7449951171875, + "loss": 0.6031, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18938523530960083, + "rewards/margins": 0.1953304409980774, + "rewards/rejected": -0.0059452056884765625, + "step": 221 + }, + { + "epoch": 0.09257714762301918, + "grad_norm": 21.447778701782227, + "learning_rate": 2.571230020847811e-07, + "logits/chosen": -1.0541846752166748, + "logits/rejected": -0.7387727499008179, + "logps/chosen": -1056.00537109375, + "logps/rejected": -620.1719970703125, + "loss": 0.6621, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09431304782629013, + "rewards/margins": 0.06789703667163849, + "rewards/rejected": 0.02641601860523224, + "step": 222 + }, + { + "epoch": 0.09299416180150125, + "grad_norm": 15.66723346710205, + "learning_rate": 2.5828121380588373e-07, + "logits/chosen": -0.930359423160553, + "logits/rejected": -0.9587619304656982, + "logps/chosen": -904.5162963867188, + "logps/rejected": -535.5623779296875, + "loss": 0.6423, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19344598054885864, + "rewards/margins": 0.10992459952831268, + "rewards/rejected": 0.08352136611938477, + "step": 223 + }, + { + "epoch": 0.09341117597998332, + "grad_norm": 40.647945404052734, + "learning_rate": 2.5943942552698634e-07, + "logits/chosen": -0.7895680665969849, + "logits/rejected": -0.8505679368972778, + "logps/chosen": -991.3605346679688, + "logps/rejected": -364.9691467285156, + "loss": 0.6217, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13437461853027344, + "rewards/margins": 0.1524757444858551, + "rewards/rejected": -0.018101122230291367, + "step": 224 + }, + { + "epoch": 0.09382819015846539, + "grad_norm": 34.37077713012695, + "learning_rate": 2.60597637248089e-07, + "logits/chosen": -1.0938374996185303, + "logits/rejected": -0.984358012676239, + "logps/chosen": -1095.610107421875, + "logps/rejected": -495.7450866699219, + "loss": 0.6458, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07000046223402023, + "rewards/margins": 0.1003623977303505, + "rewards/rejected": -0.03036193922162056, + "step": 225 + }, + { + "epoch": 0.09424520433694746, + "grad_norm": 20.54073715209961, + "learning_rate": 2.6175584896919157e-07, + "logits/chosen": -1.0607742071151733, + "logits/rejected": NaN, + "logps/chosen": -948.1304931640625, + "logps/rejected": -414.50921630859375, + "loss": 0.7013, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.004204558674246073, + "rewards/margins": -0.013362502679228783, + "rewards/rejected": 0.017567064613103867, + "step": 226 + }, + { + "epoch": 0.09466221851542952, + "grad_norm": 12.022401809692383, + "learning_rate": 2.629140606902942e-07, + "logits/chosen": -0.14431844651699066, + "logits/rejected": -0.8724490404129028, + "logps/chosen": -734.065185546875, + "logps/rejected": -431.85418701171875, + "loss": 0.6398, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1262046843767166, + "rewards/margins": 0.11721878498792648, + "rewards/rejected": 0.00898589938879013, + "step": 227 + }, + { + "epoch": 0.0950792326939116, + "grad_norm": 19.69148826599121, + "learning_rate": 2.640722724113968e-07, + "logits/chosen": -1.2512563467025757, + "logits/rejected": -0.7299496531486511, + "logps/chosen": -1125.381591796875, + "logps/rejected": -663.4277954101562, + "loss": 0.6862, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0831962525844574, + "rewards/margins": 0.020558547228574753, + "rewards/rejected": 0.06263770908117294, + "step": 228 + }, + { + "epoch": 0.09549624687239366, + "grad_norm": 17.551128387451172, + "learning_rate": 2.652304841324994e-07, + "logits/chosen": -1.261021614074707, + "logits/rejected": -0.686134397983551, + "logps/chosen": -1052.6802978515625, + "logps/rejected": -485.8984069824219, + "loss": 0.6019, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29933643341064453, + "rewards/margins": 0.22316455841064453, + "rewards/rejected": 0.0761718824505806, + "step": 229 + }, + { + "epoch": 0.09591326105087573, + "grad_norm": 28.845611572265625, + "learning_rate": 2.663886958536021e-07, + "logits/chosen": -1.3429927825927734, + "logits/rejected": -0.6330551505088806, + "logps/chosen": -1000.4681396484375, + "logps/rejected": -513.9637451171875, + "loss": 0.6804, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06244049221277237, + "rewards/margins": 0.026950452476739883, + "rewards/rejected": 0.03549003601074219, + "step": 230 + }, + { + "epoch": 0.0963302752293578, + "grad_norm": 291.0335693359375, + "learning_rate": 2.675469075747047e-07, + "logits/chosen": -1.4442803859710693, + "logits/rejected": -0.7471365928649902, + "logps/chosen": -1178.294189453125, + "logps/rejected": -576.0390625, + "loss": 0.612, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16752929985523224, + "rewards/margins": 0.1750386357307434, + "rewards/rejected": -0.0075093284249305725, + "step": 231 + }, + { + "epoch": 0.09674728940783987, + "grad_norm": 20.27515411376953, + "learning_rate": 2.687051192958073e-07, + "logits/chosen": -0.5177587270736694, + "logits/rejected": -0.8045967221260071, + "logps/chosen": -1044.532470703125, + "logps/rejected": -623.6506958007812, + "loss": 0.6513, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17965394258499146, + "rewards/margins": 0.09976005554199219, + "rewards/rejected": 0.07989387959241867, + "step": 232 + }, + { + "epoch": 0.09716430358632193, + "grad_norm": 61.282806396484375, + "learning_rate": 2.698633310169099e-07, + "logits/chosen": -0.8966441750526428, + "logits/rejected": -0.8639256954193115, + "logps/chosen": -908.210205078125, + "logps/rejected": -525.6875, + "loss": 0.6376, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14873962104320526, + "rewards/margins": 0.11724567413330078, + "rewards/rejected": 0.03149394690990448, + "step": 233 + }, + { + "epoch": 0.097581317764804, + "grad_norm": 12.1847505569458, + "learning_rate": 2.7102154273801254e-07, + "logits/chosen": -0.39486533403396606, + "logits/rejected": -1.2417372465133667, + "logps/chosen": -649.6317138671875, + "logps/rejected": -366.49920654296875, + "loss": 0.6102, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17818908393383026, + "rewards/margins": 0.18042391538619995, + "rewards/rejected": -0.0022348416969180107, + "step": 234 + }, + { + "epoch": 0.09799833194328607, + "grad_norm": 18.472909927368164, + "learning_rate": 2.7217975445911515e-07, + "logits/chosen": -0.48689839243888855, + "logits/rejected": -0.9660793542861938, + "logps/chosen": -904.3525390625, + "logps/rejected": -654.8763427734375, + "loss": 0.6639, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12870025634765625, + "rewards/margins": 0.07386550307273865, + "rewards/rejected": 0.0548347532749176, + "step": 235 + }, + { + "epoch": 0.09841534612176814, + "grad_norm": 14.27440071105957, + "learning_rate": 2.7333796618021777e-07, + "logits/chosen": -0.18033407628536224, + "logits/rejected": -1.0429768562316895, + "logps/chosen": -832.7952270507812, + "logps/rejected": -480.2188720703125, + "loss": 0.6108, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20584869384765625, + "rewards/margins": 0.17503929138183594, + "rewards/rejected": 0.030809402465820312, + "step": 236 + }, + { + "epoch": 0.09883236030025021, + "grad_norm": 139.0686798095703, + "learning_rate": 2.744961779013204e-07, + "logits/chosen": -1.1416773796081543, + "logits/rejected": -0.9203497767448425, + "logps/chosen": -1180.834716796875, + "logps/rejected": -757.0255126953125, + "loss": 0.621, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20721283555030823, + "rewards/margins": 0.15547791123390198, + "rewards/rejected": 0.05173492431640625, + "step": 237 + }, + { + "epoch": 0.09924937447873228, + "grad_norm": 24.540939331054688, + "learning_rate": 2.75654389622423e-07, + "logits/chosen": -1.2055429220199585, + "logits/rejected": -0.8867220878601074, + "logps/chosen": -1318.04736328125, + "logps/rejected": -706.634521484375, + "loss": 0.7164, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08184738457202911, + "rewards/margins": -0.034890368580818176, + "rewards/rejected": 0.11673775315284729, + "step": 238 + }, + { + "epoch": 0.09966638865721435, + "grad_norm": 17.575857162475586, + "learning_rate": 2.768126013435256e-07, + "logits/chosen": -0.30541661381721497, + "logits/rejected": -1.3499764204025269, + "logps/chosen": -1124.03125, + "logps/rejected": -492.5777587890625, + "loss": 0.6493, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18285788595676422, + "rewards/margins": 0.09315929561853409, + "rewards/rejected": 0.08969860523939133, + "step": 239 + }, + { + "epoch": 0.10008340283569642, + "grad_norm": 14.577101707458496, + "learning_rate": 2.779708130646282e-07, + "logits/chosen": -0.5087881684303284, + "logits/rejected": -1.147740364074707, + "logps/chosen": -784.4324951171875, + "logps/rejected": -441.43890380859375, + "loss": 0.6323, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20461122691631317, + "rewards/margins": 0.13452492654323578, + "rewards/rejected": 0.0700862854719162, + "step": 240 + }, + { + "epoch": 0.10050041701417849, + "grad_norm": 25.420352935791016, + "learning_rate": 2.7912902478573084e-07, + "logits/chosen": -1.3822611570358276, + "logits/rejected": -0.47059667110443115, + "logps/chosen": -1206.4320068359375, + "logps/rejected": -746.700439453125, + "loss": 0.6029, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20833131670951843, + "rewards/margins": 0.19102324545383453, + "rewards/rejected": 0.01730804517865181, + "step": 241 + }, + { + "epoch": 0.10091743119266056, + "grad_norm": 20.149301528930664, + "learning_rate": 2.802872365068335e-07, + "logits/chosen": -1.1623871326446533, + "logits/rejected": -0.6261639595031738, + "logps/chosen": -1111.4410400390625, + "logps/rejected": -656.0640869140625, + "loss": 0.6481, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1804603636264801, + "rewards/margins": 0.09670696407556534, + "rewards/rejected": 0.08375339210033417, + "step": 242 + }, + { + "epoch": 0.10133444537114263, + "grad_norm": 13.837777137756348, + "learning_rate": 2.814454482279361e-07, + "logits/chosen": -0.4748861789703369, + "logits/rejected": -1.2136765718460083, + "logps/chosen": -700.0236206054688, + "logps/rejected": -344.68597412109375, + "loss": 0.6169, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2125343233346939, + "rewards/margins": 0.17169484496116638, + "rewards/rejected": 0.040839482098817825, + "step": 243 + }, + { + "epoch": 0.10175145954962468, + "grad_norm": 17.940114974975586, + "learning_rate": 2.826036599490387e-07, + "logits/chosen": -1.1872299909591675, + "logits/rejected": -0.8927431106567383, + "logps/chosen": -1093.10791015625, + "logps/rejected": -473.0823974609375, + "loss": 0.6422, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11776351928710938, + "rewards/margins": 0.11371803283691406, + "rewards/rejected": 0.004045487381517887, + "step": 244 + }, + { + "epoch": 0.10216847372810675, + "grad_norm": 22.07712745666504, + "learning_rate": 2.837618716701413e-07, + "logits/chosen": -1.7253856658935547, + "logits/rejected": NaN, + "logps/chosen": -1012.3094482421875, + "logps/rejected": -411.92193603515625, + "loss": 0.6852, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13286209106445312, + "rewards/margins": 0.03010406717658043, + "rewards/rejected": 0.102758027613163, + "step": 245 + }, + { + "epoch": 0.10258548790658882, + "grad_norm": 50.55930709838867, + "learning_rate": 2.8492008339124396e-07, + "logits/chosen": -0.8724709749221802, + "logits/rejected": -1.1303483247756958, + "logps/chosen": -1036.98779296875, + "logps/rejected": -510.228759765625, + "loss": 0.6193, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19679643213748932, + "rewards/margins": 0.17021293938159943, + "rewards/rejected": 0.0265834778547287, + "step": 246 + }, + { + "epoch": 0.10300250208507089, + "grad_norm": 18.964214324951172, + "learning_rate": 2.8607829511234657e-07, + "logits/chosen": -0.5094012022018433, + "logits/rejected": -0.9018294215202332, + "logps/chosen": -1082.474853515625, + "logps/rejected": -522.43408203125, + "loss": 0.6218, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2127944976091385, + "rewards/margins": 0.15186253190040588, + "rewards/rejected": 0.0609319694340229, + "step": 247 + }, + { + "epoch": 0.10341951626355296, + "grad_norm": 60.015785217285156, + "learning_rate": 2.872365068334492e-07, + "logits/chosen": -1.9803352355957031, + "logits/rejected": -0.4845694899559021, + "logps/chosen": -1266.86376953125, + "logps/rejected": -724.8395385742188, + "loss": 0.6473, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15559157729148865, + "rewards/margins": 0.10005645453929901, + "rewards/rejected": 0.055535126477479935, + "step": 248 + }, + { + "epoch": 0.10383653044203503, + "grad_norm": 16.38298225402832, + "learning_rate": 2.883947185545518e-07, + "logits/chosen": -0.7945104837417603, + "logits/rejected": -0.867763340473175, + "logps/chosen": -890.9544677734375, + "logps/rejected": -511.3885192871094, + "loss": 0.6551, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12873421609401703, + "rewards/margins": 0.08658944070339203, + "rewards/rejected": 0.042144775390625, + "step": 249 + }, + { + "epoch": 0.1042535446205171, + "grad_norm": 14.518606185913086, + "learning_rate": 2.895529302756544e-07, + "logits/chosen": -1.0374222993850708, + "logits/rejected": -0.9003971219062805, + "logps/chosen": -835.463134765625, + "logps/rejected": -508.512939453125, + "loss": 0.6121, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20270919799804688, + "rewards/margins": 0.17358151078224182, + "rewards/rejected": 0.02912769466638565, + "step": 250 + }, + { + "epoch": 0.10467055879899917, + "grad_norm": 13.824320793151855, + "learning_rate": 2.9071114199675703e-07, + "logits/chosen": -0.5193870067596436, + "logits/rejected": -1.2627581357955933, + "logps/chosen": -798.88330078125, + "logps/rejected": -409.64105224609375, + "loss": 0.6052, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2563156187534332, + "rewards/margins": 0.19226084649562836, + "rewards/rejected": 0.06405477225780487, + "step": 251 + }, + { + "epoch": 0.10508757297748124, + "grad_norm": 18.48811149597168, + "learning_rate": 2.9186935371785964e-07, + "logits/chosen": -0.9632860422134399, + "logits/rejected": NaN, + "logps/chosen": -985.0609130859375, + "logps/rejected": -308.38916015625, + "loss": 0.67, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1845848262310028, + "rewards/margins": 0.048345185816287994, + "rewards/rejected": 0.13623961806297302, + "step": 252 + }, + { + "epoch": 0.10550458715596331, + "grad_norm": 15.286216735839844, + "learning_rate": 2.9302756543896226e-07, + "logits/chosen": -1.0521239042282104, + "logits/rejected": -1.1310068368911743, + "logps/chosen": -886.0570068359375, + "logps/rejected": -468.1670227050781, + "loss": 0.6394, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16315919160842896, + "rewards/margins": 0.11797905713319778, + "rewards/rejected": 0.04518013074994087, + "step": 253 + }, + { + "epoch": 0.10592160133444536, + "grad_norm": 13.092817306518555, + "learning_rate": 2.9418577716006487e-07, + "logits/chosen": -0.3078972101211548, + "logits/rejected": -1.0359314680099487, + "logps/chosen": -851.590576171875, + "logps/rejected": -542.2706298828125, + "loss": 0.6303, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2479393184185028, + "rewards/margins": 0.13447076082229614, + "rewards/rejected": 0.11346855759620667, + "step": 254 + }, + { + "epoch": 0.10633861551292743, + "grad_norm": 28.490493774414062, + "learning_rate": 2.953439888811675e-07, + "logits/chosen": -0.5377565026283264, + "logits/rejected": -0.9366105794906616, + "logps/chosen": -1040.5145263671875, + "logps/rejected": -633.24462890625, + "loss": 0.5713, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35242921113967896, + "rewards/margins": 0.27363815903663635, + "rewards/rejected": 0.078791044652462, + "step": 255 + }, + { + "epoch": 0.1067556296914095, + "grad_norm": 15.60693359375, + "learning_rate": 2.965022006022701e-07, + "logits/chosen": -0.610664427280426, + "logits/rejected": -0.7601006031036377, + "logps/chosen": -1212.7943115234375, + "logps/rejected": -604.4202880859375, + "loss": 0.573, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2842552363872528, + "rewards/margins": 0.26195448637008667, + "rewards/rejected": 0.02230072021484375, + "step": 256 + }, + { + "epoch": 0.10717264386989157, + "grad_norm": 15.87313175201416, + "learning_rate": 2.976604123233727e-07, + "logits/chosen": -0.4382545053958893, + "logits/rejected": -1.0214958190917969, + "logps/chosen": -866.442138671875, + "logps/rejected": -572.4998168945312, + "loss": 0.6242, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.276947021484375, + "rewards/margins": 0.15360127389431, + "rewards/rejected": 0.1233457550406456, + "step": 257 + }, + { + "epoch": 0.10758965804837364, + "grad_norm": 19.007665634155273, + "learning_rate": 2.988186240444754e-07, + "logits/chosen": -0.2589291036128998, + "logits/rejected": -0.9729577302932739, + "logps/chosen": -991.1255493164062, + "logps/rejected": -559.2017822265625, + "loss": 0.6518, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1605277955532074, + "rewards/margins": 0.090204618871212, + "rewards/rejected": 0.07032318413257599, + "step": 258 + }, + { + "epoch": 0.10800667222685571, + "grad_norm": 13.881446838378906, + "learning_rate": 2.99976835765578e-07, + "logits/chosen": -0.4227820336818695, + "logits/rejected": -1.08079195022583, + "logps/chosen": -800.5531005859375, + "logps/rejected": -407.4799499511719, + "loss": 0.6011, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18804016709327698, + "rewards/margins": 0.20428180694580078, + "rewards/rejected": -0.01624164544045925, + "step": 259 + }, + { + "epoch": 0.10842368640533778, + "grad_norm": 21.981597900390625, + "learning_rate": 3.011350474866806e-07, + "logits/chosen": -1.0578898191452026, + "logits/rejected": -0.9369192719459534, + "logps/chosen": -751.7718505859375, + "logps/rejected": -492.4425354003906, + "loss": 0.6118, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20488891005516052, + "rewards/margins": 0.1762973815202713, + "rewards/rejected": 0.028591537848114967, + "step": 260 + }, + { + "epoch": 0.10884070058381985, + "grad_norm": 98.58331298828125, + "learning_rate": 3.022932592077832e-07, + "logits/chosen": -0.7017648220062256, + "logits/rejected": -0.8486148118972778, + "logps/chosen": -960.8590087890625, + "logps/rejected": -521.2827758789062, + "loss": 0.6098, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22426146268844604, + "rewards/margins": 0.18157464265823364, + "rewards/rejected": 0.04268684610724449, + "step": 261 + }, + { + "epoch": 0.10925771476230192, + "grad_norm": 26.407007217407227, + "learning_rate": 3.0345147092888584e-07, + "logits/chosen": -0.5353441834449768, + "logits/rejected": -1.2533634901046753, + "logps/chosen": -831.4158935546875, + "logps/rejected": -475.330078125, + "loss": 0.6041, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3369339108467102, + "rewards/margins": 0.20098495483398438, + "rewards/rejected": 0.13594894111156464, + "step": 262 + }, + { + "epoch": 0.10967472894078399, + "grad_norm": 26.680646896362305, + "learning_rate": 3.0460968264998845e-07, + "logits/chosen": -0.728193998336792, + "logits/rejected": -0.8041986227035522, + "logps/chosen": -1167.4620361328125, + "logps/rejected": -631.2430419921875, + "loss": 0.6525, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.132771298289299, + "rewards/margins": 0.08983077853918076, + "rewards/rejected": 0.042940523475408554, + "step": 263 + }, + { + "epoch": 0.11009174311926606, + "grad_norm": 21.717830657958984, + "learning_rate": 3.0576789437109106e-07, + "logits/chosen": -1.2679827213287354, + "logits/rejected": -0.7724223136901855, + "logps/chosen": -1074.4912109375, + "logps/rejected": -647.8201904296875, + "loss": 0.6292, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20858880877494812, + "rewards/margins": 0.13597793877124786, + "rewards/rejected": 0.07261085510253906, + "step": 264 + }, + { + "epoch": 0.11050875729774812, + "grad_norm": 12.785720825195312, + "learning_rate": 3.069261060921937e-07, + "logits/chosen": -0.058946993201971054, + "logits/rejected": -0.9428904056549072, + "logps/chosen": -964.64013671875, + "logps/rejected": -470.12213134765625, + "loss": 0.6224, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20001718401908875, + "rewards/margins": 0.15249785780906677, + "rewards/rejected": 0.04751930385828018, + "step": 265 + }, + { + "epoch": 0.11092577147623019, + "grad_norm": 13.761826515197754, + "learning_rate": 3.080843178132963e-07, + "logits/chosen": -0.42276063561439514, + "logits/rejected": -1.0071310997009277, + "logps/chosen": -974.1000366210938, + "logps/rejected": -575.1447143554688, + "loss": 0.6258, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2587288022041321, + "rewards/margins": 0.14694786071777344, + "rewards/rejected": 0.11178093403577805, + "step": 266 + }, + { + "epoch": 0.11134278565471226, + "grad_norm": 21.18310546875, + "learning_rate": 3.092425295343989e-07, + "logits/chosen": -1.34767746925354, + "logits/rejected": -0.5707989931106567, + "logps/chosen": -1011.53662109375, + "logps/rejected": -450.6890869140625, + "loss": 0.667, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14605025947093964, + "rewards/margins": 0.05874290689826012, + "rewards/rejected": 0.08730735629796982, + "step": 267 + }, + { + "epoch": 0.11175979983319433, + "grad_norm": 17.18611717224121, + "learning_rate": 3.104007412555015e-07, + "logits/chosen": -0.32892176508903503, + "logits/rejected": -1.0322444438934326, + "logps/chosen": -945.5836791992188, + "logps/rejected": -523.26904296875, + "loss": 0.577, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3476661741733551, + "rewards/margins": 0.25370368361473083, + "rewards/rejected": 0.09396248310804367, + "step": 268 + }, + { + "epoch": 0.1121768140116764, + "grad_norm": 27.707008361816406, + "learning_rate": 3.1155895297660414e-07, + "logits/chosen": -1.830579161643982, + "logits/rejected": -0.4077012836933136, + "logps/chosen": -1117.026611328125, + "logps/rejected": -559.3352661132812, + "loss": 0.6636, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07206147164106369, + "rewards/margins": 0.061827778816223145, + "rewards/rejected": 0.010233689099550247, + "step": 269 + }, + { + "epoch": 0.11259382819015847, + "grad_norm": 15.74800968170166, + "learning_rate": 3.1271716469770675e-07, + "logits/chosen": 0.08004956692457199, + "logits/rejected": -0.8324182033538818, + "logps/chosen": -1130.655517578125, + "logps/rejected": -584.619384765625, + "loss": 0.6142, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.218017578125, + "rewards/margins": 0.17504388093948364, + "rewards/rejected": 0.04297371208667755, + "step": 270 + }, + { + "epoch": 0.11301084236864054, + "grad_norm": 14.319452285766602, + "learning_rate": 3.138753764188094e-07, + "logits/chosen": -0.476803719997406, + "logits/rejected": -1.1503020524978638, + "logps/chosen": -690.22900390625, + "logps/rejected": -403.5578308105469, + "loss": 0.5803, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3197021484375, + "rewards/margins": 0.2540786564350128, + "rewards/rejected": 0.06562346965074539, + "step": 271 + }, + { + "epoch": 0.1134278565471226, + "grad_norm": 16.714094161987305, + "learning_rate": 3.15033588139912e-07, + "logits/chosen": -1.3220869302749634, + "logits/rejected": -0.7616578340530396, + "logps/chosen": -917.1904296875, + "logps/rejected": -558.5955200195312, + "loss": 0.6293, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18191643059253693, + "rewards/margins": 0.1335395872592926, + "rewards/rejected": 0.04837685078382492, + "step": 272 + }, + { + "epoch": 0.11384487072560467, + "grad_norm": 18.961973190307617, + "learning_rate": 3.1619179986101464e-07, + "logits/chosen": -1.030076503753662, + "logits/rejected": -0.9435204267501831, + "logps/chosen": -979.475830078125, + "logps/rejected": -527.1456298828125, + "loss": 0.6476, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22119753062725067, + "rewards/margins": 0.09960117191076279, + "rewards/rejected": 0.1215963363647461, + "step": 273 + }, + { + "epoch": 0.11426188490408674, + "grad_norm": 14.15468692779541, + "learning_rate": 3.173500115821172e-07, + "logits/chosen": -0.4975714683532715, + "logits/rejected": -1.1708725690841675, + "logps/chosen": -799.0260009765625, + "logps/rejected": -477.74798583984375, + "loss": 0.6177, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29983749985694885, + "rewards/margins": 0.162120059132576, + "rewards/rejected": 0.13771744072437286, + "step": 274 + }, + { + "epoch": 0.11467889908256881, + "grad_norm": 13.403182983398438, + "learning_rate": 3.1850822330321987e-07, + "logits/chosen": -0.3800122141838074, + "logits/rejected": -1.01504385471344, + "logps/chosen": -956.7235107421875, + "logps/rejected": -499.7164001464844, + "loss": 0.5932, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3037223815917969, + "rewards/margins": 0.2179853618144989, + "rewards/rejected": 0.08573703467845917, + "step": 275 + }, + { + "epoch": 0.11509591326105087, + "grad_norm": 15.86799430847168, + "learning_rate": 3.196664350243225e-07, + "logits/chosen": -0.0732722282409668, + "logits/rejected": -0.9773910045623779, + "logps/chosen": -955.8633422851562, + "logps/rejected": -461.00103759765625, + "loss": 0.5975, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32872849702835083, + "rewards/margins": 0.20409545302391052, + "rewards/rejected": 0.12463302910327911, + "step": 276 + }, + { + "epoch": 0.11551292743953294, + "grad_norm": 15.43904972076416, + "learning_rate": 3.2082464674542505e-07, + "logits/chosen": -0.47112059593200684, + "logits/rejected": -1.154751181602478, + "logps/chosen": -1022.7349853515625, + "logps/rejected": -575.4202880859375, + "loss": 0.5441, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45607683062553406, + "rewards/margins": 0.34384119510650635, + "rewards/rejected": 0.1122356429696083, + "step": 277 + }, + { + "epoch": 0.11592994161801501, + "grad_norm": 129.31581115722656, + "learning_rate": 3.219828584665277e-07, + "logits/chosen": -1.000683069229126, + "logits/rejected": -0.8523465394973755, + "logps/chosen": -803.85546875, + "logps/rejected": -426.3437805175781, + "loss": 0.6458, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1849401593208313, + "rewards/margins": 0.0998992919921875, + "rewards/rejected": 0.0850408524274826, + "step": 278 + }, + { + "epoch": 0.11634695579649708, + "grad_norm": 164.3040771484375, + "learning_rate": 3.231410701876303e-07, + "logits/chosen": -1.0134719610214233, + "logits/rejected": -0.8528445363044739, + "logps/chosen": -928.1425170898438, + "logps/rejected": -519.6156005859375, + "loss": 0.6052, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34030264616012573, + "rewards/margins": 0.22288742661476135, + "rewards/rejected": 0.11741524189710617, + "step": 279 + }, + { + "epoch": 0.11676396997497915, + "grad_norm": 25.648113250732422, + "learning_rate": 3.2429928190873294e-07, + "logits/chosen": -0.9519089460372925, + "logits/rejected": NaN, + "logps/chosen": -930.7451171875, + "logps/rejected": -309.508056640625, + "loss": 0.559, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33251190185546875, + "rewards/margins": 0.2984071969985962, + "rewards/rejected": 0.03410473093390465, + "step": 280 + }, + { + "epoch": 0.11718098415346122, + "grad_norm": 17.406246185302734, + "learning_rate": 3.2545749362983556e-07, + "logits/chosen": -0.7665159702301025, + "logits/rejected": -0.8148435354232788, + "logps/chosen": -1287.32958984375, + "logps/rejected": -607.2664184570312, + "loss": 0.5812, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3177086114883423, + "rewards/margins": 0.26285552978515625, + "rewards/rejected": 0.054853059351444244, + "step": 281 + }, + { + "epoch": 0.11759799833194329, + "grad_norm": 16.922182083129883, + "learning_rate": 3.266157053509382e-07, + "logits/chosen": -0.39797911047935486, + "logits/rejected": -0.9407958984375, + "logps/chosen": -1156.9713134765625, + "logps/rejected": -744.2662353515625, + "loss": 0.5975, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3219100832939148, + "rewards/margins": 0.2084100842475891, + "rewards/rejected": 0.11350002139806747, + "step": 282 + }, + { + "epoch": 0.11801501251042536, + "grad_norm": 16.284711837768555, + "learning_rate": 3.277739170720408e-07, + "logits/chosen": -0.5767046213150024, + "logits/rejected": -1.085089921951294, + "logps/chosen": -756.1465454101562, + "logps/rejected": -369.9927062988281, + "loss": 0.5367, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4165462553501129, + "rewards/margins": 0.35974520444869995, + "rewards/rejected": 0.05680103600025177, + "step": 283 + }, + { + "epoch": 0.11843202668890743, + "grad_norm": 19.101137161254883, + "learning_rate": 3.2893212879314345e-07, + "logits/chosen": -1.1577928066253662, + "logits/rejected": -0.49070975184440613, + "logps/chosen": -1124.8343505859375, + "logps/rejected": -641.9097900390625, + "loss": 0.5752, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3563152253627777, + "rewards/margins": 0.2612079977989197, + "rewards/rejected": 0.09510727226734161, + "step": 284 + }, + { + "epoch": 0.1188490408673895, + "grad_norm": 25.41668701171875, + "learning_rate": 3.30090340514246e-07, + "logits/chosen": -0.6472392678260803, + "logits/rejected": -0.7874640226364136, + "logps/chosen": -1045.0966796875, + "logps/rejected": -487.4134521484375, + "loss": 0.571, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3116043210029602, + "rewards/margins": 0.2692949175834656, + "rewards/rejected": 0.04230938106775284, + "step": 285 + }, + { + "epoch": 0.11926605504587157, + "grad_norm": 15.661775588989258, + "learning_rate": 3.3124855223534863e-07, + "logits/chosen": -0.7353535294532776, + "logits/rejected": -0.9742954969406128, + "logps/chosen": -1107.79443359375, + "logps/rejected": -615.9818725585938, + "loss": 0.6415, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24303092062473297, + "rewards/margins": 0.11996039748191833, + "rewards/rejected": 0.12307052314281464, + "step": 286 + }, + { + "epoch": 0.11968306922435362, + "grad_norm": 48.99085998535156, + "learning_rate": 3.324067639564513e-07, + "logits/chosen": -1.1178187131881714, + "logits/rejected": -1.0094727277755737, + "logps/chosen": -823.798828125, + "logps/rejected": -455.085205078125, + "loss": 0.651, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15042877197265625, + "rewards/margins": 0.0908961296081543, + "rewards/rejected": 0.05953264236450195, + "step": 287 + }, + { + "epoch": 0.12010008340283569, + "grad_norm": 100.10910034179688, + "learning_rate": 3.3356497567755386e-07, + "logits/chosen": -1.102083444595337, + "logits/rejected": -0.7412380576133728, + "logps/chosen": -982.9866943359375, + "logps/rejected": -559.0426025390625, + "loss": 0.5928, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3449627161026001, + "rewards/margins": 0.2310028374195099, + "rewards/rejected": 0.1139598861336708, + "step": 288 + }, + { + "epoch": 0.12051709758131776, + "grad_norm": 14.623226165771484, + "learning_rate": 3.347231873986565e-07, + "logits/chosen": -1.2325422763824463, + "logits/rejected": NaN, + "logps/chosen": -970.08984375, + "logps/rejected": -439.04833984375, + "loss": 0.6014, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23705598711967468, + "rewards/margins": 0.19940854609012604, + "rewards/rejected": 0.03764743730425835, + "step": 289 + }, + { + "epoch": 0.12093411175979983, + "grad_norm": 19.914995193481445, + "learning_rate": 3.358813991197591e-07, + "logits/chosen": -0.7561019062995911, + "logits/rejected": -0.8573811054229736, + "logps/chosen": -1022.3549194335938, + "logps/rejected": -545.7236938476562, + "loss": 0.6029, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.32747042179107666, + "rewards/margins": 0.19994354248046875, + "rewards/rejected": 0.12752686440944672, + "step": 290 + }, + { + "epoch": 0.1213511259382819, + "grad_norm": 13.509117126464844, + "learning_rate": 3.3703961084086175e-07, + "logits/chosen": -0.42464780807495117, + "logits/rejected": -1.041014552116394, + "logps/chosen": -734.803955078125, + "logps/rejected": -419.85748291015625, + "loss": 0.5882, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33721238374710083, + "rewards/margins": 0.2248075306415558, + "rewards/rejected": 0.11240482330322266, + "step": 291 + }, + { + "epoch": 0.12176814011676397, + "grad_norm": 28.727006912231445, + "learning_rate": 3.3819782256196436e-07, + "logits/chosen": -0.34327465295791626, + "logits/rejected": -0.9475568532943726, + "logps/chosen": -901.2915649414062, + "logps/rejected": -532.632568359375, + "loss": 0.6336, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2386978268623352, + "rewards/margins": 0.13491249084472656, + "rewards/rejected": 0.10378532111644745, + "step": 292 + }, + { + "epoch": 0.12218515429524604, + "grad_norm": 29.622276306152344, + "learning_rate": 3.39356034283067e-07, + "logits/chosen": -1.564403772354126, + "logits/rejected": -0.5808234810829163, + "logps/chosen": -1153.540283203125, + "logps/rejected": -613.7374267578125, + "loss": 0.642, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1763198971748352, + "rewards/margins": 0.11242657899856567, + "rewards/rejected": 0.06389331817626953, + "step": 293 + }, + { + "epoch": 0.12260216847372811, + "grad_norm": 13.001314163208008, + "learning_rate": 3.405142460041696e-07, + "logits/chosen": -0.4080500602722168, + "logits/rejected": -0.9897705316543579, + "logps/chosen": -813.2212524414062, + "logps/rejected": -482.4291687011719, + "loss": 0.5866, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33896714448928833, + "rewards/margins": 0.22862473130226135, + "rewards/rejected": 0.11034240573644638, + "step": 294 + }, + { + "epoch": 0.12301918265221018, + "grad_norm": 17.335098266601562, + "learning_rate": 3.4167245772527215e-07, + "logits/chosen": -0.923582911491394, + "logits/rejected": -1.0623629093170166, + "logps/chosen": -786.0119018554688, + "logps/rejected": -281.2886962890625, + "loss": 0.6262, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2118915617465973, + "rewards/margins": 0.15064340829849243, + "rewards/rejected": 0.061248160898685455, + "step": 295 + }, + { + "epoch": 0.12343619683069225, + "grad_norm": 13.761141777038574, + "learning_rate": 3.428306694463748e-07, + "logits/chosen": -0.8556644916534424, + "logits/rejected": -0.8346487283706665, + "logps/chosen": -766.1517333984375, + "logps/rejected": -529.2616577148438, + "loss": 0.6206, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3992534577846527, + "rewards/margins": 0.1634693294763565, + "rewards/rejected": 0.2357841432094574, + "step": 296 + }, + { + "epoch": 0.12385321100917432, + "grad_norm": 14.231635093688965, + "learning_rate": 3.4398888116747743e-07, + "logits/chosen": -0.8521167039871216, + "logits/rejected": -0.657128632068634, + "logps/chosen": -1124.268310546875, + "logps/rejected": -763.4170532226562, + "loss": 0.621, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.27871477603912354, + "rewards/margins": 0.1636028289794922, + "rewards/rejected": 0.11511192470788956, + "step": 297 + }, + { + "epoch": 0.12427022518765637, + "grad_norm": 17.039060592651367, + "learning_rate": 3.4514709288858005e-07, + "logits/chosen": -0.685045063495636, + "logits/rejected": -1.0246684551239014, + "logps/chosen": -986.7290649414062, + "logps/rejected": -485.9781188964844, + "loss": 0.5456, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5397445559501648, + "rewards/margins": 0.3497011065483093, + "rewards/rejected": 0.19004344940185547, + "step": 298 + }, + { + "epoch": 0.12468723936613844, + "grad_norm": 21.12545394897461, + "learning_rate": 3.4630530460968266e-07, + "logits/chosen": -1.5864230394363403, + "logits/rejected": -0.664618968963623, + "logps/chosen": -1066.5985107421875, + "logps/rejected": -591.6007080078125, + "loss": 0.6306, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2085868865251541, + "rewards/margins": 0.14333878457546234, + "rewards/rejected": 0.06524810940027237, + "step": 299 + }, + { + "epoch": 0.12510425354462051, + "grad_norm": 23.226051330566406, + "learning_rate": 3.4746351633078533e-07, + "logits/chosen": -1.1755446195602417, + "logits/rejected": -0.8898254632949829, + "logps/chosen": -1049.88232421875, + "logps/rejected": -601.6104125976562, + "loss": 0.6113, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2995544672012329, + "rewards/margins": 0.18071404099464417, + "rewards/rejected": 0.11884041130542755, + "step": 300 + }, + { + "epoch": 0.1255212677231026, + "grad_norm": 24.800697326660156, + "learning_rate": 3.486217280518879e-07, + "logits/chosen": -1.332313060760498, + "logits/rejected": -0.7048108577728271, + "logps/chosen": -910.2355346679688, + "logps/rejected": -437.7936706542969, + "loss": 0.6537, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15585727989673615, + "rewards/margins": 0.08546161651611328, + "rewards/rejected": 0.07039566338062286, + "step": 301 + }, + { + "epoch": 0.12593828190158465, + "grad_norm": 13.729028701782227, + "learning_rate": 3.4977993977299056e-07, + "logits/chosen": -0.44453680515289307, + "logits/rejected": -1.035276174545288, + "logps/chosen": -946.500244140625, + "logps/rejected": -514.6066284179688, + "loss": 0.5912, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26160508394241333, + "rewards/margins": 0.22384873032569885, + "rewards/rejected": 0.03775634616613388, + "step": 302 + }, + { + "epoch": 0.1263552960800667, + "grad_norm": 11.035944938659668, + "learning_rate": 3.5093815149409317e-07, + "logits/chosen": -0.7015379667282104, + "logits/rejected": NaN, + "logps/chosen": -704.529541015625, + "logps/rejected": -383.03131103515625, + "loss": 0.6032, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2774414122104645, + "rewards/margins": 0.19393309950828552, + "rewards/rejected": 0.08350830525159836, + "step": 303 + }, + { + "epoch": 0.1267723102585488, + "grad_norm": 16.79279327392578, + "learning_rate": 3.5209636321519573e-07, + "logits/chosen": -1.3826301097869873, + "logits/rejected": -0.907406210899353, + "logps/chosen": -944.8985595703125, + "logps/rejected": -517.269775390625, + "loss": 0.5419, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46019136905670166, + "rewards/margins": 0.35288089513778687, + "rewards/rejected": 0.10731048882007599, + "step": 304 + }, + { + "epoch": 0.12718932443703085, + "grad_norm": 14.32365894317627, + "learning_rate": 3.532545749362984e-07, + "logits/chosen": -0.9686000943183899, + "logits/rejected": NaN, + "logps/chosen": -867.9672241210938, + "logps/rejected": -328.475341796875, + "loss": 0.552, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3414936065673828, + "rewards/margins": 0.3115116059780121, + "rewards/rejected": 0.02998199500143528, + "step": 305 + }, + { + "epoch": 0.12760633861551293, + "grad_norm": 13.418131828308105, + "learning_rate": 3.5441278665740096e-07, + "logits/chosen": -0.6943694949150085, + "logits/rejected": -0.8238021731376648, + "logps/chosen": -1029.6630859375, + "logps/rejected": -621.7518310546875, + "loss": 0.5359, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5134620666503906, + "rewards/margins": 0.3744209110736847, + "rewards/rejected": 0.13904114067554474, + "step": 306 + }, + { + "epoch": 0.128023352793995, + "grad_norm": 16.34125518798828, + "learning_rate": 3.5557099837850363e-07, + "logits/chosen": -0.8894606232643127, + "logits/rejected": -1.0863574743270874, + "logps/chosen": -922.3950805664062, + "logps/rejected": -484.7933349609375, + "loss": 0.5389, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47550278902053833, + "rewards/margins": 0.35067519545555115, + "rewards/rejected": 0.12482757121324539, + "step": 307 + }, + { + "epoch": 0.12844036697247707, + "grad_norm": 102.25609588623047, + "learning_rate": 3.5672921009960624e-07, + "logits/chosen": -1.525533676147461, + "logits/rejected": NaN, + "logps/chosen": -1136.581787109375, + "logps/rejected": -539.6312255859375, + "loss": 0.6294, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2575237452983856, + "rewards/margins": 0.14240531623363495, + "rewards/rejected": 0.11511840671300888, + "step": 308 + }, + { + "epoch": 0.12885738115095913, + "grad_norm": 11.474930763244629, + "learning_rate": 3.5788742182070886e-07, + "logits/chosen": -0.18352298438549042, + "logits/rejected": -0.7352811098098755, + "logps/chosen": -943.2340087890625, + "logps/rejected": -682.1069946289062, + "loss": 0.5461, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5108886957168579, + "rewards/margins": 0.33565714955329895, + "rewards/rejected": 0.17523154616355896, + "step": 309 + }, + { + "epoch": 0.1292743953294412, + "grad_norm": 15.754068374633789, + "learning_rate": 3.5904563354181147e-07, + "logits/chosen": -0.3233254551887512, + "logits/rejected": -0.9084727764129639, + "logps/chosen": -945.72802734375, + "logps/rejected": -577.976318359375, + "loss": 0.5055, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6962143182754517, + "rewards/margins": 0.4513900876045227, + "rewards/rejected": 0.24482423067092896, + "step": 310 + }, + { + "epoch": 0.12969140950792327, + "grad_norm": 22.600061416625977, + "learning_rate": 3.6020384526291414e-07, + "logits/chosen": -0.7422985434532166, + "logits/rejected": -0.7999512553215027, + "logps/chosen": -1327.4820556640625, + "logps/rejected": -808.49365234375, + "loss": 0.6038, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.457833856344223, + "rewards/margins": 0.21224595606327057, + "rewards/rejected": 0.24558793008327484, + "step": 311 + }, + { + "epoch": 0.13010842368640535, + "grad_norm": 14.20523452758789, + "learning_rate": 3.613620569840167e-07, + "logits/chosen": -0.5239366292953491, + "logits/rejected": -1.0001394748687744, + "logps/chosen": -947.9267578125, + "logps/rejected": -511.11517333984375, + "loss": 0.5503, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4917305111885071, + "rewards/margins": 0.33904343843460083, + "rewards/rejected": 0.15268708765506744, + "step": 312 + }, + { + "epoch": 0.1305254378648874, + "grad_norm": 13.7387113571167, + "learning_rate": 3.625202687051193e-07, + "logits/chosen": -0.31597334146499634, + "logits/rejected": -1.104378581047058, + "logps/chosen": -895.18603515625, + "logps/rejected": -481.8163146972656, + "loss": 0.4582, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7516441345214844, + "rewards/margins": 0.6039574146270752, + "rewards/rejected": 0.14768676459789276, + "step": 313 + }, + { + "epoch": 0.13094245204336946, + "grad_norm": 12.988226890563965, + "learning_rate": 3.6367848042622193e-07, + "logits/chosen": -0.399503231048584, + "logits/rejected": -1.147547721862793, + "logps/chosen": -831.865478515625, + "logps/rejected": -479.0484313964844, + "loss": 0.4732, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7129462957382202, + "rewards/margins": 0.536803662776947, + "rewards/rejected": 0.17614269256591797, + "step": 314 + }, + { + "epoch": 0.13135946622185155, + "grad_norm": 14.837563514709473, + "learning_rate": 3.6483669214732454e-07, + "logits/chosen": -0.8074657320976257, + "logits/rejected": -1.0999995470046997, + "logps/chosen": -861.6229248046875, + "logps/rejected": -405.7557373046875, + "loss": 0.5451, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4627174437046051, + "rewards/margins": 0.35095614194869995, + "rewards/rejected": 0.11176128685474396, + "step": 315 + }, + { + "epoch": 0.1317764804003336, + "grad_norm": 11.580769538879395, + "learning_rate": 3.659949038684272e-07, + "logits/chosen": -0.8946155309677124, + "logits/rejected": NaN, + "logps/chosen": -904.111328125, + "logps/rejected": -411.387451171875, + "loss": 0.5358, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.49702683091163635, + "rewards/margins": 0.37719956040382385, + "rewards/rejected": 0.1198272705078125, + "step": 316 + }, + { + "epoch": 0.13219349457881568, + "grad_norm": 14.495375633239746, + "learning_rate": 3.6715311558952977e-07, + "logits/chosen": -0.48898759484291077, + "logits/rejected": -0.7831185460090637, + "logps/chosen": -966.1182250976562, + "logps/rejected": -644.5802001953125, + "loss": 0.585, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.45233458280563354, + "rewards/margins": 0.24880409240722656, + "rewards/rejected": 0.20353049039840698, + "step": 317 + }, + { + "epoch": 0.13261050875729774, + "grad_norm": 14.849709510803223, + "learning_rate": 3.6831132731063244e-07, + "logits/chosen": -0.5907967686653137, + "logits/rejected": -1.059241533279419, + "logps/chosen": -1267.926513671875, + "logps/rejected": -679.572021484375, + "loss": 0.4988, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5776993036270142, + "rewards/margins": 0.4460074305534363, + "rewards/rejected": 0.1316918432712555, + "step": 318 + }, + { + "epoch": 0.13302752293577982, + "grad_norm": 71.13780212402344, + "learning_rate": 3.69469539031735e-07, + "logits/chosen": -1.2221256494522095, + "logits/rejected": -0.7080258727073669, + "logps/chosen": -1069.4385986328125, + "logps/rejected": -587.063232421875, + "loss": 0.5733, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35266992449760437, + "rewards/margins": 0.2681390643119812, + "rewards/rejected": 0.08453083038330078, + "step": 319 + }, + { + "epoch": 0.13344453711426188, + "grad_norm": 36.142730712890625, + "learning_rate": 3.7062775075283766e-07, + "logits/chosen": -0.7666437029838562, + "logits/rejected": -0.8107439279556274, + "logps/chosen": -1049.0242919921875, + "logps/rejected": -679.2271118164062, + "loss": 0.6768, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4926140010356903, + "rewards/margins": 0.09494096785783768, + "rewards/rejected": 0.3976730406284332, + "step": 320 + }, + { + "epoch": 0.13386155129274396, + "grad_norm": 86.22736358642578, + "learning_rate": 3.717859624739403e-07, + "logits/chosen": -1.3044458627700806, + "logits/rejected": -0.9170904755592346, + "logps/chosen": -1010.1453857421875, + "logps/rejected": -575.3018798828125, + "loss": 0.5221, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48729097843170166, + "rewards/margins": 0.3856784999370575, + "rewards/rejected": 0.10161247849464417, + "step": 321 + }, + { + "epoch": 0.13427856547122602, + "grad_norm": 11.166650772094727, + "learning_rate": 3.729441741950429e-07, + "logits/chosen": -0.6076871752738953, + "logits/rejected": -0.9757373929023743, + "logps/chosen": -867.0377197265625, + "logps/rejected": -535.1961059570312, + "loss": 0.4396, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7410805225372314, + "rewards/margins": 0.6322473883628845, + "rewards/rejected": 0.10883312672376633, + "step": 322 + }, + { + "epoch": 0.1346955796497081, + "grad_norm": 13.445954322814941, + "learning_rate": 3.741023859161455e-07, + "logits/chosen": -0.16205574572086334, + "logits/rejected": -0.754179835319519, + "logps/chosen": -941.9066162109375, + "logps/rejected": -612.751220703125, + "loss": 0.5334, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6874176263809204, + "rewards/margins": 0.39099693298339844, + "rewards/rejected": 0.296420693397522, + "step": 323 + }, + { + "epoch": 0.13511259382819016, + "grad_norm": 26.259580612182617, + "learning_rate": 3.7526059763724807e-07, + "logits/chosen": -1.327556848526001, + "logits/rejected": -0.4644814729690552, + "logps/chosen": -1006.6309814453125, + "logps/rejected": -487.8663330078125, + "loss": 0.5352, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.42712557315826416, + "rewards/margins": 0.3592952787876129, + "rewards/rejected": 0.06783027946949005, + "step": 324 + }, + { + "epoch": 0.13552960800667221, + "grad_norm": 12.30532169342041, + "learning_rate": 3.7641880935835073e-07, + "logits/chosen": -0.3513561189174652, + "logits/rejected": -0.8751335144042969, + "logps/chosen": -919.9573364257812, + "logps/rejected": -561.7574462890625, + "loss": 0.541, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6127991080284119, + "rewards/margins": 0.36116331815719604, + "rewards/rejected": 0.25163573026657104, + "step": 325 + }, + { + "epoch": 0.1359466221851543, + "grad_norm": 17.39298439025879, + "learning_rate": 3.7757702107945335e-07, + "logits/chosen": -1.1843748092651367, + "logits/rejected": -1.084370732307434, + "logps/chosen": -816.52001953125, + "logps/rejected": -426.57293701171875, + "loss": 0.576, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45261192321777344, + "rewards/margins": 0.26123544573783875, + "rewards/rejected": 0.1913764923810959, + "step": 326 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 12.24465274810791, + "learning_rate": 3.78735232800556e-07, + "logits/chosen": -0.3115065097808838, + "logits/rejected": -1.0348033905029297, + "logps/chosen": -800.1463012695312, + "logps/rejected": -386.0819091796875, + "loss": 0.5225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5527015924453735, + "rewards/margins": 0.39116519689559937, + "rewards/rejected": 0.16153641045093536, + "step": 327 + }, + { + "epoch": 0.13678065054211844, + "grad_norm": 13.531145095825195, + "learning_rate": 3.798934445216586e-07, + "logits/chosen": -1.002925992012024, + "logits/rejected": -1.0890674591064453, + "logps/chosen": -810.9515380859375, + "logps/rejected": -455.56658935546875, + "loss": 0.5304, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5257571935653687, + "rewards/margins": 0.37379246950149536, + "rewards/rejected": 0.15196475386619568, + "step": 328 + }, + { + "epoch": 0.1371976647206005, + "grad_norm": 11.544013977050781, + "learning_rate": 3.8105165624276124e-07, + "logits/chosen": -0.4798283278942108, + "logits/rejected": -0.7644235491752625, + "logps/chosen": -966.4276123046875, + "logps/rejected": -475.18377685546875, + "loss": 0.5769, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47641295194625854, + "rewards/margins": 0.25745201110839844, + "rewards/rejected": 0.2189609557390213, + "step": 329 + }, + { + "epoch": 0.13761467889908258, + "grad_norm": 22.8399600982666, + "learning_rate": 3.822098679638638e-07, + "logits/chosen": -1.0522290468215942, + "logits/rejected": -0.6683793663978577, + "logps/chosen": -1124.358642578125, + "logps/rejected": -650.5267333984375, + "loss": 0.603, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4275779724121094, + "rewards/margins": 0.19395485520362854, + "rewards/rejected": 0.23362313210964203, + "step": 330 + }, + { + "epoch": 0.13803169307756463, + "grad_norm": 18.039440155029297, + "learning_rate": 3.8336807968496647e-07, + "logits/chosen": -1.1692569255828857, + "logits/rejected": -0.9580920338630676, + "logps/chosen": -803.9678344726562, + "logps/rejected": -436.08209228515625, + "loss": 0.5574, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4786926507949829, + "rewards/margins": 0.30869293212890625, + "rewards/rejected": 0.16999970376491547, + "step": 331 + }, + { + "epoch": 0.13844870725604672, + "grad_norm": 16.909826278686523, + "learning_rate": 3.845262914060691e-07, + "logits/chosen": -1.0405621528625488, + "logits/rejected": -0.8062006831169128, + "logps/chosen": -1246.5771484375, + "logps/rejected": -744.6220703125, + "loss": 0.5437, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5856796503067017, + "rewards/margins": 0.35108909010887146, + "rewards/rejected": 0.2345905303955078, + "step": 332 + }, + { + "epoch": 0.13886572143452877, + "grad_norm": 13.201735496520996, + "learning_rate": 3.8568450312717165e-07, + "logits/chosen": -0.31926029920578003, + "logits/rejected": -0.9815890789031982, + "logps/chosen": -925.9400634765625, + "logps/rejected": -525.7822265625, + "loss": 0.553, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5305637717247009, + "rewards/margins": 0.33497947454452515, + "rewards/rejected": 0.19558429718017578, + "step": 333 + }, + { + "epoch": 0.13928273561301086, + "grad_norm": 100.88213348388672, + "learning_rate": 3.868427148482743e-07, + "logits/chosen": -0.7977476119995117, + "logits/rejected": -0.7031158804893494, + "logps/chosen": -1097.8321533203125, + "logps/rejected": -735.2101440429688, + "loss": 0.5216, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5230041742324829, + "rewards/margins": 0.3945716917514801, + "rewards/rejected": 0.1284324675798416, + "step": 334 + }, + { + "epoch": 0.1396997497914929, + "grad_norm": 12.346538543701172, + "learning_rate": 3.880009265693769e-07, + "logits/chosen": -0.40471434593200684, + "logits/rejected": -1.0294396877288818, + "logps/chosen": -1062.220947265625, + "logps/rejected": -494.89068603515625, + "loss": 0.5352, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5657791495323181, + "rewards/margins": 0.355782151222229, + "rewards/rejected": 0.2099969983100891, + "step": 335 + }, + { + "epoch": 0.14011676396997497, + "grad_norm": 12.755851745605469, + "learning_rate": 3.8915913829047954e-07, + "logits/chosen": -0.837600827217102, + "logits/rejected": NaN, + "logps/chosen": -979.638916015625, + "logps/rejected": -404.4629821777344, + "loss": 0.4775, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6763709783554077, + "rewards/margins": 0.5294257998466492, + "rewards/rejected": 0.14694519340991974, + "step": 336 + }, + { + "epoch": 0.14053377814845705, + "grad_norm": 110.21279907226562, + "learning_rate": 3.9031735001158216e-07, + "logits/chosen": -0.6601301431655884, + "logits/rejected": -0.7076990604400635, + "logps/chosen": -1040.8779296875, + "logps/rejected": -614.47021484375, + "loss": 0.5408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5164222717285156, + "rewards/margins": 0.35047948360443115, + "rewards/rejected": 0.16594275832176208, + "step": 337 + }, + { + "epoch": 0.1409507923269391, + "grad_norm": 11.228528022766113, + "learning_rate": 3.9147556173268477e-07, + "logits/chosen": -0.7032935619354248, + "logits/rejected": -0.840206503868103, + "logps/chosen": -857.791748046875, + "logps/rejected": -494.33856201171875, + "loss": 0.5556, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46978530287742615, + "rewards/margins": 0.3115478754043579, + "rewards/rejected": 0.15823745727539062, + "step": 338 + }, + { + "epoch": 0.1413678065054212, + "grad_norm": 45.237388610839844, + "learning_rate": 3.926337734537874e-07, + "logits/chosen": -0.579269528388977, + "logits/rejected": -0.7849897146224976, + "logps/chosen": -1324.705810546875, + "logps/rejected": -709.93017578125, + "loss": 0.5695, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3848251700401306, + "rewards/margins": 0.29568442702293396, + "rewards/rejected": 0.08914070576429367, + "step": 339 + }, + { + "epoch": 0.14178482068390325, + "grad_norm": 14.632072448730469, + "learning_rate": 3.9379198517489005e-07, + "logits/chosen": -0.6267716884613037, + "logits/rejected": -0.8754431009292603, + "logps/chosen": -1131.423828125, + "logps/rejected": -647.4896240234375, + "loss": 0.5193, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5312767028808594, + "rewards/margins": 0.39016836881637573, + "rewards/rejected": 0.14110833406448364, + "step": 340 + }, + { + "epoch": 0.14220183486238533, + "grad_norm": 13.276863098144531, + "learning_rate": 3.949501968959926e-07, + "logits/chosen": -0.6591827273368835, + "logits/rejected": -1.1536133289337158, + "logps/chosen": -799.3724365234375, + "logps/rejected": -415.87030029296875, + "loss": 0.5315, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6607334017753601, + "rewards/margins": 0.3720436096191406, + "rewards/rejected": 0.2886897921562195, + "step": 341 + }, + { + "epoch": 0.14261884904086738, + "grad_norm": 16.0549259185791, + "learning_rate": 3.961084086170952e-07, + "logits/chosen": -0.870244026184082, + "logits/rejected": -1.0907477140426636, + "logps/chosen": -751.9605712890625, + "logps/rejected": -342.9728698730469, + "loss": 0.5387, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42497479915618896, + "rewards/margins": 0.34450605511665344, + "rewards/rejected": 0.08046874403953552, + "step": 342 + }, + { + "epoch": 0.14303586321934947, + "grad_norm": 12.42183780670166, + "learning_rate": 3.9726662033819784e-07, + "logits/chosen": -0.17122240364551544, + "logits/rejected": -0.9965936541557312, + "logps/chosen": -943.2937622070312, + "logps/rejected": -575.47998046875, + "loss": 0.5147, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6430008411407471, + "rewards/margins": 0.4076618254184723, + "rewards/rejected": 0.2353389859199524, + "step": 343 + }, + { + "epoch": 0.14345287739783152, + "grad_norm": 30.9060115814209, + "learning_rate": 3.9842483205930045e-07, + "logits/chosen": -1.9070618152618408, + "logits/rejected": -0.4352344274520874, + "logps/chosen": -1217.758544921875, + "logps/rejected": -512.03271484375, + "loss": 0.5878, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37952953577041626, + "rewards/margins": 0.2321127951145172, + "rewards/rejected": 0.14741678535938263, + "step": 344 + }, + { + "epoch": 0.1438698915763136, + "grad_norm": 14.586682319641113, + "learning_rate": 3.995830437804031e-07, + "logits/chosen": -1.3152159452438354, + "logits/rejected": -1.1257363557815552, + "logps/chosen": -941.9683837890625, + "logps/rejected": -490.158935546875, + "loss": 0.5278, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5210319757461548, + "rewards/margins": 0.3891699016094208, + "rewards/rejected": 0.131862074136734, + "step": 345 + }, + { + "epoch": 0.14428690575479566, + "grad_norm": 12.795869827270508, + "learning_rate": 4.007412555015057e-07, + "logits/chosen": -0.5429605841636658, + "logits/rejected": -1.3382625579833984, + "logps/chosen": -791.9901123046875, + "logps/rejected": -444.44342041015625, + "loss": 0.4121, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.958855390548706, + "rewards/margins": 0.708622395992279, + "rewards/rejected": 0.25023308396339417, + "step": 346 + }, + { + "epoch": 0.14470391993327772, + "grad_norm": 18.874794006347656, + "learning_rate": 4.0189946722260835e-07, + "logits/chosen": -0.6341562867164612, + "logits/rejected": -0.63084876537323, + "logps/chosen": -1040.537109375, + "logps/rejected": -534.5978393554688, + "loss": 0.5715, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4701888859272003, + "rewards/margins": 0.2824958860874176, + "rewards/rejected": 0.1876930296421051, + "step": 347 + }, + { + "epoch": 0.1451209341117598, + "grad_norm": 10.258122444152832, + "learning_rate": 4.030576789437109e-07, + "logits/chosen": -0.2783251702785492, + "logits/rejected": -0.9401609897613525, + "logps/chosen": -669.30126953125, + "logps/rejected": -406.7869567871094, + "loss": 0.5722, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4375903904438019, + "rewards/margins": 0.26851367950439453, + "rewards/rejected": 0.16907674074172974, + "step": 348 + }, + { + "epoch": 0.14553794829024186, + "grad_norm": 10.319664001464844, + "learning_rate": 4.042158906648136e-07, + "logits/chosen": -0.5805032253265381, + "logits/rejected": -1.1073102951049805, + "logps/chosen": -650.5101318359375, + "logps/rejected": -412.12750244140625, + "loss": 0.4797, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5991600155830383, + "rewards/margins": 0.4989927411079407, + "rewards/rejected": 0.10016727447509766, + "step": 349 + }, + { + "epoch": 0.14595496246872394, + "grad_norm": 16.731414794921875, + "learning_rate": 4.053741023859162e-07, + "logits/chosen": -0.6838538646697998, + "logits/rejected": -0.8862987756729126, + "logps/chosen": -1003.5492553710938, + "logps/rejected": -622.7390747070312, + "loss": 0.4975, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7616981863975525, + "rewards/margins": 0.45180362462997437, + "rewards/rejected": 0.3098945915699005, + "step": 350 + }, + { + "epoch": 0.146371976647206, + "grad_norm": 39.11015319824219, + "learning_rate": 4.0653231410701875e-07, + "logits/chosen": -0.8175166249275208, + "logits/rejected": -0.7409030199050903, + "logps/chosen": -1016.2408447265625, + "logps/rejected": -523.9697265625, + "loss": 0.5357, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5071596503257751, + "rewards/margins": 0.3690607249736786, + "rewards/rejected": 0.13809891045093536, + "step": 351 + }, + { + "epoch": 0.14678899082568808, + "grad_norm": 19.210979461669922, + "learning_rate": 4.076905258281214e-07, + "logits/chosen": -1.2036937475204468, + "logits/rejected": -0.6634312272071838, + "logps/chosen": -982.445556640625, + "logps/rejected": -345.0450744628906, + "loss": 0.4693, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6847565174102783, + "rewards/margins": 0.5490773916244507, + "rewards/rejected": 0.13567905128002167, + "step": 352 + }, + { + "epoch": 0.14720600500417014, + "grad_norm": 30.889236450195312, + "learning_rate": 4.0884873754922403e-07, + "logits/chosen": -0.7199698686599731, + "logits/rejected": -0.8660650253295898, + "logps/chosen": -1064.428955078125, + "logps/rejected": -569.984619140625, + "loss": 0.446, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7931686639785767, + "rewards/margins": 0.6315436959266663, + "rewards/rejected": 0.16162489354610443, + "step": 353 + }, + { + "epoch": 0.14762301918265222, + "grad_norm": 10.279319763183594, + "learning_rate": 4.1000694927032665e-07, + "logits/chosen": -0.2899320125579834, + "logits/rejected": -1.202415943145752, + "logps/chosen": -753.51220703125, + "logps/rejected": -436.6993713378906, + "loss": 0.4363, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8876644372940063, + "rewards/margins": 0.6752098202705383, + "rewards/rejected": 0.21245460212230682, + "step": 354 + }, + { + "epoch": 0.14804003336113428, + "grad_norm": 12.473258018493652, + "learning_rate": 4.1116516099142926e-07, + "logits/chosen": -0.6558365225791931, + "logits/rejected": -1.083247423171997, + "logps/chosen": -927.4114379882812, + "logps/rejected": -472.3642272949219, + "loss": 0.4359, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8873398303985596, + "rewards/margins": 0.6623085141181946, + "rewards/rejected": 0.2250312864780426, + "step": 355 + }, + { + "epoch": 0.14845704753961636, + "grad_norm": 11.29211139678955, + "learning_rate": 4.1232337271253193e-07, + "logits/chosen": -0.6761496663093567, + "logits/rejected": -0.7616601586341858, + "logps/chosen": -961.3195190429688, + "logps/rejected": -636.5220947265625, + "loss": 0.5238, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6438732147216797, + "rewards/margins": 0.3950210511684418, + "rewards/rejected": 0.24885216355323792, + "step": 356 + }, + { + "epoch": 0.14887406171809842, + "grad_norm": 31.918323516845703, + "learning_rate": 4.134815844336345e-07, + "logits/chosen": -0.8656318187713623, + "logits/rejected": -0.9370603561401367, + "logps/chosen": -1084.7181396484375, + "logps/rejected": -563.8648071289062, + "loss": 0.4724, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7814071178436279, + "rewards/margins": 0.5427873730659485, + "rewards/rejected": 0.2386198341846466, + "step": 357 + }, + { + "epoch": 0.14929107589658047, + "grad_norm": 14.30555534362793, + "learning_rate": 4.1463979615473716e-07, + "logits/chosen": -0.3970358371734619, + "logits/rejected": -0.7957658767700195, + "logps/chosen": -1060.0406494140625, + "logps/rejected": -647.9384765625, + "loss": 0.4834, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9413845539093018, + "rewards/margins": 0.49206697940826416, + "rewards/rejected": 0.4493175745010376, + "step": 358 + }, + { + "epoch": 0.14970809007506256, + "grad_norm": 11.322866439819336, + "learning_rate": 4.157980078758397e-07, + "logits/chosen": -0.791776180267334, + "logits/rejected": -1.031135082244873, + "logps/chosen": -976.4908447265625, + "logps/rejected": -447.529052734375, + "loss": 0.4471, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7744693160057068, + "rewards/margins": 0.6116501092910767, + "rewards/rejected": 0.1628192961215973, + "step": 359 + }, + { + "epoch": 0.1501251042535446, + "grad_norm": 44.45161819458008, + "learning_rate": 4.1695621959694233e-07, + "logits/chosen": -0.5423130393028259, + "logits/rejected": -1.0293302536010742, + "logps/chosen": -1079.07568359375, + "logps/rejected": -597.0283203125, + "loss": 0.5209, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7551567554473877, + "rewards/margins": 0.4404817819595337, + "rewards/rejected": 0.3146749436855316, + "step": 360 + }, + { + "epoch": 0.1505421184320267, + "grad_norm": 13.900465965270996, + "learning_rate": 4.18114431318045e-07, + "logits/chosen": -0.6871705055236816, + "logits/rejected": -0.9282790422439575, + "logps/chosen": -945.0524291992188, + "logps/rejected": -484.08013916015625, + "loss": 0.4304, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.901354968547821, + "rewards/margins": 0.6404787302017212, + "rewards/rejected": 0.26087626814842224, + "step": 361 + }, + { + "epoch": 0.15095913261050875, + "grad_norm": 10.500494003295898, + "learning_rate": 4.1927264303914756e-07, + "logits/chosen": -0.8376412391662598, + "logits/rejected": -0.8762015104293823, + "logps/chosen": -941.7789916992188, + "logps/rejected": -522.9681396484375, + "loss": 0.4758, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5987259149551392, + "rewards/margins": 0.5156761407852173, + "rewards/rejected": 0.08304977416992188, + "step": 362 + }, + { + "epoch": 0.15137614678899083, + "grad_norm": 18.55982208251953, + "learning_rate": 4.2043085476025023e-07, + "logits/chosen": -1.3759214878082275, + "logits/rejected": -0.732184886932373, + "logps/chosen": -989.166748046875, + "logps/rejected": -342.82037353515625, + "loss": 0.465, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7290886044502258, + "rewards/margins": 0.5582578778266907, + "rewards/rejected": 0.17083072662353516, + "step": 363 + }, + { + "epoch": 0.1517931609674729, + "grad_norm": 20.100488662719727, + "learning_rate": 4.215890664813528e-07, + "logits/chosen": -0.6532016396522522, + "logits/rejected": -0.9075236320495605, + "logps/chosen": -1039.431884765625, + "logps/rejected": -591.7620849609375, + "loss": 0.522, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5068130493164062, + "rewards/margins": 0.4309927225112915, + "rewards/rejected": 0.07582035660743713, + "step": 364 + }, + { + "epoch": 0.15221017514595497, + "grad_norm": 10.866423606872559, + "learning_rate": 4.2274727820245546e-07, + "logits/chosen": -0.693335771560669, + "logits/rejected": -1.2196584939956665, + "logps/chosen": -889.2175903320312, + "logps/rejected": -601.3970947265625, + "loss": 0.461, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.734844982624054, + "rewards/margins": 0.6119255423545837, + "rewards/rejected": 0.122919462621212, + "step": 365 + }, + { + "epoch": 0.15262718932443703, + "grad_norm": 12.792485237121582, + "learning_rate": 4.2390548992355807e-07, + "logits/chosen": -0.7306763529777527, + "logits/rejected": -0.947529673576355, + "logps/chosen": -1026.1007080078125, + "logps/rejected": -555.5172729492188, + "loss": 0.4788, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8059936761856079, + "rewards/margins": 0.5581695437431335, + "rewards/rejected": 0.24782410264015198, + "step": 366 + }, + { + "epoch": 0.1530442035029191, + "grad_norm": 10.392457962036133, + "learning_rate": 4.250637016446607e-07, + "logits/chosen": -0.6597774028778076, + "logits/rejected": -0.9925334453582764, + "logps/chosen": -914.408203125, + "logps/rejected": -477.616455078125, + "loss": 0.4744, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.647997260093689, + "rewards/margins": 0.5398001074790955, + "rewards/rejected": 0.10819721966981888, + "step": 367 + }, + { + "epoch": 0.15346121768140117, + "grad_norm": 13.955161094665527, + "learning_rate": 4.262219133657633e-07, + "logits/chosen": -0.8542149662971497, + "logits/rejected": -0.6223567724227905, + "logps/chosen": -902.4345092773438, + "logps/rejected": -560.85888671875, + "loss": 0.4756, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6814327239990234, + "rewards/margins": 0.5418258905410767, + "rewards/rejected": 0.13960686326026917, + "step": 368 + }, + { + "epoch": 0.15387823185988322, + "grad_norm": 11.818373680114746, + "learning_rate": 4.2738012508686586e-07, + "logits/chosen": -0.2925170660018921, + "logits/rejected": -1.021242380142212, + "logps/chosen": -953.9765014648438, + "logps/rejected": -479.9374694824219, + "loss": 0.4594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8009036779403687, + "rewards/margins": 0.5716110467910767, + "rewards/rejected": 0.22929269075393677, + "step": 369 + }, + { + "epoch": 0.1542952460383653, + "grad_norm": 12.560595512390137, + "learning_rate": 4.285383368079685e-07, + "logits/chosen": -0.8970688581466675, + "logits/rejected": NaN, + "logps/chosen": -1028.404052734375, + "logps/rejected": -404.85406494140625, + "loss": 0.4381, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8304691910743713, + "rewards/margins": 0.6241626739501953, + "rewards/rejected": 0.20630647242069244, + "step": 370 + }, + { + "epoch": 0.15471226021684736, + "grad_norm": 9.532631874084473, + "learning_rate": 4.2969654852907114e-07, + "logits/chosen": -0.08245757222175598, + "logits/rejected": -0.9726629257202148, + "logps/chosen": -927.9483642578125, + "logps/rejected": -489.7080078125, + "loss": 0.3894, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0149635076522827, + "rewards/margins": 0.8830410242080688, + "rewards/rejected": 0.13192254304885864, + "step": 371 + }, + { + "epoch": 0.15512927439532945, + "grad_norm": 16.79618263244629, + "learning_rate": 4.308547602501738e-07, + "logits/chosen": -1.6826022863388062, + "logits/rejected": -0.6280582547187805, + "logps/chosen": -817.57763671875, + "logps/rejected": -353.19195556640625, + "loss": 0.5329, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6091701984405518, + "rewards/margins": 0.41327935457229614, + "rewards/rejected": 0.19589081406593323, + "step": 372 + }, + { + "epoch": 0.1555462885738115, + "grad_norm": 11.521952629089355, + "learning_rate": 4.3201297197127637e-07, + "logits/chosen": -0.48714402318000793, + "logits/rejected": -1.2125217914581299, + "logps/chosen": -816.1738891601562, + "logps/rejected": -465.6083984375, + "loss": 0.5041, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7348839044570923, + "rewards/margins": 0.4740303158760071, + "rewards/rejected": 0.2608535885810852, + "step": 373 + }, + { + "epoch": 0.1559633027522936, + "grad_norm": 48.49416732788086, + "learning_rate": 4.3317118369237903e-07, + "logits/chosen": -0.7028602957725525, + "logits/rejected": -0.7399444580078125, + "logps/chosen": -1359.393310546875, + "logps/rejected": -589.7894287109375, + "loss": 0.5654, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5138145685195923, + "rewards/margins": 0.29820841550827026, + "rewards/rejected": 0.21560612320899963, + "step": 374 + }, + { + "epoch": 0.15638031693077564, + "grad_norm": 9.126105308532715, + "learning_rate": 4.343293954134816e-07, + "logits/chosen": -0.3981127440929413, + "logits/rejected": -1.046824336051941, + "logps/chosen": -786.0223999023438, + "logps/rejected": -388.6266174316406, + "loss": 0.4294, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.88287353515625, + "rewards/margins": 0.6807083487510681, + "rewards/rejected": 0.20216523110866547, + "step": 375 + }, + { + "epoch": 0.15679733110925773, + "grad_norm": 13.52473258972168, + "learning_rate": 4.3548760713458426e-07, + "logits/chosen": -0.8351296782493591, + "logits/rejected": -0.9293588995933533, + "logps/chosen": -1092.8701171875, + "logps/rejected": -719.095458984375, + "loss": 0.4709, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.931023359298706, + "rewards/margins": 0.549706220626831, + "rewards/rejected": 0.381317138671875, + "step": 376 + }, + { + "epoch": 0.15721434528773978, + "grad_norm": 9.77913761138916, + "learning_rate": 4.366458188556869e-07, + "logits/chosen": -0.2266019582748413, + "logits/rejected": -0.9905781149864197, + "logps/chosen": -828.0623779296875, + "logps/rejected": -554.344482421875, + "loss": 0.4512, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9684123992919922, + "rewards/margins": 0.6247425079345703, + "rewards/rejected": 0.3436698913574219, + "step": 377 + }, + { + "epoch": 0.15763135946622187, + "grad_norm": 11.354632377624512, + "learning_rate": 4.3780403057678944e-07, + "logits/chosen": -1.104763150215149, + "logits/rejected": NaN, + "logps/chosen": -991.6264038085938, + "logps/rejected": -446.3929748535156, + "loss": 0.4343, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8210197687149048, + "rewards/margins": 0.6701938509941101, + "rewards/rejected": 0.1508258879184723, + "step": 378 + }, + { + "epoch": 0.15804837364470392, + "grad_norm": 13.129554748535156, + "learning_rate": 4.389622422978921e-07, + "logits/chosen": -0.21491341292858124, + "logits/rejected": -1.0178534984588623, + "logps/chosen": -1174.4798583984375, + "logps/rejected": -687.4852294921875, + "loss": 0.5161, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8787490725517273, + "rewards/margins": 0.5874553918838501, + "rewards/rejected": 0.291293740272522, + "step": 379 + }, + { + "epoch": 0.15846538782318598, + "grad_norm": 14.93078327178955, + "learning_rate": 4.4012045401899467e-07, + "logits/chosen": -1.1661901473999023, + "logits/rejected": -0.7830691337585449, + "logps/chosen": -962.7269287109375, + "logps/rejected": -565.2860107421875, + "loss": 0.5686, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6201667785644531, + "rewards/margins": 0.29999351501464844, + "rewards/rejected": 0.3201732635498047, + "step": 380 + }, + { + "epoch": 0.15888240200166806, + "grad_norm": 51.15723419189453, + "learning_rate": 4.4127866574009733e-07, + "logits/chosen": -0.9445866346359253, + "logits/rejected": -0.8365349769592285, + "logps/chosen": -1005.3157958984375, + "logps/rejected": -494.9892883300781, + "loss": 0.4425, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9201198816299438, + "rewards/margins": 0.6145691275596619, + "rewards/rejected": 0.305550754070282, + "step": 381 + }, + { + "epoch": 0.15929941618015012, + "grad_norm": 9.431507110595703, + "learning_rate": 4.4243687746119995e-07, + "logits/chosen": -0.46435022354125977, + "logits/rejected": -1.3163169622421265, + "logps/chosen": -793.6776123046875, + "logps/rejected": -389.82452392578125, + "loss": 0.3767, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0399771928787231, + "rewards/margins": 0.8441517353057861, + "rewards/rejected": 0.19582557678222656, + "step": 382 + }, + { + "epoch": 0.1597164303586322, + "grad_norm": 11.74196720123291, + "learning_rate": 4.4359508918230256e-07, + "logits/chosen": -0.1642545908689499, + "logits/rejected": -0.9364635348320007, + "logps/chosen": -966.7145385742188, + "logps/rejected": -566.3836669921875, + "loss": 0.463, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8481498956680298, + "rewards/margins": 0.6093059778213501, + "rewards/rejected": 0.23884394764900208, + "step": 383 + }, + { + "epoch": 0.16013344453711426, + "grad_norm": 14.56221866607666, + "learning_rate": 4.447533009034052e-07, + "logits/chosen": -0.780667245388031, + "logits/rejected": -0.6983203887939453, + "logps/chosen": -1070.5341796875, + "logps/rejected": -589.2648315429688, + "loss": 0.5254, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.754633367061615, + "rewards/margins": 0.3904529809951782, + "rewards/rejected": 0.36418038606643677, + "step": 384 + }, + { + "epoch": 0.16055045871559634, + "grad_norm": 10.382242202758789, + "learning_rate": 4.4591151262450784e-07, + "logits/chosen": -0.5092645287513733, + "logits/rejected": -0.9581844806671143, + "logps/chosen": -974.4227905273438, + "logps/rejected": -594.760009765625, + "loss": 0.3599, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2044953107833862, + "rewards/margins": 0.9050499796867371, + "rewards/rejected": 0.29944533109664917, + "step": 385 + }, + { + "epoch": 0.1609674728940784, + "grad_norm": 11.160368919372559, + "learning_rate": 4.470697243456104e-07, + "logits/chosen": -0.7419039011001587, + "logits/rejected": -0.8124860525131226, + "logps/chosen": -1029.4288330078125, + "logps/rejected": -571.4923095703125, + "loss": 0.4011, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0966904163360596, + "rewards/margins": 0.8046284317970276, + "rewards/rejected": 0.292061984539032, + "step": 386 + }, + { + "epoch": 0.16138448707256048, + "grad_norm": 113.7445297241211, + "learning_rate": 4.48227936066713e-07, + "logits/chosen": -0.7659960985183716, + "logits/rejected": -0.6691083908081055, + "logps/chosen": -1328.605224609375, + "logps/rejected": -739.8016967773438, + "loss": 0.5522, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4544025659561157, + "rewards/margins": 0.3320876955986023, + "rewards/rejected": 0.12231483310461044, + "step": 387 + }, + { + "epoch": 0.16180150125104253, + "grad_norm": 10.511970520019531, + "learning_rate": 4.4938614778781563e-07, + "logits/chosen": -0.6387245655059814, + "logits/rejected": -1.0864508152008057, + "logps/chosen": -1009.6182861328125, + "logps/rejected": -572.1314086914062, + "loss": 0.3754, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.262943983078003, + "rewards/margins": 0.8976157903671265, + "rewards/rejected": 0.36532822251319885, + "step": 388 + }, + { + "epoch": 0.1622185154295246, + "grad_norm": 9.82158088684082, + "learning_rate": 4.5054435950891825e-07, + "logits/chosen": -1.1494851112365723, + "logits/rejected": -0.8683257102966309, + "logps/chosen": -771.0970458984375, + "logps/rejected": -373.26275634765625, + "loss": 0.4307, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7890560030937195, + "rewards/margins": 0.6587884426116943, + "rewards/rejected": 0.13026753067970276, + "step": 389 + }, + { + "epoch": 0.16263552960800667, + "grad_norm": 19.325788497924805, + "learning_rate": 4.517025712300209e-07, + "logits/chosen": -1.9514497518539429, + "logits/rejected": -0.3647792637348175, + "logps/chosen": -1589.0023193359375, + "logps/rejected": -808.64794921875, + "loss": 0.4462, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7337280511856079, + "rewards/margins": 0.6055451035499573, + "rewards/rejected": 0.12818299233913422, + "step": 390 + }, + { + "epoch": 0.16305254378648873, + "grad_norm": 9.379402160644531, + "learning_rate": 4.528607829511235e-07, + "logits/chosen": -0.3800397217273712, + "logits/rejected": -0.8778358697891235, + "logps/chosen": -928.1690673828125, + "logps/rejected": -553.2015991210938, + "loss": 0.4251, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.031315565109253, + "rewards/margins": 0.6723453402519226, + "rewards/rejected": 0.3589702546596527, + "step": 391 + }, + { + "epoch": 0.1634695579649708, + "grad_norm": 12.427104949951172, + "learning_rate": 4.5401899467222614e-07, + "logits/chosen": -0.8805733919143677, + "logits/rejected": -0.8505033254623413, + "logps/chosen": -1013.798583984375, + "logps/rejected": -674.81298828125, + "loss": 0.4305, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.1119362115859985, + "rewards/margins": 0.7497532367706299, + "rewards/rejected": 0.36218300461769104, + "step": 392 + }, + { + "epoch": 0.16388657214345287, + "grad_norm": 21.36281394958496, + "learning_rate": 4.551772063933287e-07, + "logits/chosen": -1.1938717365264893, + "logits/rejected": -0.590828001499176, + "logps/chosen": -1153.324951171875, + "logps/rejected": -591.6104125976562, + "loss": 0.484, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7626121640205383, + "rewards/margins": 0.5597617626190186, + "rewards/rejected": 0.2028503566980362, + "step": 393 + }, + { + "epoch": 0.16430358632193495, + "grad_norm": 15.094047546386719, + "learning_rate": 4.5633541811443137e-07, + "logits/chosen": -0.8496018648147583, + "logits/rejected": -0.985026478767395, + "logps/chosen": -888.77587890625, + "logps/rejected": -509.3603210449219, + "loss": 0.4299, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1138012409210205, + "rewards/margins": 0.749058723449707, + "rewards/rejected": 0.3647424876689911, + "step": 394 + }, + { + "epoch": 0.164720600500417, + "grad_norm": 9.67310619354248, + "learning_rate": 4.57493629835534e-07, + "logits/chosen": -0.4071289896965027, + "logits/rejected": -1.214938998222351, + "logps/chosen": -613.734130859375, + "logps/rejected": -373.37359619140625, + "loss": 0.4041, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0293824672698975, + "rewards/margins": 0.72564697265625, + "rewards/rejected": 0.3037353754043579, + "step": 395 + }, + { + "epoch": 0.1651376146788991, + "grad_norm": 33.515262603759766, + "learning_rate": 4.5865184155663654e-07, + "logits/chosen": -0.8824983835220337, + "logits/rejected": -0.7522385120391846, + "logps/chosen": -1108.8349609375, + "logps/rejected": -566.99853515625, + "loss": 0.3649, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1444813013076782, + "rewards/margins": 0.9490276575088501, + "rewards/rejected": 0.19545362889766693, + "step": 396 + }, + { + "epoch": 0.16555462885738115, + "grad_norm": 140.37753295898438, + "learning_rate": 4.598100532777392e-07, + "logits/chosen": -1.3900153636932373, + "logits/rejected": -0.672933042049408, + "logps/chosen": -1040.2332763671875, + "logps/rejected": -517.0945434570312, + "loss": 0.3938, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8762657642364502, + "rewards/margins": 0.7937136888504028, + "rewards/rejected": 0.08255195617675781, + "step": 397 + }, + { + "epoch": 0.16597164303586323, + "grad_norm": 25.180227279663086, + "learning_rate": 4.609682649988418e-07, + "logits/chosen": -1.583716630935669, + "logits/rejected": -0.8964716196060181, + "logps/chosen": -1130.8358154296875, + "logps/rejected": -615.9847412109375, + "loss": 0.4079, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9525130987167358, + "rewards/margins": 0.7474132776260376, + "rewards/rejected": 0.20509986579418182, + "step": 398 + }, + { + "epoch": 0.1663886572143453, + "grad_norm": 22.23629379272461, + "learning_rate": 4.6212647671994444e-07, + "logits/chosen": -0.693473756313324, + "logits/rejected": -0.9743697643280029, + "logps/chosen": -1003.96435546875, + "logps/rejected": -484.1935729980469, + "loss": 0.3882, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9626343250274658, + "rewards/margins": 0.7900956869125366, + "rewards/rejected": 0.17253856360912323, + "step": 399 + }, + { + "epoch": 0.16680567139282734, + "grad_norm": 11.062320709228516, + "learning_rate": 4.6328468844104705e-07, + "logits/chosen": -1.0075571537017822, + "logits/rejected": NaN, + "logps/chosen": -913.0453491210938, + "logps/rejected": -449.5362854003906, + "loss": 0.4604, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9872928857803345, + "rewards/margins": 0.6148762106895447, + "rewards/rejected": 0.3724166750907898, + "step": 400 + }, + { + "epoch": 0.16722268557130943, + "grad_norm": 8.61754322052002, + "learning_rate": 4.644429001621497e-07, + "logits/chosen": -0.15306539833545685, + "logits/rejected": -1.1243205070495605, + "logps/chosen": -925.0551147460938, + "logps/rejected": -454.5146179199219, + "loss": 0.3252, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1746994256973267, + "rewards/margins": 1.0145902633666992, + "rewards/rejected": 0.16010914742946625, + "step": 401 + }, + { + "epoch": 0.16763969974979148, + "grad_norm": 11.858991622924805, + "learning_rate": 4.656011118832523e-07, + "logits/chosen": -0.8557257652282715, + "logits/rejected": -0.6695033311843872, + "logps/chosen": -1155.0181884765625, + "logps/rejected": -650.646240234375, + "loss": 0.4029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1554802656173706, + "rewards/margins": 0.8201553225517273, + "rewards/rejected": 0.33532488346099854, + "step": 402 + }, + { + "epoch": 0.16805671392827357, + "grad_norm": 13.933055877685547, + "learning_rate": 4.6675932360435495e-07, + "logits/chosen": -0.6507976651191711, + "logits/rejected": -0.929985761642456, + "logps/chosen": -1204.5203857421875, + "logps/rejected": -705.2244873046875, + "loss": 0.4544, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1383635997772217, + "rewards/margins": 0.6741405129432678, + "rewards/rejected": 0.46422311663627625, + "step": 403 + }, + { + "epoch": 0.16847372810675562, + "grad_norm": 25.26422882080078, + "learning_rate": 4.679175353254575e-07, + "logits/chosen": -0.8579668402671814, + "logits/rejected": -1.0872234106063843, + "logps/chosen": -957.3197631835938, + "logps/rejected": -465.9827575683594, + "loss": 0.3829, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0461105108261108, + "rewards/margins": 0.8798774480819702, + "rewards/rejected": 0.16623306274414062, + "step": 404 + }, + { + "epoch": 0.1688907422852377, + "grad_norm": 18.8845157623291, + "learning_rate": 4.690757470465602e-07, + "logits/chosen": -1.0694754123687744, + "logits/rejected": -0.814071774482727, + "logps/chosen": -1088.4847412109375, + "logps/rejected": -599.3392944335938, + "loss": 0.4367, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7839027643203735, + "rewards/margins": 0.6503978371620178, + "rewards/rejected": 0.13350486755371094, + "step": 405 + }, + { + "epoch": 0.16930775646371976, + "grad_norm": 47.680233001708984, + "learning_rate": 4.702339587676628e-07, + "logits/chosen": -0.6232154369354248, + "logits/rejected": -0.8253195881843567, + "logps/chosen": -1027.8773193359375, + "logps/rejected": -496.5013122558594, + "loss": 0.3653, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.102817177772522, + "rewards/margins": 0.8725694417953491, + "rewards/rejected": 0.23024769127368927, + "step": 406 + }, + { + "epoch": 0.16972477064220184, + "grad_norm": 8.433282852172852, + "learning_rate": 4.7139217048876535e-07, + "logits/chosen": -0.3030654489994049, + "logits/rejected": -1.1567107439041138, + "logps/chosen": -662.6865234375, + "logps/rejected": -374.7866516113281, + "loss": 0.4147, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8907737731933594, + "rewards/margins": 0.7211607694625854, + "rewards/rejected": 0.1696130782365799, + "step": 407 + }, + { + "epoch": 0.1701417848206839, + "grad_norm": 8.495416641235352, + "learning_rate": 4.72550382209868e-07, + "logits/chosen": -0.8343731164932251, + "logits/rejected": NaN, + "logps/chosen": -619.1998291015625, + "logps/rejected": -286.8651123046875, + "loss": 0.4165, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8079773187637329, + "rewards/margins": 0.7249773144721985, + "rewards/rejected": 0.08299998939037323, + "step": 408 + }, + { + "epoch": 0.17055879899916598, + "grad_norm": 9.458263397216797, + "learning_rate": 4.737085939309706e-07, + "logits/chosen": -0.2492261379957199, + "logits/rejected": -1.0585298538208008, + "logps/chosen": -1041.676025390625, + "logps/rejected": -488.0811767578125, + "loss": 0.3269, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4591286182403564, + "rewards/margins": 1.1358743906021118, + "rewards/rejected": 0.32325422763824463, + "step": 409 + }, + { + "epoch": 0.17097581317764804, + "grad_norm": 10.47709846496582, + "learning_rate": 4.7486680565207325e-07, + "logits/chosen": -1.2359280586242676, + "logits/rejected": -0.7334581017494202, + "logps/chosen": -1095.6646728515625, + "logps/rejected": -561.9085083007812, + "loss": 0.3769, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9923122525215149, + "rewards/margins": 0.8236513733863831, + "rewards/rejected": 0.1686609387397766, + "step": 410 + }, + { + "epoch": 0.1713928273561301, + "grad_norm": 7.849473476409912, + "learning_rate": 4.7602501737317586e-07, + "logits/chosen": -0.5295383930206299, + "logits/rejected": -1.1938672065734863, + "logps/chosen": -722.981201171875, + "logps/rejected": -387.9132080078125, + "loss": 0.3571, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.174231767654419, + "rewards/margins": 0.9179754257202148, + "rewards/rejected": 0.2562562823295593, + "step": 411 + }, + { + "epoch": 0.17180984153461218, + "grad_norm": 10.466808319091797, + "learning_rate": 4.771832290942785e-07, + "logits/chosen": -1.203951358795166, + "logits/rejected": NaN, + "logps/chosen": -1220.3145751953125, + "logps/rejected": -501.0300598144531, + "loss": 0.3931, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0418198108673096, + "rewards/margins": 0.8159888982772827, + "rewards/rejected": 0.2258308380842209, + "step": 412 + }, + { + "epoch": 0.17222685571309423, + "grad_norm": 10.123409271240234, + "learning_rate": 4.78341440815381e-07, + "logits/chosen": -0.0557040311396122, + "logits/rejected": -0.8783829212188721, + "logps/chosen": -1253.914794921875, + "logps/rejected": -759.3851318359375, + "loss": 0.483, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8397613763809204, + "rewards/margins": 0.5337961912155151, + "rewards/rejected": 0.30596524477005005, + "step": 413 + }, + { + "epoch": 0.17264386989157632, + "grad_norm": 12.575692176818848, + "learning_rate": 4.794996525364837e-07, + "logits/chosen": -0.40062034130096436, + "logits/rejected": -0.8996962308883667, + "logps/chosen": -924.6319580078125, + "logps/rejected": -541.8916625976562, + "loss": 0.5024, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9795425534248352, + "rewards/margins": 0.5030224323272705, + "rewards/rejected": 0.4765201210975647, + "step": 414 + }, + { + "epoch": 0.17306088407005837, + "grad_norm": 12.040243148803711, + "learning_rate": 4.806578642575864e-07, + "logits/chosen": -0.2932499349117279, + "logits/rejected": -0.9548584222793579, + "logps/chosen": -972.1904907226562, + "logps/rejected": -459.621337890625, + "loss": 0.3066, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4992046356201172, + "rewards/margins": 1.2102619409561157, + "rewards/rejected": 0.28894275426864624, + "step": 415 + }, + { + "epoch": 0.17347789824854046, + "grad_norm": 9.899023056030273, + "learning_rate": 4.818160759786889e-07, + "logits/chosen": -0.403033971786499, + "logits/rejected": -1.194678783416748, + "logps/chosen": -720.298095703125, + "logps/rejected": -408.6534729003906, + "loss": 0.3847, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2899761199951172, + "rewards/margins": 0.8679059743881226, + "rewards/rejected": 0.42207011580467224, + "step": 416 + }, + { + "epoch": 0.1738949124270225, + "grad_norm": 14.706360816955566, + "learning_rate": 4.829742876997916e-07, + "logits/chosen": -1.2809292078018188, + "logits/rejected": -0.41354864835739136, + "logps/chosen": -1515.0595703125, + "logps/rejected": -1003.6505126953125, + "loss": 0.3234, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6258530616760254, + "rewards/margins": 1.1887860298156738, + "rewards/rejected": 0.43706703186035156, + "step": 417 + }, + { + "epoch": 0.1743119266055046, + "grad_norm": 10.054646492004395, + "learning_rate": 4.841324994208942e-07, + "logits/chosen": -0.9598813056945801, + "logits/rejected": -0.9289623498916626, + "logps/chosen": -889.48681640625, + "logps/rejected": -513.097412109375, + "loss": 0.3994, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.074557900428772, + "rewards/margins": 0.7746117115020752, + "rewards/rejected": 0.29994621872901917, + "step": 418 + }, + { + "epoch": 0.17472894078398665, + "grad_norm": 10.115426063537598, + "learning_rate": 4.852907111419968e-07, + "logits/chosen": -0.7234262824058533, + "logits/rejected": -0.7831937074661255, + "logps/chosen": -979.416748046875, + "logps/rejected": -502.08551025390625, + "loss": 0.3772, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2956442832946777, + "rewards/margins": 0.9065343141555786, + "rewards/rejected": 0.38911011815071106, + "step": 419 + }, + { + "epoch": 0.17514595496246874, + "grad_norm": 25.85379409790039, + "learning_rate": 4.864489228630994e-07, + "logits/chosen": -1.3227511644363403, + "logits/rejected": -0.6083233952522278, + "logps/chosen": -994.1002197265625, + "logps/rejected": -455.73638916015625, + "loss": 0.4105, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8764488101005554, + "rewards/margins": 0.7780067920684814, + "rewards/rejected": 0.09844207018613815, + "step": 420 + }, + { + "epoch": 0.1755629691409508, + "grad_norm": 8.345857620239258, + "learning_rate": 4.876071345842021e-07, + "logits/chosen": -0.6938669681549072, + "logits/rejected": -0.9361906051635742, + "logps/chosen": -836.5305786132812, + "logps/rejected": -462.95184326171875, + "loss": 0.3229, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3967605829238892, + "rewards/margins": 1.0225995779037476, + "rewards/rejected": 0.3741609752178192, + "step": 421 + }, + { + "epoch": 0.17597998331943285, + "grad_norm": 63.941471099853516, + "learning_rate": 4.887653463053046e-07, + "logits/chosen": -0.733321487903595, + "logits/rejected": -0.9194365739822388, + "logps/chosen": -932.4705200195312, + "logps/rejected": -490.20281982421875, + "loss": 0.3507, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2234482765197754, + "rewards/margins": 1.0002437829971313, + "rewards/rejected": 0.22320443391799927, + "step": 422 + }, + { + "epoch": 0.17639699749791493, + "grad_norm": 14.224835395812988, + "learning_rate": 4.899235580264073e-07, + "logits/chosen": -0.15482914447784424, + "logits/rejected": -0.84355229139328, + "logps/chosen": -1113.349609375, + "logps/rejected": -654.54833984375, + "loss": 0.4489, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2364349365234375, + "rewards/margins": 0.7900348901748657, + "rewards/rejected": 0.44640007615089417, + "step": 423 + }, + { + "epoch": 0.176814011676397, + "grad_norm": 9.535834312438965, + "learning_rate": 4.910817697475098e-07, + "logits/chosen": -0.8921198844909668, + "logits/rejected": -1.118802785873413, + "logps/chosen": -776.56005859375, + "logps/rejected": -411.87640380859375, + "loss": 0.3705, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2364059686660767, + "rewards/margins": 0.8972746133804321, + "rewards/rejected": 0.33913135528564453, + "step": 424 + }, + { + "epoch": 0.17723102585487907, + "grad_norm": 13.454277992248535, + "learning_rate": 4.922399814686125e-07, + "logits/chosen": -1.277766466140747, + "logits/rejected": NaN, + "logps/chosen": -1107.3541259765625, + "logps/rejected": -548.9771728515625, + "loss": 0.4959, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6238945126533508, + "rewards/margins": 0.49188232421875, + "rewards/rejected": 0.13201217353343964, + "step": 425 + }, + { + "epoch": 0.17764804003336113, + "grad_norm": 21.66693115234375, + "learning_rate": 4.933981931897152e-07, + "logits/chosen": -0.8080180287361145, + "logits/rejected": -0.842300534248352, + "logps/chosen": -1001.2941284179688, + "logps/rejected": -523.3444213867188, + "loss": 0.356, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3683230876922607, + "rewards/margins": 1.0670900344848633, + "rewards/rejected": 0.30123311281204224, + "step": 426 + }, + { + "epoch": 0.1780650542118432, + "grad_norm": 8.643423080444336, + "learning_rate": 4.945564049108177e-07, + "logits/chosen": -0.060991790145635605, + "logits/rejected": -0.896175742149353, + "logps/chosen": -1200.8221435546875, + "logps/rejected": -588.8486328125, + "loss": 0.3612, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3223366737365723, + "rewards/margins": 0.9656999111175537, + "rewards/rejected": 0.35663682222366333, + "step": 427 + }, + { + "epoch": 0.17848206839032527, + "grad_norm": 10.878077507019043, + "learning_rate": 4.957146166319204e-07, + "logits/chosen": -0.4394509792327881, + "logits/rejected": -1.0909404754638672, + "logps/chosen": -759.8900146484375, + "logps/rejected": -471.627685546875, + "loss": 0.4848, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0490875244140625, + "rewards/margins": 0.7036136984825134, + "rewards/rejected": 0.34547385573387146, + "step": 428 + }, + { + "epoch": 0.17889908256880735, + "grad_norm": 9.747179985046387, + "learning_rate": 4.96872828353023e-07, + "logits/chosen": -0.6771948337554932, + "logits/rejected": -0.9167685508728027, + "logps/chosen": -821.6554565429688, + "logps/rejected": -416.5440673828125, + "loss": 0.3585, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0864949226379395, + "rewards/margins": 0.8796165585517883, + "rewards/rejected": 0.20687828958034515, + "step": 429 + }, + { + "epoch": 0.1793160967472894, + "grad_norm": 17.712642669677734, + "learning_rate": 4.980310400741256e-07, + "logits/chosen": -0.021820232272148132, + "logits/rejected": -0.8258620500564575, + "logps/chosen": -1120.791748046875, + "logps/rejected": -619.8856201171875, + "loss": 0.3738, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0649696588516235, + "rewards/margins": 0.9522903561592102, + "rewards/rejected": 0.11267929524183273, + "step": 430 + }, + { + "epoch": 0.1797331109257715, + "grad_norm": 9.64108657836914, + "learning_rate": 4.991892517952282e-07, + "logits/chosen": -0.6383921504020691, + "logits/rejected": -0.9700891375541687, + "logps/chosen": -814.2255859375, + "logps/rejected": -469.74578857421875, + "loss": 0.4033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.032196044921875, + "rewards/margins": 0.782368540763855, + "rewards/rejected": 0.2498275637626648, + "step": 431 + }, + { + "epoch": 0.18015012510425354, + "grad_norm": 35.772544860839844, + "learning_rate": 5.003474635163309e-07, + "logits/chosen": -0.7210122346878052, + "logits/rejected": -0.9485037326812744, + "logps/chosen": -841.4863891601562, + "logps/rejected": -468.5830993652344, + "loss": 0.3743, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.092629313468933, + "rewards/margins": 0.9248784780502319, + "rewards/rejected": 0.167750746011734, + "step": 432 + }, + { + "epoch": 0.1805671392827356, + "grad_norm": 10.29698657989502, + "learning_rate": 5.015056752374334e-07, + "logits/chosen": -1.3136658668518066, + "logits/rejected": -0.4952349066734314, + "logps/chosen": -1367.1907958984375, + "logps/rejected": -745.05029296875, + "loss": 0.373, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0556800365447998, + "rewards/margins": 0.8719177842140198, + "rewards/rejected": 0.1837623566389084, + "step": 433 + }, + { + "epoch": 0.18098415346121768, + "grad_norm": 22.594860076904297, + "learning_rate": 5.02663886958536e-07, + "logits/chosen": -0.9725524187088013, + "logits/rejected": -0.7625365257263184, + "logps/chosen": -1332.3341064453125, + "logps/rejected": -723.8422241210938, + "loss": 0.3417, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0934829711914062, + "rewards/margins": 0.9762731790542603, + "rewards/rejected": 0.11720981448888779, + "step": 434 + }, + { + "epoch": 0.18140116763969974, + "grad_norm": 76.25181579589844, + "learning_rate": 5.038220986796387e-07, + "logits/chosen": -0.8150256872177124, + "logits/rejected": -0.5543412566184998, + "logps/chosen": -1393.7176513671875, + "logps/rejected": -503.2115478515625, + "loss": 0.4073, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8021858334541321, + "rewards/margins": 0.8513728976249695, + "rewards/rejected": -0.04918709024786949, + "step": 435 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 8.156095504760742, + "learning_rate": 5.049803104007413e-07, + "logits/chosen": -0.2668836712837219, + "logits/rejected": -1.2863471508026123, + "logps/chosen": -915.9588623046875, + "logps/rejected": -445.58831787109375, + "loss": 0.3217, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3466705083847046, + "rewards/margins": 1.1101391315460205, + "rewards/rejected": 0.23653146624565125, + "step": 436 + }, + { + "epoch": 0.18223519599666388, + "grad_norm": 10.595657348632812, + "learning_rate": 5.061385221218439e-07, + "logits/chosen": -1.2793935537338257, + "logits/rejected": NaN, + "logps/chosen": -856.7852783203125, + "logps/rejected": -470.69793701171875, + "loss": 0.435, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9648643732070923, + "rewards/margins": 0.6660835146903992, + "rewards/rejected": 0.2987808287143707, + "step": 437 + }, + { + "epoch": 0.18265221017514596, + "grad_norm": 10.969059944152832, + "learning_rate": 5.072967338429465e-07, + "logits/chosen": -0.4248613715171814, + "logits/rejected": -1.0245875120162964, + "logps/chosen": -910.3629150390625, + "logps/rejected": -571.6388549804688, + "loss": 0.2823, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6152580976486206, + "rewards/margins": 1.2807300090789795, + "rewards/rejected": 0.33452796936035156, + "step": 438 + }, + { + "epoch": 0.18306922435362802, + "grad_norm": 8.81064224243164, + "learning_rate": 5.084549455640492e-07, + "logits/chosen": -0.40281397104263306, + "logits/rejected": -1.0130987167358398, + "logps/chosen": -915.0746459960938, + "logps/rejected": -545.9220581054688, + "loss": 0.283, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.803553819656372, + "rewards/margins": 1.350416660308838, + "rewards/rejected": 0.45313721895217896, + "step": 439 + }, + { + "epoch": 0.1834862385321101, + "grad_norm": 7.345049858093262, + "learning_rate": 5.096131572851518e-07, + "logits/chosen": -0.9484488368034363, + "logits/rejected": NaN, + "logps/chosen": -702.7348022460938, + "logps/rejected": -268.2144775390625, + "loss": 0.3488, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1625409126281738, + "rewards/margins": 1.0043141841888428, + "rewards/rejected": 0.15822657942771912, + "step": 440 + }, + { + "epoch": 0.18390325271059216, + "grad_norm": 29.2155818939209, + "learning_rate": 5.107713690062544e-07, + "logits/chosen": -0.971724271774292, + "logits/rejected": -1.8838496208190918, + "logps/chosen": -835.9578247070312, + "logps/rejected": -343.8309326171875, + "loss": 0.5722, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5076858401298523, + "rewards/margins": 0.40653589367866516, + "rewards/rejected": 0.10114996880292892, + "step": 441 + }, + { + "epoch": 0.18432026688907424, + "grad_norm": 14.413673400878906, + "learning_rate": 5.11929580727357e-07, + "logits/chosen": -0.3425869941711426, + "logits/rejected": -0.8565261960029602, + "logps/chosen": -974.3545532226562, + "logps/rejected": -595.45947265625, + "loss": 0.4811, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2816375494003296, + "rewards/margins": 0.6104560494422913, + "rewards/rejected": 0.6711814999580383, + "step": 442 + }, + { + "epoch": 0.1847372810675563, + "grad_norm": 13.203817367553711, + "learning_rate": 5.130877924484596e-07, + "logits/chosen": -1.1646212339401245, + "logits/rejected": -0.5620394945144653, + "logps/chosen": -1210.6065673828125, + "logps/rejected": -650.1663208007812, + "loss": 0.365, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0617852210998535, + "rewards/margins": 0.864620566368103, + "rewards/rejected": 0.19716455042362213, + "step": 443 + }, + { + "epoch": 0.18515429524603835, + "grad_norm": 7.643621444702148, + "learning_rate": 5.142460041695622e-07, + "logits/chosen": -0.24690644443035126, + "logits/rejected": -1.0483318567276, + "logps/chosen": -1019.3153686523438, + "logps/rejected": -431.1500244140625, + "loss": 0.2518, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5888878107070923, + "rewards/margins": 1.3414969444274902, + "rewards/rejected": 0.2473907470703125, + "step": 444 + }, + { + "epoch": 0.18557130942452044, + "grad_norm": 7.3949875831604, + "learning_rate": 5.154042158906648e-07, + "logits/chosen": -0.8596327900886536, + "logits/rejected": -1.0333492755889893, + "logps/chosen": -887.9427490234375, + "logps/rejected": -453.2072448730469, + "loss": 0.2844, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5352210998535156, + "rewards/margins": 1.3478690385818481, + "rewards/rejected": 0.18735218048095703, + "step": 445 + }, + { + "epoch": 0.1859883236030025, + "grad_norm": 10.564215660095215, + "learning_rate": 5.165624276117675e-07, + "logits/chosen": -0.7910406589508057, + "logits/rejected": -0.9386079907417297, + "logps/chosen": -860.8548583984375, + "logps/rejected": -468.6083984375, + "loss": 0.4419, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.170739769935608, + "rewards/margins": 0.7561496496200562, + "rewards/rejected": 0.41459009051322937, + "step": 446 + }, + { + "epoch": 0.18640533778148458, + "grad_norm": 17.476667404174805, + "learning_rate": 5.177206393328701e-07, + "logits/chosen": -1.031386137008667, + "logits/rejected": -0.914681077003479, + "logps/chosen": -758.34619140625, + "logps/rejected": -338.1011962890625, + "loss": 0.4815, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0455631017684937, + "rewards/margins": 0.5733994841575623, + "rewards/rejected": 0.4721636474132538, + "step": 447 + }, + { + "epoch": 0.18682235195996663, + "grad_norm": 11.556873321533203, + "learning_rate": 5.188788510539727e-07, + "logits/chosen": -1.2297561168670654, + "logits/rejected": -0.740973949432373, + "logps/chosen": -1085.2034912109375, + "logps/rejected": -679.058837890625, + "loss": 0.3518, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.343300700187683, + "rewards/margins": 1.0664196014404297, + "rewards/rejected": 0.27688103914260864, + "step": 448 + }, + { + "epoch": 0.18723936613844872, + "grad_norm": 29.376041412353516, + "learning_rate": 5.200370627750754e-07, + "logits/chosen": -0.7171022295951843, + "logits/rejected": -0.8553313612937927, + "logps/chosen": -1202.4296875, + "logps/rejected": -670.6727294921875, + "loss": 0.3419, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4389755725860596, + "rewards/margins": 0.991758406162262, + "rewards/rejected": 0.44721719622612, + "step": 449 + }, + { + "epoch": 0.18765638031693077, + "grad_norm": 23.96797752380371, + "learning_rate": 5.21195274496178e-07, + "logits/chosen": -0.2334120273590088, + "logits/rejected": -0.6994651556015015, + "logps/chosen": -1068.4002685546875, + "logps/rejected": -628.2809448242188, + "loss": 0.4991, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1480633020401, + "rewards/margins": 0.6775131225585938, + "rewards/rejected": 0.47055014967918396, + "step": 450 + }, + { + "epoch": 0.18807339449541285, + "grad_norm": 8.404180526733398, + "learning_rate": 5.223534862172806e-07, + "logits/chosen": -0.7989797592163086, + "logits/rejected": -0.7413102388381958, + "logps/chosen": -954.3269653320312, + "logps/rejected": -523.7776489257812, + "loss": 0.3227, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4445083141326904, + "rewards/margins": 1.105954885482788, + "rewards/rejected": 0.33855342864990234, + "step": 451 + }, + { + "epoch": 0.1884904086738949, + "grad_norm": 7.193758010864258, + "learning_rate": 5.235116979383831e-07, + "logits/chosen": -0.6048176288604736, + "logits/rejected": -0.9627269506454468, + "logps/chosen": -943.6515502929688, + "logps/rejected": -426.4438781738281, + "loss": 0.2545, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.667619228363037, + "rewards/margins": 1.4131730794906616, + "rewards/rejected": 0.25444620847702026, + "step": 452 + }, + { + "epoch": 0.188907422852377, + "grad_norm": 6.214148998260498, + "learning_rate": 5.246699096594858e-07, + "logits/chosen": -1.267393708229065, + "logits/rejected": -0.6937335133552551, + "logps/chosen": -938.540283203125, + "logps/rejected": -354.635498046875, + "loss": 0.3316, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2974106073379517, + "rewards/margins": 1.147909164428711, + "rewards/rejected": 0.14950142800807953, + "step": 453 + }, + { + "epoch": 0.18932443703085905, + "grad_norm": 9.67900562286377, + "learning_rate": 5.258281213805884e-07, + "logits/chosen": -0.4650893807411194, + "logits/rejected": -1.0413308143615723, + "logps/chosen": -757.0244750976562, + "logps/rejected": -455.6717224121094, + "loss": 0.267, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.699060082435608, + "rewards/margins": 1.2585471868515015, + "rewards/rejected": 0.44051283597946167, + "step": 454 + }, + { + "epoch": 0.1897414512093411, + "grad_norm": 8.991933822631836, + "learning_rate": 5.26986333101691e-07, + "logits/chosen": -1.3615641593933105, + "logits/rejected": NaN, + "logps/chosen": -923.5623168945312, + "logps/rejected": -294.7633972167969, + "loss": 0.3904, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.335248589515686, + "rewards/margins": 1.0522887706756592, + "rewards/rejected": 0.2829597592353821, + "step": 455 + }, + { + "epoch": 0.1901584653878232, + "grad_norm": 9.706188201904297, + "learning_rate": 5.281445448227936e-07, + "logits/chosen": -0.48474422097206116, + "logits/rejected": -0.7741472721099854, + "logps/chosen": -941.8074951171875, + "logps/rejected": -483.53643798828125, + "loss": 0.3618, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1542637348175049, + "rewards/margins": 0.9527584314346313, + "rewards/rejected": 0.20150528848171234, + "step": 456 + }, + { + "epoch": 0.19057547956630524, + "grad_norm": 8.538509368896484, + "learning_rate": 5.293027565438963e-07, + "logits/chosen": -0.7253445982933044, + "logits/rejected": NaN, + "logps/chosen": -767.12158203125, + "logps/rejected": -411.4326477050781, + "loss": 0.4122, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1854336261749268, + "rewards/margins": 0.7853360772132874, + "rewards/rejected": 0.40009745955467224, + "step": 457 + }, + { + "epoch": 0.19099249374478733, + "grad_norm": 11.412631034851074, + "learning_rate": 5.304609682649988e-07, + "logits/chosen": -0.5738415122032166, + "logits/rejected": -0.7368472814559937, + "logps/chosen": -787.1163330078125, + "logps/rejected": -451.1216735839844, + "loss": 0.4972, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.012704849243164, + "rewards/margins": 0.5042995810508728, + "rewards/rejected": 0.508405327796936, + "step": 458 + }, + { + "epoch": 0.19140950792326938, + "grad_norm": 16.11037254333496, + "learning_rate": 5.316191799861015e-07, + "logits/chosen": -0.654745876789093, + "logits/rejected": -0.8685100078582764, + "logps/chosen": -965.368408203125, + "logps/rejected": -589.831298828125, + "loss": 0.4863, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9739273190498352, + "rewards/margins": 0.5190407037734985, + "rewards/rejected": 0.45488661527633667, + "step": 459 + }, + { + "epoch": 0.19182652210175147, + "grad_norm": 7.726375102996826, + "learning_rate": 5.327773917072042e-07, + "logits/chosen": -1.073552131652832, + "logits/rejected": NaN, + "logps/chosen": -900.2451171875, + "logps/rejected": -355.24029541015625, + "loss": 0.3422, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2405494451522827, + "rewards/margins": 1.0998375415802002, + "rewards/rejected": 0.1407119780778885, + "step": 460 + }, + { + "epoch": 0.19224353628023352, + "grad_norm": 17.408418655395508, + "learning_rate": 5.339356034283067e-07, + "logits/chosen": -1.223734736442566, + "logits/rejected": -0.688335120677948, + "logps/chosen": -950.4491577148438, + "logps/rejected": -450.11541748046875, + "loss": 0.3441, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1629596948623657, + "rewards/margins": 1.0419025421142578, + "rewards/rejected": 0.12105713039636612, + "step": 461 + }, + { + "epoch": 0.1926605504587156, + "grad_norm": 11.605844497680664, + "learning_rate": 5.350938151494094e-07, + "logits/chosen": -0.9530258774757385, + "logits/rejected": -0.6194846034049988, + "logps/chosen": -1317.913330078125, + "logps/rejected": -800.56396484375, + "loss": 0.4264, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4290275573730469, + "rewards/margins": 0.788550615310669, + "rewards/rejected": 0.6404770612716675, + "step": 462 + }, + { + "epoch": 0.19307756463719766, + "grad_norm": 8.053532600402832, + "learning_rate": 5.36252026870512e-07, + "logits/chosen": -0.23324890434741974, + "logits/rejected": -0.8272748589515686, + "logps/chosen": -838.7245483398438, + "logps/rejected": -509.93133544921875, + "loss": 0.3069, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.769778847694397, + "rewards/margins": 1.132826328277588, + "rewards/rejected": 0.6369526386260986, + "step": 463 + }, + { + "epoch": 0.19349457881567975, + "grad_norm": 6.846127986907959, + "learning_rate": 5.374102385916146e-07, + "logits/chosen": -0.49873167276382446, + "logits/rejected": -0.9698046445846558, + "logps/chosen": -993.0609130859375, + "logps/rejected": -551.240234375, + "loss": 0.2273, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8015908002853394, + "rewards/margins": 1.4089932441711426, + "rewards/rejected": 0.39259758591651917, + "step": 464 + }, + { + "epoch": 0.1939115929941618, + "grad_norm": 16.657190322875977, + "learning_rate": 5.385684503127172e-07, + "logits/chosen": -0.8914758563041687, + "logits/rejected": -0.9914072155952454, + "logps/chosen": -878.8090209960938, + "logps/rejected": -454.7811279296875, + "loss": 0.2653, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6095809936523438, + "rewards/margins": 1.4200468063354492, + "rewards/rejected": 0.18953420221805573, + "step": 465 + }, + { + "epoch": 0.19432860717264386, + "grad_norm": 6.123313903808594, + "learning_rate": 5.397266620338198e-07, + "logits/chosen": -0.24902071058750153, + "logits/rejected": -1.055999755859375, + "logps/chosen": -740.697021484375, + "logps/rejected": -401.1899108886719, + "loss": 0.2754, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4024276733398438, + "rewards/margins": 1.2107495069503784, + "rewards/rejected": 0.1916782408952713, + "step": 466 + }, + { + "epoch": 0.19474562135112594, + "grad_norm": 8.516121864318848, + "learning_rate": 5.408848737549224e-07, + "logits/chosen": -0.7497695088386536, + "logits/rejected": -1.0728498697280884, + "logps/chosen": -938.5393676757812, + "logps/rejected": -411.2930908203125, + "loss": 0.3168, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1587738990783691, + "rewards/margins": 1.0270540714263916, + "rewards/rejected": 0.13171979784965515, + "step": 467 + }, + { + "epoch": 0.195162635529608, + "grad_norm": 7.002023696899414, + "learning_rate": 5.420430854760251e-07, + "logits/chosen": -0.5342711210250854, + "logits/rejected": -1.0714447498321533, + "logps/chosen": -680.7222900390625, + "logps/rejected": -439.2579345703125, + "loss": 0.2793, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4819155931472778, + "rewards/margins": 1.1946254968643188, + "rewards/rejected": 0.28729021549224854, + "step": 468 + }, + { + "epoch": 0.19557964970809008, + "grad_norm": 48.11528778076172, + "learning_rate": 5.432012971971276e-07, + "logits/chosen": -1.252739429473877, + "logits/rejected": -0.824187159538269, + "logps/chosen": -1127.649169921875, + "logps/rejected": -409.3703308105469, + "loss": 0.3192, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6022571325302124, + "rewards/margins": 1.2890838384628296, + "rewards/rejected": 0.3131732940673828, + "step": 469 + }, + { + "epoch": 0.19599666388657214, + "grad_norm": 6.127768516540527, + "learning_rate": 5.443595089182303e-07, + "logits/chosen": -0.7570332884788513, + "logits/rejected": -1.1899428367614746, + "logps/chosen": -663.0950927734375, + "logps/rejected": -385.686767578125, + "loss": 0.2232, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7008370161056519, + "rewards/margins": 1.5086597204208374, + "rewards/rejected": 0.1921772062778473, + "step": 470 + }, + { + "epoch": 0.19641367806505422, + "grad_norm": 7.4737629890441895, + "learning_rate": 5.45517720639333e-07, + "logits/chosen": -1.0208483934402466, + "logits/rejected": -0.9473154544830322, + "logps/chosen": -966.479248046875, + "logps/rejected": -462.16552734375, + "loss": 0.2288, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.853031873703003, + "rewards/margins": 1.5238524675369263, + "rewards/rejected": 0.32917940616607666, + "step": 471 + }, + { + "epoch": 0.19683069224353628, + "grad_norm": 9.42990493774414, + "learning_rate": 5.466759323604355e-07, + "logits/chosen": -0.7548729181289673, + "logits/rejected": -0.6683404445648193, + "logps/chosen": -947.81787109375, + "logps/rejected": -615.0987548828125, + "loss": 0.3549, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5003082752227783, + "rewards/margins": 1.0633137226104736, + "rewards/rejected": 0.4369945824146271, + "step": 472 + }, + { + "epoch": 0.19724770642201836, + "grad_norm": 10.07742691040039, + "learning_rate": 5.478341440815382e-07, + "logits/chosen": -0.5082460045814514, + "logits/rejected": -0.9432156682014465, + "logps/chosen": -850.0257568359375, + "logps/rejected": -579.3400268554688, + "loss": 0.3112, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4641411304473877, + "rewards/margins": 1.1625808477401733, + "rewards/rejected": 0.3015602231025696, + "step": 473 + }, + { + "epoch": 0.19766472060050042, + "grad_norm": 18.051807403564453, + "learning_rate": 5.489923558026408e-07, + "logits/chosen": -0.362895131111145, + "logits/rejected": -0.8096672892570496, + "logps/chosen": -1068.0694580078125, + "logps/rejected": -695.491455078125, + "loss": 0.3259, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3846793174743652, + "rewards/margins": 1.5274813175201416, + "rewards/rejected": 0.8571979999542236, + "step": 474 + }, + { + "epoch": 0.1980817347789825, + "grad_norm": 6.5080084800720215, + "learning_rate": 5.501505675237434e-07, + "logits/chosen": -0.1734878420829773, + "logits/rejected": -1.0314613580703735, + "logps/chosen": -872.0009765625, + "logps/rejected": -455.0384521484375, + "loss": 0.2985, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5060398578643799, + "rewards/margins": 1.1864515542984009, + "rewards/rejected": 0.3195882737636566, + "step": 475 + }, + { + "epoch": 0.19849874895746455, + "grad_norm": 21.213790893554688, + "learning_rate": 5.51308779244846e-07, + "logits/chosen": -1.485741138458252, + "logits/rejected": -0.9049381017684937, + "logps/chosen": -921.39306640625, + "logps/rejected": -411.5072021484375, + "loss": 0.2434, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7291536331176758, + "rewards/margins": 1.49225914478302, + "rewards/rejected": 0.2368946224451065, + "step": 476 + }, + { + "epoch": 0.1989157631359466, + "grad_norm": 8.203289985656738, + "learning_rate": 5.524669909659487e-07, + "logits/chosen": -1.2540874481201172, + "logits/rejected": -0.3685772716999054, + "logps/chosen": -1220.802734375, + "logps/rejected": -591.0242919921875, + "loss": 0.3248, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2334423065185547, + "rewards/margins": 1.1985591650009155, + "rewards/rejected": 0.03488311916589737, + "step": 477 + }, + { + "epoch": 0.1993327773144287, + "grad_norm": 16.04717254638672, + "learning_rate": 5.536252026870512e-07, + "logits/chosen": -0.7452991008758545, + "logits/rejected": NaN, + "logps/chosen": -833.73095703125, + "logps/rejected": -304.424560546875, + "loss": 0.3542, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.5807095766067505, + "rewards/margins": 1.3022902011871338, + "rewards/rejected": 0.2784193158149719, + "step": 478 + }, + { + "epoch": 0.19974979149291075, + "grad_norm": 16.551410675048828, + "learning_rate": 5.547834144081538e-07, + "logits/chosen": -1.3112361431121826, + "logits/rejected": NaN, + "logps/chosen": -880.3460693359375, + "logps/rejected": -381.79010009765625, + "loss": 0.4748, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.2344365119934082, + "rewards/margins": 0.7965487241744995, + "rewards/rejected": 0.4378877580165863, + "step": 479 + }, + { + "epoch": 0.20016680567139283, + "grad_norm": 6.878300189971924, + "learning_rate": 5.559416261292564e-07, + "logits/chosen": -0.5329397320747375, + "logits/rejected": -0.91758793592453, + "logps/chosen": -827.4346313476562, + "logps/rejected": -489.2799377441406, + "loss": 0.2853, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7674815654754639, + "rewards/margins": 1.263622760772705, + "rewards/rejected": 0.5038589835166931, + "step": 480 + }, + { + "epoch": 0.2005838198498749, + "grad_norm": 7.075203895568848, + "learning_rate": 5.570998378503591e-07, + "logits/chosen": -0.8046488165855408, + "logits/rejected": -0.7207810282707214, + "logps/chosen": -1175.148193359375, + "logps/rejected": -738.78515625, + "loss": 0.2405, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9927513599395752, + "rewards/margins": 1.55229914188385, + "rewards/rejected": 0.4404521882534027, + "step": 481 + }, + { + "epoch": 0.20100083402835697, + "grad_norm": 15.969755172729492, + "learning_rate": 5.582580495714617e-07, + "logits/chosen": -1.152105689048767, + "logits/rejected": -0.6562500596046448, + "logps/chosen": -1215.0416259765625, + "logps/rejected": -686.17041015625, + "loss": 0.349, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8231041431427002, + "rewards/margins": 1.3679776191711426, + "rewards/rejected": 0.4551265835762024, + "step": 482 + }, + { + "epoch": 0.20141784820683903, + "grad_norm": 9.812337875366211, + "learning_rate": 5.594162612925643e-07, + "logits/chosen": -1.44777250289917, + "logits/rejected": -0.4746747612953186, + "logps/chosen": -1145.5645751953125, + "logps/rejected": -638.7828979492188, + "loss": 0.3149, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.388654351234436, + "rewards/margins": 1.2236839532852173, + "rewards/rejected": 0.16497039794921875, + "step": 483 + }, + { + "epoch": 0.2018348623853211, + "grad_norm": 14.536733627319336, + "learning_rate": 5.60574473013667e-07, + "logits/chosen": -1.2167203426361084, + "logits/rejected": NaN, + "logps/chosen": -987.7384643554688, + "logps/rejected": -447.95928955078125, + "loss": 0.3451, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6604728698730469, + "rewards/margins": 1.182968258857727, + "rewards/rejected": 0.4775047302246094, + "step": 484 + }, + { + "epoch": 0.20225187656380317, + "grad_norm": 6.497507095336914, + "learning_rate": 5.617326847347696e-07, + "logits/chosen": -0.1814214587211609, + "logits/rejected": -1.2552616596221924, + "logps/chosen": -942.0364990234375, + "logps/rejected": -404.69757080078125, + "loss": 0.2744, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.614800214767456, + "rewards/margins": 1.3573983907699585, + "rewards/rejected": 0.25740185379981995, + "step": 485 + }, + { + "epoch": 0.20266889074228525, + "grad_norm": 5.72982120513916, + "learning_rate": 5.628908964558722e-07, + "logits/chosen": -0.291603147983551, + "logits/rejected": -1.2197387218475342, + "logps/chosen": -748.9938354492188, + "logps/rejected": -405.5691223144531, + "loss": 0.214, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7279002666473389, + "rewards/margins": 1.4862370491027832, + "rewards/rejected": 0.2416633665561676, + "step": 486 + }, + { + "epoch": 0.2030859049207673, + "grad_norm": 6.53090763092041, + "learning_rate": 5.640491081769748e-07, + "logits/chosen": -0.13249686360359192, + "logits/rejected": -1.0239920616149902, + "logps/chosen": -903.7561645507812, + "logps/rejected": -473.299072265625, + "loss": 0.2137, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.137012004852295, + "rewards/margins": 1.8611736297607422, + "rewards/rejected": 0.2758384943008423, + "step": 487 + }, + { + "epoch": 0.20350291909924936, + "grad_norm": 6.9624924659729, + "learning_rate": 5.652073198980774e-07, + "logits/chosen": -0.9678762555122375, + "logits/rejected": NaN, + "logps/chosen": -937.008544921875, + "logps/rejected": -415.16522216796875, + "loss": 0.3651, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3334953784942627, + "rewards/margins": 1.0603268146514893, + "rewards/rejected": 0.27316856384277344, + "step": 488 + }, + { + "epoch": 0.20391993327773145, + "grad_norm": 13.832659721374512, + "learning_rate": 5.6636553161918e-07, + "logits/chosen": -1.1628128290176392, + "logits/rejected": -0.740946888923645, + "logps/chosen": -1164.971923828125, + "logps/rejected": -451.1189880371094, + "loss": 0.1993, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.572029948234558, + "rewards/margins": 1.643568515777588, + "rewards/rejected": -0.071538545191288, + "step": 489 + }, + { + "epoch": 0.2043369474562135, + "grad_norm": 8.58764362335205, + "learning_rate": 5.675237433402826e-07, + "logits/chosen": -0.2367142140865326, + "logits/rejected": -0.8744230270385742, + "logps/chosen": -688.6175537109375, + "logps/rejected": -366.89007568359375, + "loss": 0.3419, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5338054895401, + "rewards/margins": 1.001650094985962, + "rewards/rejected": 0.532155454158783, + "step": 490 + }, + { + "epoch": 0.20475396163469559, + "grad_norm": 4.68087100982666, + "learning_rate": 5.686819550613853e-07, + "logits/chosen": -0.362801730632782, + "logits/rejected": -1.1215585470199585, + "logps/chosen": -907.8131103515625, + "logps/rejected": -451.2996826171875, + "loss": 0.1685, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.316098690032959, + "rewards/margins": 2.113025188446045, + "rewards/rejected": 0.20307369530200958, + "step": 491 + }, + { + "epoch": 0.20517097581317764, + "grad_norm": 5.722754001617432, + "learning_rate": 5.698401667824879e-07, + "logits/chosen": -0.7678266763687134, + "logits/rejected": -0.8559350967407227, + "logps/chosen": -733.9535522460938, + "logps/rejected": -347.02386474609375, + "loss": 0.248, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5575233697891235, + "rewards/margins": 1.4005967378616333, + "rewards/rejected": 0.15692654252052307, + "step": 492 + }, + { + "epoch": 0.20558798999165973, + "grad_norm": 4.694777011871338, + "learning_rate": 5.709983785035905e-07, + "logits/chosen": -0.3085927367210388, + "logits/rejected": -0.9594597220420837, + "logps/chosen": -653.6571044921875, + "logps/rejected": -320.9862060546875, + "loss": 0.2519, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7351586818695068, + "rewards/margins": 1.4682997465133667, + "rewards/rejected": 0.26685887575149536, + "step": 493 + }, + { + "epoch": 0.20600500417014178, + "grad_norm": 8.769272804260254, + "learning_rate": 5.721565902246931e-07, + "logits/chosen": -0.5236677527427673, + "logits/rejected": -0.8422300219535828, + "logps/chosen": -1107.5960693359375, + "logps/rejected": -557.9692993164062, + "loss": 0.288, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3983078002929688, + "rewards/margins": 1.302320957183838, + "rewards/rejected": 0.09598693996667862, + "step": 494 + }, + { + "epoch": 0.20642201834862386, + "grad_norm": 7.95357084274292, + "learning_rate": 5.733148019457958e-07, + "logits/chosen": -1.2512929439544678, + "logits/rejected": -0.7752252817153931, + "logps/chosen": -1170.437255859375, + "logps/rejected": -558.0201416015625, + "loss": 0.399, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1249381303787231, + "rewards/margins": 0.8360016345977783, + "rewards/rejected": 0.2889366149902344, + "step": 495 + }, + { + "epoch": 0.20683903252710592, + "grad_norm": 4.474446773529053, + "learning_rate": 5.744730136668984e-07, + "logits/chosen": -0.5284867882728577, + "logits/rejected": -1.2491902112960815, + "logps/chosen": -728.523681640625, + "logps/rejected": -327.8948059082031, + "loss": 0.1841, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0622661113739014, + "rewards/margins": 1.7453036308288574, + "rewards/rejected": 0.31696248054504395, + "step": 496 + }, + { + "epoch": 0.20725604670558798, + "grad_norm": 6.115292072296143, + "learning_rate": 5.75631225388001e-07, + "logits/chosen": -1.061293601989746, + "logits/rejected": NaN, + "logps/chosen": -677.3186645507812, + "logps/rejected": -311.51031494140625, + "loss": 0.2328, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.050570011138916, + "rewards/margins": 1.8008110523223877, + "rewards/rejected": 0.24975891411304474, + "step": 497 + }, + { + "epoch": 0.20767306088407006, + "grad_norm": 100.41395568847656, + "learning_rate": 5.767894371091036e-07, + "logits/chosen": -0.5508221387863159, + "logits/rejected": -0.6348477005958557, + "logps/chosen": -1261.278564453125, + "logps/rejected": -672.1776123046875, + "loss": 0.1815, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.401383876800537, + "rewards/margins": 2.1011276245117188, + "rewards/rejected": 0.3002563416957855, + "step": 498 + }, + { + "epoch": 0.20809007506255212, + "grad_norm": 273.3428649902344, + "learning_rate": 5.779476488302062e-07, + "logits/chosen": -1.8569422960281372, + "logits/rejected": -0.31225189566612244, + "logps/chosen": -1251.6263427734375, + "logps/rejected": -635.263916015625, + "loss": 0.3213, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4489028453826904, + "rewards/margins": 1.2669843435287476, + "rewards/rejected": 0.18191853165626526, + "step": 499 + }, + { + "epoch": 0.2085070892410342, + "grad_norm": 18.478635787963867, + "learning_rate": 5.791058605513088e-07, + "logits/chosen": -0.6980480551719666, + "logits/rejected": -0.7884376645088196, + "logps/chosen": -888.6747436523438, + "logps/rejected": -496.82470703125, + "loss": 0.2383, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.809554934501648, + "rewards/margins": 1.5770269632339478, + "rewards/rejected": 0.23252812027931213, + "step": 500 + }, + { + "epoch": 0.20892410341951625, + "grad_norm": 7.6266608238220215, + "learning_rate": 5.802640722724114e-07, + "logits/chosen": -0.7157642841339111, + "logits/rejected": -0.78514164686203, + "logps/chosen": -923.09716796875, + "logps/rejected": -602.072509765625, + "loss": 0.2698, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7793320417404175, + "rewards/margins": 1.3394509553909302, + "rewards/rejected": 0.4398811459541321, + "step": 501 + }, + { + "epoch": 0.20934111759799834, + "grad_norm": 16.68840980529785, + "learning_rate": 5.814222839935141e-07, + "logits/chosen": -0.3611679971218109, + "logits/rejected": -0.8872990012168884, + "logps/chosen": -1034.734130859375, + "logps/rejected": -643.9951171875, + "loss": 0.4157, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0523841381072998, + "rewards/margins": 0.797533392906189, + "rewards/rejected": 0.2548507750034332, + "step": 502 + }, + { + "epoch": 0.2097581317764804, + "grad_norm": 6.259205341339111, + "learning_rate": 5.825804957146166e-07, + "logits/chosen": -0.737076461315155, + "logits/rejected": -1.0007250308990479, + "logps/chosen": -864.5632934570312, + "logps/rejected": -463.37359619140625, + "loss": 0.216, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8479232788085938, + "rewards/margins": 1.600756049156189, + "rewards/rejected": 0.2471672147512436, + "step": 503 + }, + { + "epoch": 0.21017514595496248, + "grad_norm": 6.791311264038086, + "learning_rate": 5.837387074357193e-07, + "logits/chosen": -1.5701210498809814, + "logits/rejected": NaN, + "logps/chosen": -909.39111328125, + "logps/rejected": -287.55010986328125, + "loss": 0.2957, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.6464382410049438, + "rewards/margins": 1.35407555103302, + "rewards/rejected": 0.29236268997192383, + "step": 504 + }, + { + "epoch": 0.21059216013344453, + "grad_norm": 6.392669677734375, + "learning_rate": 5.84896919156822e-07, + "logits/chosen": -0.9825227856636047, + "logits/rejected": -0.9550327062606812, + "logps/chosen": -906.317626953125, + "logps/rejected": -533.5596923828125, + "loss": 0.2238, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2561349868774414, + "rewards/margins": 1.7955307960510254, + "rewards/rejected": 0.4606044590473175, + "step": 505 + }, + { + "epoch": 0.21100917431192662, + "grad_norm": 5.856666564941406, + "learning_rate": 5.860551308779245e-07, + "logits/chosen": -0.6865166425704956, + "logits/rejected": -0.9882336258888245, + "logps/chosen": -909.060791015625, + "logps/rejected": -465.1253662109375, + "loss": 0.2018, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.906712293624878, + "rewards/margins": 1.568305253982544, + "rewards/rejected": 0.33840712904930115, + "step": 506 + }, + { + "epoch": 0.21142618849040867, + "grad_norm": 5.804927349090576, + "learning_rate": 5.872133425990272e-07, + "logits/chosen": -1.2417722940444946, + "logits/rejected": -1.070058822631836, + "logps/chosen": -774.563720703125, + "logps/rejected": -459.14208984375, + "loss": 0.296, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.671726942062378, + "rewards/margins": 1.4490001201629639, + "rewards/rejected": 0.22272682189941406, + "step": 507 + }, + { + "epoch": 0.21184320266889073, + "grad_norm": 5.183831691741943, + "learning_rate": 5.883715543201297e-07, + "logits/chosen": -0.6837495565414429, + "logits/rejected": -0.8375566005706787, + "logps/chosen": -864.2357177734375, + "logps/rejected": -483.258544921875, + "loss": 0.2305, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7428345680236816, + "rewards/margins": 1.6319987773895264, + "rewards/rejected": 0.11083564907312393, + "step": 508 + }, + { + "epoch": 0.2122602168473728, + "grad_norm": 5.249928951263428, + "learning_rate": 5.895297660412324e-07, + "logits/chosen": -0.2561919689178467, + "logits/rejected": -0.9568966627120972, + "logps/chosen": -684.4576416015625, + "logps/rejected": -440.3200988769531, + "loss": 0.2413, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6901775598526, + "rewards/margins": 1.3702237606048584, + "rewards/rejected": 0.31995370984077454, + "step": 509 + }, + { + "epoch": 0.21267723102585487, + "grad_norm": 151.5081024169922, + "learning_rate": 5.90687977762335e-07, + "logits/chosen": -0.8613792061805725, + "logits/rejected": -0.6085315942764282, + "logps/chosen": -922.3336181640625, + "logps/rejected": -541.621337890625, + "loss": 0.4749, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.101792812347412, + "rewards/margins": 0.7005401849746704, + "rewards/rejected": 0.40125274658203125, + "step": 510 + }, + { + "epoch": 0.21309424520433695, + "grad_norm": 5.42474889755249, + "learning_rate": 5.918461894834376e-07, + "logits/chosen": -0.7772935628890991, + "logits/rejected": -0.8633422255516052, + "logps/chosen": -1018.8075561523438, + "logps/rejected": -537.109375, + "loss": 0.2273, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9571921825408936, + "rewards/margins": 1.6336033344268799, + "rewards/rejected": 0.3235887587070465, + "step": 511 + }, + { + "epoch": 0.213511259382819, + "grad_norm": 8.998868942260742, + "learning_rate": 5.930044012045402e-07, + "logits/chosen": -0.7121532559394836, + "logits/rejected": -0.8229101300239563, + "logps/chosen": -933.3834228515625, + "logps/rejected": -508.56744384765625, + "loss": 0.2307, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.743220567703247, + "rewards/margins": 1.6557296514511108, + "rewards/rejected": 0.08749084919691086, + "step": 512 + }, + { + "epoch": 0.2139282735613011, + "grad_norm": 5.1136040687561035, + "learning_rate": 5.941626129256429e-07, + "logits/chosen": -1.4199309349060059, + "logits/rejected": -0.9565040469169617, + "logps/chosen": -792.0682373046875, + "logps/rejected": -318.81341552734375, + "loss": 0.2711, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6853744983673096, + "rewards/margins": 1.3745521306991577, + "rewards/rejected": 0.3108223080635071, + "step": 513 + }, + { + "epoch": 0.21434528773978315, + "grad_norm": 5.376322269439697, + "learning_rate": 5.953208246467454e-07, + "logits/chosen": -0.5492020845413208, + "logits/rejected": -1.2454843521118164, + "logps/chosen": -847.89501953125, + "logps/rejected": -471.87652587890625, + "loss": 0.1931, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.405242443084717, + "rewards/margins": 1.936409831047058, + "rewards/rejected": 0.468832790851593, + "step": 514 + }, + { + "epoch": 0.21476230191826523, + "grad_norm": 8.027323722839355, + "learning_rate": 5.964790363678481e-07, + "logits/chosen": -0.6134148836135864, + "logits/rejected": -0.8734131455421448, + "logps/chosen": -943.81396484375, + "logps/rejected": -571.0420532226562, + "loss": 0.2866, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.874568223953247, + "rewards/margins": 1.369287133216858, + "rewards/rejected": 0.5052810907363892, + "step": 515 + }, + { + "epoch": 0.2151793160967473, + "grad_norm": 5.612671375274658, + "learning_rate": 5.976372480889508e-07, + "logits/chosen": -0.690594494342804, + "logits/rejected": -1.0125646591186523, + "logps/chosen": -1200.40869140625, + "logps/rejected": -664.167724609375, + "loss": 0.1522, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4856834411621094, + "rewards/margins": 2.0992865562438965, + "rewards/rejected": 0.38639697432518005, + "step": 516 + }, + { + "epoch": 0.21559633027522937, + "grad_norm": 7.169922351837158, + "learning_rate": 5.987954598100533e-07, + "logits/chosen": -0.5840066075325012, + "logits/rejected": -1.1339763402938843, + "logps/chosen": -918.4588012695312, + "logps/rejected": -493.04638671875, + "loss": 0.2185, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.154876708984375, + "rewards/margins": 1.5623843669891357, + "rewards/rejected": 0.5924922823905945, + "step": 517 + }, + { + "epoch": 0.21601334445371143, + "grad_norm": 5.690084457397461, + "learning_rate": 5.99953671531156e-07, + "logits/chosen": -0.6558934450149536, + "logits/rejected": -0.8117353916168213, + "logps/chosen": -867.5065307617188, + "logps/rejected": -463.116455078125, + "loss": 0.2992, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5261409282684326, + "rewards/margins": 1.2586250305175781, + "rewards/rejected": 0.26751595735549927, + "step": 518 + }, + { + "epoch": 0.21643035863219348, + "grad_norm": 6.783833026885986, + "learning_rate": 6.011118832522586e-07, + "logits/chosen": -0.12468953430652618, + "logits/rejected": -1.00544273853302, + "logps/chosen": -1205.9136962890625, + "logps/rejected": -538.1372680664062, + "loss": 0.2199, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8478951454162598, + "rewards/margins": 1.5803569555282593, + "rewards/rejected": 0.2675381004810333, + "step": 519 + }, + { + "epoch": 0.21684737281067556, + "grad_norm": 7.517802715301514, + "learning_rate": 6.022700949733612e-07, + "logits/chosen": -0.004794944077730179, + "logits/rejected": -0.8362889289855957, + "logps/chosen": -824.8170166015625, + "logps/rejected": -499.6806640625, + "loss": 0.2583, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9108623266220093, + "rewards/margins": 1.3641623258590698, + "rewards/rejected": 0.546700119972229, + "step": 520 + }, + { + "epoch": 0.21726438698915762, + "grad_norm": 36.368282318115234, + "learning_rate": 6.034283066944638e-07, + "logits/chosen": -0.7451829314231873, + "logits/rejected": -0.7599416375160217, + "logps/chosen": -1126.955810546875, + "logps/rejected": -645.94384765625, + "loss": 0.2624, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9780609607696533, + "rewards/margins": 1.5265768766403198, + "rewards/rejected": 0.45148396492004395, + "step": 521 + }, + { + "epoch": 0.2176814011676397, + "grad_norm": 9.714088439941406, + "learning_rate": 6.045865184155664e-07, + "logits/chosen": -0.9430064558982849, + "logits/rejected": -0.7290706634521484, + "logps/chosen": -903.8606567382812, + "logps/rejected": -527.9398803710938, + "loss": 0.3579, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3546218872070312, + "rewards/margins": 1.1148875951766968, + "rewards/rejected": 0.23973429203033447, + "step": 522 + }, + { + "epoch": 0.21809841534612176, + "grad_norm": 6.213085174560547, + "learning_rate": 6.05744730136669e-07, + "logits/chosen": -0.20643621683120728, + "logits/rejected": -0.8436014652252197, + "logps/chosen": -1079.9449462890625, + "logps/rejected": -589.644775390625, + "loss": 0.2144, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9100754261016846, + "rewards/margins": 1.551041841506958, + "rewards/rejected": 0.35903361439704895, + "step": 523 + }, + { + "epoch": 0.21851542952460384, + "grad_norm": 15.497182846069336, + "learning_rate": 6.069029418577717e-07, + "logits/chosen": -1.8852897882461548, + "logits/rejected": -0.2447107583284378, + "logps/chosen": -1225.9576416015625, + "logps/rejected": -388.8406982421875, + "loss": 0.2582, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.7164822816848755, + "rewards/margins": 1.5655754804611206, + "rewards/rejected": 0.15090680122375488, + "step": 524 + }, + { + "epoch": 0.2189324437030859, + "grad_norm": 7.319351673126221, + "learning_rate": 6.080611535788742e-07, + "logits/chosen": -0.9240214824676514, + "logits/rejected": NaN, + "logps/chosen": -791.593017578125, + "logps/rejected": -306.0935363769531, + "loss": 0.333, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5699721574783325, + "rewards/margins": 1.2706866264343262, + "rewards/rejected": 0.29928553104400635, + "step": 525 + }, + { + "epoch": 0.21934945788156798, + "grad_norm": 12.686361312866211, + "learning_rate": 6.092193652999769e-07, + "logits/chosen": -0.9901975989341736, + "logits/rejected": -0.8308324813842773, + "logps/chosen": -968.6548461914062, + "logps/rejected": -620.3638916015625, + "loss": 0.315, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.521081566810608, + "rewards/margins": 1.1527560949325562, + "rewards/rejected": 0.368325412273407, + "step": 526 + }, + { + "epoch": 0.21976647206005004, + "grad_norm": 8.114706993103027, + "learning_rate": 6.103775770210795e-07, + "logits/chosen": -0.6517431139945984, + "logits/rejected": NaN, + "logps/chosen": -845.6978759765625, + "logps/rejected": -462.00775146484375, + "loss": 0.2898, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.7809189558029175, + "rewards/margins": 1.3891469240188599, + "rewards/rejected": 0.3917720913887024, + "step": 527 + }, + { + "epoch": 0.22018348623853212, + "grad_norm": 13.389248847961426, + "learning_rate": 6.115357887421821e-07, + "logits/chosen": -0.02237020432949066, + "logits/rejected": -0.9752336144447327, + "logps/chosen": -776.427490234375, + "logps/rejected": -461.52703857421875, + "loss": 0.2836, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.688924789428711, + "rewards/margins": 1.4550937414169312, + "rewards/rejected": 0.2338310182094574, + "step": 528 + }, + { + "epoch": 0.22060050041701418, + "grad_norm": 8.206870079040527, + "learning_rate": 6.126940004632848e-07, + "logits/chosen": -0.6484327912330627, + "logits/rejected": -0.5530352592468262, + "logps/chosen": -1080.053466796875, + "logps/rejected": -629.78662109375, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2474822998046875, + "rewards/margins": 1.6984965801239014, + "rewards/rejected": 0.5489857196807861, + "step": 529 + }, + { + "epoch": 0.22101751459549623, + "grad_norm": 7.997886657714844, + "learning_rate": 6.138522121843874e-07, + "logits/chosen": -1.111435055732727, + "logits/rejected": -0.8730185627937317, + "logps/chosen": -1000.6494140625, + "logps/rejected": -500.203369140625, + "loss": 0.2585, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.514086127281189, + "rewards/margins": 1.3124998807907104, + "rewards/rejected": 0.20158614218235016, + "step": 530 + }, + { + "epoch": 0.22143452877397832, + "grad_norm": 6.156350612640381, + "learning_rate": 6.1501042390549e-07, + "logits/chosen": -0.10989673435688019, + "logits/rejected": -1.1973705291748047, + "logps/chosen": -846.5324096679688, + "logps/rejected": -400.83319091796875, + "loss": 0.2537, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4737389087677002, + "rewards/margins": 1.421010971069336, + "rewards/rejected": 0.052727892994880676, + "step": 531 + }, + { + "epoch": 0.22185154295246037, + "grad_norm": 3.826831579208374, + "learning_rate": 6.161686356265926e-07, + "logits/chosen": -0.7528170943260193, + "logits/rejected": -0.9332923293113708, + "logps/chosen": -787.8946533203125, + "logps/rejected": -424.85302734375, + "loss": 0.1551, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.347458839416504, + "rewards/margins": 2.193713903427124, + "rewards/rejected": 0.15374508500099182, + "step": 532 + }, + { + "epoch": 0.22226855713094246, + "grad_norm": 7.43458890914917, + "learning_rate": 6.173268473476953e-07, + "logits/chosen": -1.6082158088684082, + "logits/rejected": -0.6241663694381714, + "logps/chosen": -1114.74755859375, + "logps/rejected": -469.1249694824219, + "loss": 0.3059, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6276322603225708, + "rewards/margins": 1.3692837953567505, + "rewards/rejected": 0.2583482563495636, + "step": 533 + }, + { + "epoch": 0.2226855713094245, + "grad_norm": 4.882180690765381, + "learning_rate": 6.184850590687978e-07, + "logits/chosen": -1.4704489707946777, + "logits/rejected": -0.7379323244094849, + "logps/chosen": -1021.5924072265625, + "logps/rejected": -467.3543701171875, + "loss": 0.1694, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.431095838546753, + "rewards/margins": 2.0146405696868896, + "rewards/rejected": 0.4164552688598633, + "step": 534 + }, + { + "epoch": 0.2231025854879066, + "grad_norm": 6.333116054534912, + "learning_rate": 6.196432707899004e-07, + "logits/chosen": -1.1734321117401123, + "logits/rejected": -0.38539183139801025, + "logps/chosen": -1247.86083984375, + "logps/rejected": -793.6072387695312, + "loss": 0.272, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.7628082036972046, + "rewards/margins": 1.5654258728027344, + "rewards/rejected": 0.1973823755979538, + "step": 535 + }, + { + "epoch": 0.22351959966638865, + "grad_norm": 4.053037643432617, + "learning_rate": 6.20801482511003e-07, + "logits/chosen": -0.4509516954421997, + "logits/rejected": -1.1477140188217163, + "logps/chosen": -805.53515625, + "logps/rejected": -434.8746032714844, + "loss": 0.1607, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8417420387268066, + "rewards/margins": 1.887879729270935, + "rewards/rejected": -0.046137623488903046, + "step": 536 + }, + { + "epoch": 0.22393661384487074, + "grad_norm": 4.824332237243652, + "learning_rate": 6.219596942321057e-07, + "logits/chosen": -0.937088668346405, + "logits/rejected": NaN, + "logps/chosen": -886.4483032226562, + "logps/rejected": -383.72979736328125, + "loss": 0.2272, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1547508239746094, + "rewards/margins": 1.8566722869873047, + "rewards/rejected": 0.2980785369873047, + "step": 537 + }, + { + "epoch": 0.2243536280233528, + "grad_norm": 3.484233856201172, + "learning_rate": 6.231179059532083e-07, + "logits/chosen": -0.6393739581108093, + "logits/rejected": -1.1080241203308105, + "logps/chosen": -1017.6734619140625, + "logps/rejected": -492.06951904296875, + "loss": 0.0976, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8306145668029785, + "rewards/margins": 2.500723361968994, + "rewards/rejected": 0.3298912048339844, + "step": 538 + }, + { + "epoch": 0.22477064220183487, + "grad_norm": 14.17490291595459, + "learning_rate": 6.242761176743109e-07, + "logits/chosen": -0.27556300163269043, + "logits/rejected": -0.826904296875, + "logps/chosen": -1010.3253173828125, + "logps/rejected": -584.5723876953125, + "loss": 0.4085, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4477221965789795, + "rewards/margins": 1.0092155933380127, + "rewards/rejected": 0.43850669264793396, + "step": 539 + }, + { + "epoch": 0.22518765638031693, + "grad_norm": 8.469464302062988, + "learning_rate": 6.254343293954135e-07, + "logits/chosen": -1.4620729684829712, + "logits/rejected": -0.5602311491966248, + "logps/chosen": -1136.534423828125, + "logps/rejected": -593.3687744140625, + "loss": 0.3418, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6194225549697876, + "rewards/margins": 1.1854240894317627, + "rewards/rejected": 0.4339984953403473, + "step": 540 + }, + { + "epoch": 0.225604670558799, + "grad_norm": 6.166780948638916, + "learning_rate": 6.265925411165162e-07, + "logits/chosen": -1.3106743097305298, + "logits/rejected": -0.45721229910850525, + "logps/chosen": -1165.12158203125, + "logps/rejected": -581.984375, + "loss": 0.2899, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.588327169418335, + "rewards/margins": 1.5256932973861694, + "rewards/rejected": 0.0626337006688118, + "step": 541 + }, + { + "epoch": 0.22602168473728107, + "grad_norm": 16.985233306884766, + "learning_rate": 6.277507528376188e-07, + "logits/chosen": -0.8296419382095337, + "logits/rejected": NaN, + "logps/chosen": -1014.876953125, + "logps/rejected": -481.746826171875, + "loss": 0.4431, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.809546709060669, + "rewards/margins": 1.0552425384521484, + "rewards/rejected": 0.7543041706085205, + "step": 542 + }, + { + "epoch": 0.22643869891576313, + "grad_norm": 4.860982418060303, + "learning_rate": 6.289089645587214e-07, + "logits/chosen": -0.8058522939682007, + "logits/rejected": -0.9101077318191528, + "logps/chosen": -838.5142822265625, + "logps/rejected": -468.4051513671875, + "loss": 0.2175, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5534863471984863, + "rewards/margins": 1.5609939098358154, + "rewards/rejected": -0.007507712580263615, + "step": 543 + }, + { + "epoch": 0.2268557130942452, + "grad_norm": 8.319520950317383, + "learning_rate": 6.30067176279824e-07, + "logits/chosen": -1.464403748512268, + "logits/rejected": -0.29793307185173035, + "logps/chosen": -1630.830810546875, + "logps/rejected": -808.50341796875, + "loss": 0.138, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.494734287261963, + "rewards/margins": 1.9710805416107178, + "rewards/rejected": 0.5236536264419556, + "step": 544 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 8.182605743408203, + "learning_rate": 6.312253880009265e-07, + "logits/chosen": -1.107459545135498, + "logits/rejected": NaN, + "logps/chosen": -913.2267456054688, + "logps/rejected": -465.3213806152344, + "loss": 0.2497, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1516318321228027, + "rewards/margins": 2.0059220790863037, + "rewards/rejected": 0.1457097977399826, + "step": 545 + }, + { + "epoch": 0.22768974145120935, + "grad_norm": 35.565818786621094, + "learning_rate": 6.323835997220293e-07, + "logits/chosen": -0.9825840592384338, + "logits/rejected": -0.7612924575805664, + "logps/chosen": -936.2130737304688, + "logps/rejected": -598.3317260742188, + "loss": 0.1931, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.433122158050537, + "rewards/margins": 1.9068083763122559, + "rewards/rejected": 0.5263140201568604, + "step": 546 + }, + { + "epoch": 0.2281067556296914, + "grad_norm": 15.635652542114258, + "learning_rate": 6.335418114431318e-07, + "logits/chosen": -1.4004907608032227, + "logits/rejected": -0.8593541979789734, + "logps/chosen": -1109.7972412109375, + "logps/rejected": -656.0653076171875, + "loss": 0.3247, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8303390741348267, + "rewards/margins": 1.2601120471954346, + "rewards/rejected": 0.5702270269393921, + "step": 547 + }, + { + "epoch": 0.2285237698081735, + "grad_norm": 2.4502182006835938, + "learning_rate": 6.347000231642344e-07, + "logits/chosen": -0.8341239094734192, + "logits/rejected": -0.7065054178237915, + "logps/chosen": -1004.200439453125, + "logps/rejected": -546.7350463867188, + "loss": 0.1539, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8285040855407715, + "rewards/margins": 2.6060028076171875, + "rewards/rejected": 0.22250138223171234, + "step": 548 + }, + { + "epoch": 0.22894078398665554, + "grad_norm": 2.874181032180786, + "learning_rate": 6.358582348853371e-07, + "logits/chosen": -0.714973509311676, + "logits/rejected": -0.9465231895446777, + "logps/chosen": -824.1865234375, + "logps/rejected": -411.32366943359375, + "loss": 0.0862, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0970630645751953, + "rewards/margins": 2.743298053741455, + "rewards/rejected": 0.3537651002407074, + "step": 549 + }, + { + "epoch": 0.22935779816513763, + "grad_norm": 5.696512699127197, + "learning_rate": 6.370164466064397e-07, + "logits/chosen": -0.4001590609550476, + "logits/rejected": -1.238459587097168, + "logps/chosen": -700.4256591796875, + "logps/rejected": -338.9095764160156, + "loss": 0.1729, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1066319942474365, + "rewards/margins": 1.902980923652649, + "rewards/rejected": 0.2036510407924652, + "step": 550 + }, + { + "epoch": 0.22977481234361968, + "grad_norm": 5.05336856842041, + "learning_rate": 6.381746583275423e-07, + "logits/chosen": -0.9003589749336243, + "logits/rejected": -0.9663272500038147, + "logps/chosen": -877.3930053710938, + "logps/rejected": -548.6300659179688, + "loss": 0.2077, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0915367603302, + "rewards/margins": 1.7862409353256226, + "rewards/rejected": 0.30529576539993286, + "step": 551 + }, + { + "epoch": 0.23019182652210174, + "grad_norm": 3.682574510574341, + "learning_rate": 6.39332870048645e-07, + "logits/chosen": -0.7591300010681152, + "logits/rejected": -1.1398742198944092, + "logps/chosen": -944.6478881835938, + "logps/rejected": -393.1900329589844, + "loss": 0.1097, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3776848316192627, + "rewards/margins": 2.4553186893463135, + "rewards/rejected": -0.07763385027647018, + "step": 552 + }, + { + "epoch": 0.23060884070058382, + "grad_norm": 4.685425758361816, + "learning_rate": 6.404910817697475e-07, + "logits/chosen": -0.22726334631443024, + "logits/rejected": -0.8687431216239929, + "logps/chosen": -1034.83056640625, + "logps/rejected": -611.6302490234375, + "loss": 0.1732, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6633706092834473, + "rewards/margins": 2.220412254333496, + "rewards/rejected": 0.4429580569267273, + "step": 553 + }, + { + "epoch": 0.23102585487906588, + "grad_norm": 3.6735737323760986, + "learning_rate": 6.416492934908501e-07, + "logits/chosen": -0.7605972290039062, + "logits/rejected": -1.0082345008850098, + "logps/chosen": -968.3330078125, + "logps/rejected": -517.7858276367188, + "loss": 0.1294, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.599534511566162, + "rewards/margins": 2.135115146636963, + "rewards/rejected": 0.4644193649291992, + "step": 554 + }, + { + "epoch": 0.23144286905754796, + "grad_norm": 5.448785781860352, + "learning_rate": 6.428075052119529e-07, + "logits/chosen": -0.3108978271484375, + "logits/rejected": -1.0755946636199951, + "logps/chosen": -947.7612915039062, + "logps/rejected": -474.9495849609375, + "loss": 0.1824, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4578468799591064, + "rewards/margins": 1.9073097705841064, + "rewards/rejected": 0.550537109375, + "step": 555 + }, + { + "epoch": 0.23185988323603002, + "grad_norm": 4.275854110717773, + "learning_rate": 6.439657169330554e-07, + "logits/chosen": -1.0676941871643066, + "logits/rejected": NaN, + "logps/chosen": -986.945556640625, + "logps/rejected": -370.50311279296875, + "loss": 0.2626, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5649750232696533, + "rewards/margins": 1.5259392261505127, + "rewards/rejected": 0.039035797119140625, + "step": 556 + }, + { + "epoch": 0.2322768974145121, + "grad_norm": 3.9007818698883057, + "learning_rate": 6.45123928654158e-07, + "logits/chosen": -0.32595136761665344, + "logits/rejected": -1.1036198139190674, + "logps/chosen": -814.4769287109375, + "logps/rejected": -416.2591552734375, + "loss": 0.1493, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1116394996643066, + "rewards/margins": 1.9090064764022827, + "rewards/rejected": 0.20263291895389557, + "step": 557 + }, + { + "epoch": 0.23269391159299416, + "grad_norm": 20.266870498657227, + "learning_rate": 6.462821403752606e-07, + "logits/chosen": -1.2633594274520874, + "logits/rejected": -0.5442631840705872, + "logps/chosen": -1420.0001220703125, + "logps/rejected": -719.646484375, + "loss": 0.1054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8166399002075195, + "rewards/margins": 2.6058552265167236, + "rewards/rejected": 0.21078453958034515, + "step": 558 + }, + { + "epoch": 0.23311092577147624, + "grad_norm": 21.510496139526367, + "learning_rate": 6.474403520963633e-07, + "logits/chosen": -0.9272701740264893, + "logits/rejected": -0.8081982135772705, + "logps/chosen": -1212.8956298828125, + "logps/rejected": -497.6791687011719, + "loss": 0.2023, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2435615062713623, + "rewards/margins": 2.0466198921203613, + "rewards/rejected": 0.19694176316261292, + "step": 559 + }, + { + "epoch": 0.2335279399499583, + "grad_norm": 3.7315402030944824, + "learning_rate": 6.485985638174659e-07, + "logits/chosen": -0.17365290224552155, + "logits/rejected": -1.1195919513702393, + "logps/chosen": -952.5748901367188, + "logps/rejected": -438.87939453125, + "loss": 0.1546, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4960663318634033, + "rewards/margins": 2.2565903663635254, + "rewards/rejected": 0.2394760251045227, + "step": 560 + }, + { + "epoch": 0.23394495412844038, + "grad_norm": 5.231003761291504, + "learning_rate": 6.497567755385686e-07, + "logits/chosen": -0.2824426293373108, + "logits/rejected": -0.8957808017730713, + "logps/chosen": -787.4111938476562, + "logps/rejected": -479.49176025390625, + "loss": 0.2022, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1331186294555664, + "rewards/margins": 1.7233567237854004, + "rewards/rejected": 0.40976178646087646, + "step": 561 + }, + { + "epoch": 0.23436196830692244, + "grad_norm": 3.689697504043579, + "learning_rate": 6.509149872596711e-07, + "logits/chosen": -0.5894142985343933, + "logits/rejected": -0.9908820390701294, + "logps/chosen": -928.5245361328125, + "logps/rejected": -567.1362915039062, + "loss": 0.0899, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8426597118377686, + "rewards/margins": 2.5543336868286133, + "rewards/rejected": 0.2883262634277344, + "step": 562 + }, + { + "epoch": 0.2347789824854045, + "grad_norm": 4.919488430023193, + "learning_rate": 6.520731989807737e-07, + "logits/chosen": -0.14326128363609314, + "logits/rejected": -0.8198326826095581, + "logps/chosen": -787.2733154296875, + "logps/rejected": -507.84942626953125, + "loss": 0.1739, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0869407653808594, + "rewards/margins": 1.905129313468933, + "rewards/rejected": 0.18181152641773224, + "step": 563 + }, + { + "epoch": 0.23519599666388658, + "grad_norm": 4.2204718589782715, + "learning_rate": 6.532314107018764e-07, + "logits/chosen": -0.2969723045825958, + "logits/rejected": -0.8992603421211243, + "logps/chosen": -900.7478637695312, + "logps/rejected": -503.32794189453125, + "loss": 0.1581, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.330336809158325, + "rewards/margins": 2.2471423149108887, + "rewards/rejected": 0.08319436013698578, + "step": 564 + }, + { + "epoch": 0.23561301084236863, + "grad_norm": 4.191512107849121, + "learning_rate": 6.54389622422979e-07, + "logits/chosen": -0.43043604493141174, + "logits/rejected": -1.0587236881256104, + "logps/chosen": -652.44287109375, + "logps/rejected": -406.157958984375, + "loss": 0.1907, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7252751588821411, + "rewards/margins": 1.6250417232513428, + "rewards/rejected": 0.10023346543312073, + "step": 565 + }, + { + "epoch": 0.23603002502085071, + "grad_norm": 3.9499142169952393, + "learning_rate": 6.555478341440816e-07, + "logits/chosen": -0.8927463293075562, + "logits/rejected": -0.7258630990982056, + "logps/chosen": -1205.78271484375, + "logps/rejected": -653.2742309570312, + "loss": 0.1132, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9425876140594482, + "rewards/margins": 2.7257673740386963, + "rewards/rejected": 0.21682052314281464, + "step": 566 + }, + { + "epoch": 0.23644703919933277, + "grad_norm": 12.783159255981445, + "learning_rate": 6.567060458651841e-07, + "logits/chosen": -1.2977509498596191, + "logits/rejected": -0.7358763813972473, + "logps/chosen": -1031.70068359375, + "logps/rejected": -591.406494140625, + "loss": 0.2798, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.027395009994507, + "rewards/margins": 1.656252384185791, + "rewards/rejected": 0.37114259600639343, + "step": 567 + }, + { + "epoch": 0.23686405337781485, + "grad_norm": 3.4443845748901367, + "learning_rate": 6.578642575862869e-07, + "logits/chosen": -0.6637470722198486, + "logits/rejected": -0.9563277959823608, + "logps/chosen": -994.77294921875, + "logps/rejected": -453.1778869628906, + "loss": 0.1561, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3759243488311768, + "rewards/margins": 2.379049301147461, + "rewards/rejected": -0.0031250063329935074, + "step": 568 + }, + { + "epoch": 0.2372810675562969, + "grad_norm": 3.380708932876587, + "learning_rate": 6.590224693073895e-07, + "logits/chosen": -0.6783396601676941, + "logits/rejected": -0.8549071550369263, + "logps/chosen": -991.6975708007812, + "logps/rejected": -514.34130859375, + "loss": 0.1258, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5617051124572754, + "rewards/margins": 2.494755983352661, + "rewards/rejected": 0.06694908440113068, + "step": 569 + }, + { + "epoch": 0.237698081734779, + "grad_norm": 4.558756351470947, + "learning_rate": 6.60180681028492e-07, + "logits/chosen": 0.1792798638343811, + "logits/rejected": -0.9398411512374878, + "logps/chosen": -1109.8642578125, + "logps/rejected": -520.9002685546875, + "loss": 0.1939, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.435124158859253, + "rewards/margins": 2.5224523544311523, + "rewards/rejected": -0.08732815831899643, + "step": 570 + }, + { + "epoch": 0.23811509591326105, + "grad_norm": 4.44219446182251, + "learning_rate": 6.613388927495947e-07, + "logits/chosen": -0.35958126187324524, + "logits/rejected": -0.9748481512069702, + "logps/chosen": -1090.518310546875, + "logps/rejected": -525.9591674804688, + "loss": 0.1387, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2399964332580566, + "rewards/margins": 2.3896257877349854, + "rewards/rejected": -0.1496295928955078, + "step": 571 + }, + { + "epoch": 0.23853211009174313, + "grad_norm": 4.06010627746582, + "learning_rate": 6.624971044706973e-07, + "logits/chosen": -0.3667868673801422, + "logits/rejected": -0.9023036956787109, + "logps/chosen": -801.9957275390625, + "logps/rejected": -485.2452697753906, + "loss": 0.1194, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9251742362976074, + "rewards/margins": 2.2926416397094727, + "rewards/rejected": 0.63253253698349, + "step": 572 + }, + { + "epoch": 0.2389491242702252, + "grad_norm": 8.455350875854492, + "learning_rate": 6.636553161917999e-07, + "logits/chosen": -0.8677927255630493, + "logits/rejected": -0.677211582660675, + "logps/chosen": -973.7728881835938, + "logps/rejected": -477.1925354003906, + "loss": 0.2097, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2893645763397217, + "rewards/margins": 1.8408095836639404, + "rewards/rejected": 0.44855496287345886, + "step": 573 + }, + { + "epoch": 0.23936613844870724, + "grad_norm": 4.279191017150879, + "learning_rate": 6.648135279129026e-07, + "logits/chosen": -1.8340595960617065, + "logits/rejected": NaN, + "logps/chosen": -1046.970458984375, + "logps/rejected": -495.7041931152344, + "loss": 0.2548, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.164609432220459, + "rewards/margins": 1.8078408241271973, + "rewards/rejected": 0.35676878690719604, + "step": 574 + }, + { + "epoch": 0.23978315262718933, + "grad_norm": 2.7377030849456787, + "learning_rate": 6.659717396340051e-07, + "logits/chosen": -0.9447914361953735, + "logits/rejected": NaN, + "logps/chosen": -904.7869873046875, + "logps/rejected": -332.7646179199219, + "loss": 0.1752, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1197502613067627, + "rewards/margins": 2.1207997798919678, + "rewards/rejected": -0.0010496079921722412, + "step": 575 + }, + { + "epoch": 0.24020016680567138, + "grad_norm": 3.692415714263916, + "learning_rate": 6.671299513551077e-07, + "logits/chosen": -0.7124835252761841, + "logits/rejected": -0.9142696261405945, + "logps/chosen": -1036.1793212890625, + "logps/rejected": -638.115234375, + "loss": 0.117, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7023730278015137, + "rewards/margins": 2.347144365310669, + "rewards/rejected": 0.3552290201187134, + "step": 576 + }, + { + "epoch": 0.24061718098415347, + "grad_norm": 64.37772369384766, + "learning_rate": 6.682881630762105e-07, + "logits/chosen": -0.47198644280433655, + "logits/rejected": -0.9584928750991821, + "logps/chosen": -888.731201171875, + "logps/rejected": -595.3089599609375, + "loss": 0.2046, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.077327013015747, + "rewards/margins": 1.9851611852645874, + "rewards/rejected": 0.09216576814651489, + "step": 577 + }, + { + "epoch": 0.24103419516263552, + "grad_norm": 11.123977661132812, + "learning_rate": 6.69446374797313e-07, + "logits/chosen": -1.1312851905822754, + "logits/rejected": -0.494720458984375, + "logps/chosen": -979.8524169921875, + "logps/rejected": -507.8468933105469, + "loss": 0.2063, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.127586841583252, + "rewards/margins": 1.8846343755722046, + "rewards/rejected": 0.2429523468017578, + "step": 578 + }, + { + "epoch": 0.2414512093411176, + "grad_norm": 9.920076370239258, + "learning_rate": 6.706045865184156e-07, + "logits/chosen": -1.2353777885437012, + "logits/rejected": -0.49150022864341736, + "logps/chosen": -1299.062255859375, + "logps/rejected": -510.3428649902344, + "loss": 0.1325, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5173263549804688, + "rewards/margins": 2.690919876098633, + "rewards/rejected": -0.17359334230422974, + "step": 579 + }, + { + "epoch": 0.24186822351959966, + "grad_norm": 4.628581523895264, + "learning_rate": 6.717627982395182e-07, + "logits/chosen": -0.9294219613075256, + "logits/rejected": -0.7702094316482544, + "logps/chosen": -860.9224853515625, + "logps/rejected": -400.0919189453125, + "loss": 0.2139, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0851547718048096, + "rewards/margins": 1.792258620262146, + "rewards/rejected": 0.2928960621356964, + "step": 580 + }, + { + "epoch": 0.24228523769808175, + "grad_norm": 2.8453824520111084, + "learning_rate": 6.729210099606208e-07, + "logits/chosen": -0.3334875702857971, + "logits/rejected": -0.9873142838478088, + "logps/chosen": -901.4068603515625, + "logps/rejected": -513.826416015625, + "loss": 0.1101, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5971298217773438, + "rewards/margins": 2.53615140914917, + "rewards/rejected": 0.06097850203514099, + "step": 581 + }, + { + "epoch": 0.2427022518765638, + "grad_norm": 102.48681640625, + "learning_rate": 6.740792216817235e-07, + "logits/chosen": -0.9005489945411682, + "logits/rejected": -0.8098220825195312, + "logps/chosen": -858.6455688476562, + "logps/rejected": -485.65460205078125, + "loss": 0.1423, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5132217407226562, + "rewards/margins": 2.5258235931396484, + "rewards/rejected": -0.012601859867572784, + "step": 582 + }, + { + "epoch": 0.24311926605504589, + "grad_norm": 3.239579677581787, + "learning_rate": 6.752374334028261e-07, + "logits/chosen": -0.19207116961479187, + "logits/rejected": -1.0377763509750366, + "logps/chosen": -818.3712158203125, + "logps/rejected": -491.93829345703125, + "loss": 0.0878, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.199800729751587, + "rewards/margins": 2.771793842315674, + "rewards/rejected": 0.42800694704055786, + "step": 583 + }, + { + "epoch": 0.24353628023352794, + "grad_norm": 4.887111186981201, + "learning_rate": 6.763956451239287e-07, + "logits/chosen": -0.07431793212890625, + "logits/rejected": -0.8063115477561951, + "logps/chosen": -974.6423950195312, + "logps/rejected": -582.7467651367188, + "loss": 0.1754, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2957656383514404, + "rewards/margins": 1.95881187915802, + "rewards/rejected": 0.336953729391098, + "step": 584 + }, + { + "epoch": 0.24395329441201, + "grad_norm": 4.571489334106445, + "learning_rate": 6.775538568450313e-07, + "logits/chosen": -0.47171205282211304, + "logits/rejected": -0.7927331924438477, + "logps/chosen": -1092.269775390625, + "logps/rejected": -540.205078125, + "loss": 0.1369, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5768678188323975, + "rewards/margins": 2.2944729328155518, + "rewards/rejected": 0.28239476680755615, + "step": 585 + }, + { + "epoch": 0.24437030859049208, + "grad_norm": 4.157436847686768, + "learning_rate": 6.78712068566134e-07, + "logits/chosen": -0.35529640316963196, + "logits/rejected": -0.7153292298316956, + "logps/chosen": -828.2791137695312, + "logps/rejected": -469.28662109375, + "loss": 0.1545, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3337931632995605, + "rewards/margins": 2.0390233993530273, + "rewards/rejected": 0.29476988315582275, + "step": 586 + }, + { + "epoch": 0.24478732276897414, + "grad_norm": 3.4201345443725586, + "learning_rate": 6.798702802872366e-07, + "logits/chosen": -0.6787762641906738, + "logits/rejected": -0.8459933996200562, + "logps/chosen": -965.1478271484375, + "logps/rejected": -613.1910400390625, + "loss": 0.102, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6715972423553467, + "rewards/margins": 2.5818915367126465, + "rewards/rejected": 0.08970566093921661, + "step": 587 + }, + { + "epoch": 0.24520433694745622, + "grad_norm": 25.067283630371094, + "learning_rate": 6.810284920083392e-07, + "logits/chosen": -1.5628466606140137, + "logits/rejected": -0.6070376634597778, + "logps/chosen": -1103.964599609375, + "logps/rejected": -515.2569580078125, + "loss": 0.1563, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.713486671447754, + "rewards/margins": 2.5505623817443848, + "rewards/rejected": 0.16292421519756317, + "step": 588 + }, + { + "epoch": 0.24562135112593828, + "grad_norm": 20.50541114807129, + "learning_rate": 6.821867037294417e-07, + "logits/chosen": -0.7356414198875427, + "logits/rejected": -0.8691208362579346, + "logps/chosen": -1003.6810913085938, + "logps/rejected": -511.199951171875, + "loss": 0.1596, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4230072498321533, + "rewards/margins": 2.3711071014404297, + "rewards/rejected": 0.05190009996294975, + "step": 589 + }, + { + "epoch": 0.24603836530442036, + "grad_norm": 22.755353927612305, + "learning_rate": 6.833449154505443e-07, + "logits/chosen": -1.02828049659729, + "logits/rejected": -0.7264670729637146, + "logps/chosen": -828.9998168945312, + "logps/rejected": -476.0956115722656, + "loss": 0.1718, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.160987377166748, + "rewards/margins": 2.1044201850891113, + "rewards/rejected": 0.05656719207763672, + "step": 590 + }, + { + "epoch": 0.24645537948290241, + "grad_norm": 2.129828453063965, + "learning_rate": 6.845031271716471e-07, + "logits/chosen": -0.78767991065979, + "logits/rejected": NaN, + "logps/chosen": -752.5457153320312, + "logps/rejected": -304.2408447265625, + "loss": 0.1703, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0995919704437256, + "rewards/margins": 2.306191921234131, + "rewards/rejected": -0.20660018920898438, + "step": 591 + }, + { + "epoch": 0.2468723936613845, + "grad_norm": 19.6134033203125, + "learning_rate": 6.856613388927496e-07, + "logits/chosen": -0.5838348865509033, + "logits/rejected": -0.7953169345855713, + "logps/chosen": -984.330810546875, + "logps/rejected": -471.156005859375, + "loss": 0.2782, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6146644353866577, + "rewards/margins": 1.5306525230407715, + "rewards/rejected": 0.08401203155517578, + "step": 592 + }, + { + "epoch": 0.24728940783986655, + "grad_norm": 2.8890933990478516, + "learning_rate": 6.868195506138522e-07, + "logits/chosen": -0.2521997094154358, + "logits/rejected": -1.164298415184021, + "logps/chosen": -708.4560546875, + "logps/rejected": -387.8543395996094, + "loss": 0.0929, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6958606243133545, + "rewards/margins": 2.74900484085083, + "rewards/rejected": -0.05314427614212036, + "step": 593 + }, + { + "epoch": 0.24770642201834864, + "grad_norm": 5.386482238769531, + "learning_rate": 6.879777623349549e-07, + "logits/chosen": -0.10945963859558105, + "logits/rejected": -1.1509027481079102, + "logps/chosen": -927.3831787109375, + "logps/rejected": -499.5493469238281, + "loss": 0.1393, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5061428546905518, + "rewards/margins": 2.795931339263916, + "rewards/rejected": -0.28978824615478516, + "step": 594 + }, + { + "epoch": 0.2481234361968307, + "grad_norm": 12.649048805236816, + "learning_rate": 6.891359740560575e-07, + "logits/chosen": -1.3372373580932617, + "logits/rejected": -0.792178750038147, + "logps/chosen": -1076.3389892578125, + "logps/rejected": -598.0125732421875, + "loss": 0.2368, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9524903297424316, + "rewards/margins": 1.8556957244873047, + "rewards/rejected": 0.0967945083975792, + "step": 595 + }, + { + "epoch": 0.24854045037531275, + "grad_norm": 2.6925697326660156, + "learning_rate": 6.902941857771601e-07, + "logits/chosen": -0.5740046501159668, + "logits/rejected": -0.9453269243240356, + "logps/chosen": -965.701904296875, + "logps/rejected": -523.4384765625, + "loss": 0.0969, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.233219623565674, + "rewards/margins": 2.948458671569824, + "rewards/rejected": 0.2847610414028168, + "step": 596 + }, + { + "epoch": 0.24895746455379483, + "grad_norm": 42.091922760009766, + "learning_rate": 6.914523974982628e-07, + "logits/chosen": -0.4415721595287323, + "logits/rejected": -0.6550403237342834, + "logps/chosen": -1065.6392822265625, + "logps/rejected": -695.2924194335938, + "loss": 0.258, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2032508850097656, + "rewards/margins": 2.0537729263305664, + "rewards/rejected": 0.14947815239429474, + "step": 597 + }, + { + "epoch": 0.2493744787322769, + "grad_norm": 2.963456869125366, + "learning_rate": 6.926106092193653e-07, + "logits/chosen": -0.09649978578090668, + "logits/rejected": -0.7514311075210571, + "logps/chosen": -917.226806640625, + "logps/rejected": -557.680908203125, + "loss": 0.1224, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2193751335144043, + "rewards/margins": 2.620576858520508, + "rewards/rejected": 0.5987979769706726, + "step": 598 + }, + { + "epoch": 0.24979149291075897, + "grad_norm": 3.8899786472320557, + "learning_rate": 6.937688209404679e-07, + "logits/chosen": -0.606011688709259, + "logits/rejected": -0.6112761497497559, + "logps/chosen": -1385.108154296875, + "logps/rejected": -745.8326416015625, + "loss": 0.1456, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.562312364578247, + "rewards/margins": 2.351155996322632, + "rewards/rejected": 0.2111564576625824, + "step": 599 + }, + { + "epoch": 0.25020850708924103, + "grad_norm": 7.77393102645874, + "learning_rate": 6.949270326615707e-07, + "logits/chosen": -1.0892990827560425, + "logits/rejected": -0.8075054287910461, + "logps/chosen": -962.5426635742188, + "logps/rejected": -493.09796142578125, + "loss": 0.1798, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4412333965301514, + "rewards/margins": 2.343658924102783, + "rewards/rejected": 0.09757442772388458, + "step": 600 + }, + { + "epoch": 0.2506255212677231, + "grad_norm": 11.310124397277832, + "learning_rate": 6.960852443826732e-07, + "logits/chosen": -1.4320073127746582, + "logits/rejected": NaN, + "logps/chosen": -1169.1802978515625, + "logps/rejected": -503.3662109375, + "loss": 0.2973, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8641380071640015, + "rewards/margins": 1.8802002668380737, + "rewards/rejected": -0.016062162816524506, + "step": 601 + }, + { + "epoch": 0.2510425354462052, + "grad_norm": 4.969920635223389, + "learning_rate": 6.972434561037758e-07, + "logits/chosen": -0.8338360786437988, + "logits/rejected": NaN, + "logps/chosen": -801.736572265625, + "logps/rejected": -283.032958984375, + "loss": 0.1483, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.171736478805542, + "rewards/margins": 2.166746139526367, + "rewards/rejected": 0.004990383982658386, + "step": 602 + }, + { + "epoch": 0.25145954962468725, + "grad_norm": 5.8967132568359375, + "learning_rate": 6.984016678248783e-07, + "logits/chosen": -0.22121232748031616, + "logits/rejected": -1.0394898653030396, + "logps/chosen": -803.666015625, + "logps/rejected": -461.38134765625, + "loss": 0.1454, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.910428762435913, + "rewards/margins": 2.4246320724487305, + "rewards/rejected": 0.4857965409755707, + "step": 603 + }, + { + "epoch": 0.2518765638031693, + "grad_norm": 5.599724769592285, + "learning_rate": 6.995598795459811e-07, + "logits/chosen": -0.12605930864810944, + "logits/rejected": -0.7261084914207458, + "logps/chosen": -1187.8131103515625, + "logps/rejected": -812.2330322265625, + "loss": 0.1354, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.415083408355713, + "rewards/margins": 2.3329017162323, + "rewards/rejected": 0.08218155801296234, + "step": 604 + }, + { + "epoch": 0.25229357798165136, + "grad_norm": 12.576939582824707, + "learning_rate": 7.007180912670837e-07, + "logits/chosen": -0.11005746573209763, + "logits/rejected": -0.7400598526000977, + "logps/chosen": -891.85986328125, + "logps/rejected": -570.233154296875, + "loss": 0.2601, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9840819835662842, + "rewards/margins": 1.7209751605987549, + "rewards/rejected": 0.26310694217681885, + "step": 605 + }, + { + "epoch": 0.2527105921601334, + "grad_norm": 3.5269579887390137, + "learning_rate": 7.018763029881863e-07, + "logits/chosen": -0.8431596159934998, + "logits/rejected": -0.9353121519088745, + "logps/chosen": -792.7533569335938, + "logps/rejected": -306.334228515625, + "loss": 0.1836, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0603041648864746, + "rewards/margins": 2.0888707637786865, + "rewards/rejected": -0.028566554188728333, + "step": 606 + }, + { + "epoch": 0.25312760633861553, + "grad_norm": 5.083499908447266, + "learning_rate": 7.030345147092889e-07, + "logits/chosen": -0.07433230429887772, + "logits/rejected": -0.6715371608734131, + "logps/chosen": -971.2592163085938, + "logps/rejected": -689.5850830078125, + "loss": 0.1809, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.279583692550659, + "rewards/margins": 2.1948137283325195, + "rewards/rejected": 0.08477022498846054, + "step": 607 + }, + { + "epoch": 0.2535446205170976, + "grad_norm": 1.4929580688476562, + "learning_rate": 7.041927264303915e-07, + "logits/chosen": -0.3759745955467224, + "logits/rejected": -1.2254726886749268, + "logps/chosen": -813.1041259765625, + "logps/rejected": -433.30908203125, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.19724702835083, + "rewards/margins": 3.819444179534912, + "rewards/rejected": -0.622197151184082, + "step": 608 + }, + { + "epoch": 0.25396163469557964, + "grad_norm": 5.228629112243652, + "learning_rate": 7.053509381514942e-07, + "logits/chosen": -0.7014021873474121, + "logits/rejected": -1.017041563987732, + "logps/chosen": -674.4427490234375, + "logps/rejected": -431.82366943359375, + "loss": 0.1132, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8042755126953125, + "rewards/margins": 2.930818557739258, + "rewards/rejected": -0.1265430748462677, + "step": 609 + }, + { + "epoch": 0.2543786488740617, + "grad_norm": 3.598060369491577, + "learning_rate": 7.065091498725968e-07, + "logits/chosen": -0.10981921851634979, + "logits/rejected": -0.6309479475021362, + "logps/chosen": -1301.880126953125, + "logps/rejected": -755.8721923828125, + "loss": 0.0843, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.392566204071045, + "rewards/margins": 3.402348756790161, + "rewards/rejected": -0.009782787412405014, + "step": 610 + }, + { + "epoch": 0.2547956630525438, + "grad_norm": 5.642092704772949, + "learning_rate": 7.076673615936994e-07, + "logits/chosen": -0.0569276325404644, + "logits/rejected": -0.8452401161193848, + "logps/chosen": -875.9761352539062, + "logps/rejected": -487.72991943359375, + "loss": 0.1564, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.581041097640991, + "rewards/margins": 2.398427724838257, + "rewards/rejected": 0.18261317908763885, + "step": 611 + }, + { + "epoch": 0.25521267723102586, + "grad_norm": 3.79573917388916, + "learning_rate": 7.088255733148019e-07, + "logits/chosen": -0.8144851326942444, + "logits/rejected": NaN, + "logps/chosen": -944.989501953125, + "logps/rejected": -426.1669921875, + "loss": 0.2243, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8051097393035889, + "rewards/margins": 1.9441395998001099, + "rewards/rejected": -0.13902969658374786, + "step": 612 + }, + { + "epoch": 0.2556296914095079, + "grad_norm": 47.055686950683594, + "learning_rate": 7.099837850359047e-07, + "logits/chosen": -1.679915428161621, + "logits/rejected": NaN, + "logps/chosen": -1138.622802734375, + "logps/rejected": -526.8446044921875, + "loss": 0.2384, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.38486909866333, + "rewards/margins": 2.1225922107696533, + "rewards/rejected": 0.26227667927742004, + "step": 613 + }, + { + "epoch": 0.25604670558799, + "grad_norm": 2.9760379791259766, + "learning_rate": 7.111419967570073e-07, + "logits/chosen": -0.7639731168746948, + "logits/rejected": -0.8383712768554688, + "logps/chosen": -1001.8594360351562, + "logps/rejected": -430.9266662597656, + "loss": 0.158, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.435476064682007, + "rewards/margins": 2.6505417823791504, + "rewards/rejected": -0.21506576240062714, + "step": 614 + }, + { + "epoch": 0.2564637197664721, + "grad_norm": 5.29321813583374, + "learning_rate": 7.123002084781098e-07, + "logits/chosen": -0.7288037538528442, + "logits/rejected": -0.8777828216552734, + "logps/chosen": -1116.4102783203125, + "logps/rejected": -665.8782958984375, + "loss": 0.1372, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3669815063476562, + "rewards/margins": 2.4096837043762207, + "rewards/rejected": -0.04270210117101669, + "step": 615 + }, + { + "epoch": 0.25688073394495414, + "grad_norm": 3.693591833114624, + "learning_rate": 7.134584201992125e-07, + "logits/chosen": -0.2041352093219757, + "logits/rejected": -0.902887225151062, + "logps/chosen": -944.523193359375, + "logps/rejected": -577.8483276367188, + "loss": 0.1233, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3401541709899902, + "rewards/margins": 2.3476059436798096, + "rewards/rejected": -0.007452018558979034, + "step": 616 + }, + { + "epoch": 0.2572977481234362, + "grad_norm": 2.9373841285705566, + "learning_rate": 7.14616631920315e-07, + "logits/chosen": -0.4914437234401703, + "logits/rejected": -0.8496617078781128, + "logps/chosen": -836.2396850585938, + "logps/rejected": -399.61724853515625, + "loss": 0.1412, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.002143144607544, + "rewards/margins": 2.3433380126953125, + "rewards/rejected": -0.34119492769241333, + "step": 617 + }, + { + "epoch": 0.25771476230191825, + "grad_norm": 4.4135003089904785, + "learning_rate": 7.157748436414177e-07, + "logits/chosen": -0.4537695646286011, + "logits/rejected": -1.0564924478530884, + "logps/chosen": -760.844482421875, + "logps/rejected": -419.2314147949219, + "loss": 0.1371, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.278047561645508, + "rewards/margins": 2.4027953147888184, + "rewards/rejected": -0.12474765628576279, + "step": 618 + }, + { + "epoch": 0.2581317764804003, + "grad_norm": 3.6959428787231445, + "learning_rate": 7.169330553625204e-07, + "logits/chosen": -0.4448232054710388, + "logits/rejected": -0.697265088558197, + "logps/chosen": -1009.5532836914062, + "logps/rejected": -605.65625, + "loss": 0.1251, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6326346397399902, + "rewards/margins": 2.264094829559326, + "rewards/rejected": 0.3685401976108551, + "step": 619 + }, + { + "epoch": 0.2585487906588824, + "grad_norm": 8.950996398925781, + "learning_rate": 7.180912670836229e-07, + "logits/chosen": -0.5991683602333069, + "logits/rejected": -0.6143772602081299, + "logps/chosen": -1078.9169921875, + "logps/rejected": -579.3917846679688, + "loss": 0.1225, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2758069038391113, + "rewards/margins": 3.252493381500244, + "rewards/rejected": 0.023313529789447784, + "step": 620 + }, + { + "epoch": 0.2589658048373645, + "grad_norm": 2.318361520767212, + "learning_rate": 7.192494788047255e-07, + "logits/chosen": -0.0816989317536354, + "logits/rejected": -0.7715961337089539, + "logps/chosen": -925.0755615234375, + "logps/rejected": -534.7843017578125, + "loss": 0.0946, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.281123638153076, + "rewards/margins": 2.682097911834717, + "rewards/rejected": -0.4009742736816406, + "step": 621 + }, + { + "epoch": 0.25938281901584653, + "grad_norm": 29.81993293762207, + "learning_rate": 7.204076905258283e-07, + "logits/chosen": -1.6424189805984497, + "logits/rejected": NaN, + "logps/chosen": -1094.3739013671875, + "logps/rejected": -500.6665344238281, + "loss": 0.1886, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.905848979949951, + "rewards/margins": 2.7671871185302734, + "rewards/rejected": 0.1386619508266449, + "step": 622 + }, + { + "epoch": 0.2597998331943286, + "grad_norm": 19.26909637451172, + "learning_rate": 7.215659022469308e-07, + "logits/chosen": -1.2158726453781128, + "logits/rejected": -0.8885433673858643, + "logps/chosen": -1011.89306640625, + "logps/rejected": -520.267822265625, + "loss": 0.1501, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5149621963500977, + "rewards/margins": 2.4073736667633057, + "rewards/rejected": 0.10758838057518005, + "step": 623 + }, + { + "epoch": 0.2602168473728107, + "grad_norm": 4.468731880187988, + "learning_rate": 7.227241139680334e-07, + "logits/chosen": -1.0297861099243164, + "logits/rejected": -0.6159461736679077, + "logps/chosen": -987.4801025390625, + "logps/rejected": -610.2789306640625, + "loss": 0.2223, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.344578266143799, + "rewards/margins": 1.982051968574524, + "rewards/rejected": 0.3625263571739197, + "step": 624 + }, + { + "epoch": 0.26063386155129276, + "grad_norm": 20.491273880004883, + "learning_rate": 7.23882325689136e-07, + "logits/chosen": -2.2978689670562744, + "logits/rejected": -0.15781213343143463, + "logps/chosen": -1249.4398193359375, + "logps/rejected": -512.7982788085938, + "loss": 0.3169, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.2322113513946533, + "rewards/margins": 1.9884330034255981, + "rewards/rejected": 0.24377843737602234, + "step": 625 + }, + { + "epoch": 0.2610508757297748, + "grad_norm": 7.675472259521484, + "learning_rate": 7.250405374102386e-07, + "logits/chosen": -0.9392146468162537, + "logits/rejected": -1.1012027263641357, + "logps/chosen": -927.7213134765625, + "logps/rejected": -365.92803955078125, + "loss": 0.1613, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.624394416809082, + "rewards/margins": 2.649636745452881, + "rewards/rejected": -0.02524242177605629, + "step": 626 + }, + { + "epoch": 0.26146788990825687, + "grad_norm": 4.142253875732422, + "learning_rate": 7.261987491313413e-07, + "logits/chosen": -0.1493813842535019, + "logits/rejected": -0.7355846762657166, + "logps/chosen": -984.0211181640625, + "logps/rejected": -522.3724365234375, + "loss": 0.1303, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.188222646713257, + "rewards/margins": 2.276895761489868, + "rewards/rejected": -0.08867339789867401, + "step": 627 + }, + { + "epoch": 0.2618849040867389, + "grad_norm": 6.18007755279541, + "learning_rate": 7.273569608524439e-07, + "logits/chosen": -1.084539771080017, + "logits/rejected": NaN, + "logps/chosen": -826.206298828125, + "logps/rejected": -388.668701171875, + "loss": 0.2548, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.2350287437438965, + "rewards/margins": 1.9623374938964844, + "rewards/rejected": 0.27269136905670166, + "step": 628 + }, + { + "epoch": 0.26230191826522103, + "grad_norm": 2.901905059814453, + "learning_rate": 7.285151725735465e-07, + "logits/chosen": -0.6431840658187866, + "logits/rejected": -0.8045487403869629, + "logps/chosen": -908.0816650390625, + "logps/rejected": -609.215576171875, + "loss": 0.0975, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7318227291107178, + "rewards/margins": 2.8934030532836914, + "rewards/rejected": -0.16158027946949005, + "step": 629 + }, + { + "epoch": 0.2627189324437031, + "grad_norm": 1.7798848152160645, + "learning_rate": 7.296733842946491e-07, + "logits/chosen": -0.7185320854187012, + "logits/rejected": -0.7395218014717102, + "logps/chosen": -1068.451171875, + "logps/rejected": -700.2748413085938, + "loss": 0.0596, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2202060222625732, + "rewards/margins": 3.3558754920959473, + "rewards/rejected": -0.13566932082176208, + "step": 630 + }, + { + "epoch": 0.26313594662218515, + "grad_norm": 4.306009769439697, + "learning_rate": 7.308315960157517e-07, + "logits/chosen": -0.5681269764900208, + "logits/rejected": -0.8139169216156006, + "logps/chosen": -912.421142578125, + "logps/rejected": -506.1282653808594, + "loss": 0.1569, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.359752655029297, + "rewards/margins": 2.5077924728393555, + "rewards/rejected": -0.14803993701934814, + "step": 631 + }, + { + "epoch": 0.2635529608006672, + "grad_norm": 2.721731185913086, + "learning_rate": 7.319898077368544e-07, + "logits/chosen": -0.21098235249519348, + "logits/rejected": -0.956455647945404, + "logps/chosen": -900.13232421875, + "logps/rejected": -476.4131164550781, + "loss": 0.0876, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7785096168518066, + "rewards/margins": 2.80594801902771, + "rewards/rejected": -0.027438342571258545, + "step": 632 + }, + { + "epoch": 0.2639699749791493, + "grad_norm": 6.343760013580322, + "learning_rate": 7.33148019457957e-07, + "logits/chosen": -0.728594183921814, + "logits/rejected": -0.758413553237915, + "logps/chosen": -1084.8740234375, + "logps/rejected": -637.304931640625, + "loss": 0.1921, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.445596218109131, + "rewards/margins": 2.588635206222534, + "rewards/rejected": -0.14303895831108093, + "step": 633 + }, + { + "epoch": 0.26438698915763137, + "grad_norm": 6.686293601989746, + "learning_rate": 7.343062311790595e-07, + "logits/chosen": -1.1671501398086548, + "logits/rejected": -0.6036487817764282, + "logps/chosen": -1215.345947265625, + "logps/rejected": -620.2894897460938, + "loss": 0.14, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2133841514587402, + "rewards/margins": 3.0950424671173096, + "rewards/rejected": 0.11834187805652618, + "step": 634 + }, + { + "epoch": 0.2648040033361134, + "grad_norm": 2.612384557723999, + "learning_rate": 7.354644429001623e-07, + "logits/chosen": -0.2142440676689148, + "logits/rejected": -1.0654038190841675, + "logps/chosen": -854.1160278320312, + "logps/rejected": -457.0332336425781, + "loss": 0.1159, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.489722490310669, + "rewards/margins": 2.3180458545684814, + "rewards/rejected": 0.1716766357421875, + "step": 635 + }, + { + "epoch": 0.2652210175145955, + "grad_norm": 7.894227981567383, + "learning_rate": 7.366226546212649e-07, + "logits/chosen": -0.6962148547172546, + "logits/rejected": -0.6769024133682251, + "logps/chosen": -972.771484375, + "logps/rejected": -601.6817626953125, + "loss": 0.1402, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.400357961654663, + "rewards/margins": 2.917607307434082, + "rewards/rejected": -0.5172494649887085, + "step": 636 + }, + { + "epoch": 0.2656380316930776, + "grad_norm": 1.9356327056884766, + "learning_rate": 7.377808663423674e-07, + "logits/chosen": -0.3570998013019562, + "logits/rejected": -1.0237910747528076, + "logps/chosen": -772.055419921875, + "logps/rejected": -440.7916259765625, + "loss": 0.055, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7739291191101074, + "rewards/margins": 3.408705711364746, + "rewards/rejected": -0.6347767114639282, + "step": 637 + }, + { + "epoch": 0.26605504587155965, + "grad_norm": 3.43637752532959, + "learning_rate": 7.3893907806347e-07, + "logits/chosen": -0.8870781660079956, + "logits/rejected": -0.7921243906021118, + "logps/chosen": -834.3616943359375, + "logps/rejected": -474.7134094238281, + "loss": 0.1508, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.195971727371216, + "rewards/margins": 2.3956241607666016, + "rewards/rejected": -0.19965247809886932, + "step": 638 + }, + { + "epoch": 0.2664720600500417, + "grad_norm": 1.7186380624771118, + "learning_rate": 7.400972897845727e-07, + "logits/chosen": 0.02242952585220337, + "logits/rejected": -0.9621630311012268, + "logps/chosen": -902.3709716796875, + "logps/rejected": -445.0989990234375, + "loss": 0.0667, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8822505474090576, + "rewards/margins": 3.154101848602295, + "rewards/rejected": -0.2718513309955597, + "step": 639 + }, + { + "epoch": 0.26688907422852376, + "grad_norm": 4.002180576324463, + "learning_rate": 7.412555015056753e-07, + "logits/chosen": 0.06847470998764038, + "logits/rejected": -0.9142237901687622, + "logps/chosen": -1401.0712890625, + "logps/rejected": -698.3619384765625, + "loss": 0.1117, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6895318031311035, + "rewards/margins": 3.324552059173584, + "rewards/rejected": -0.6350204944610596, + "step": 640 + }, + { + "epoch": 0.2673060884070058, + "grad_norm": 1.5669466257095337, + "learning_rate": 7.424137132267779e-07, + "logits/chosen": -1.2431749105453491, + "logits/rejected": -0.813616931438446, + "logps/chosen": -784.6046752929688, + "logps/rejected": -458.61077880859375, + "loss": 0.0415, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2542076110839844, + "rewards/margins": 3.287339687347412, + "rewards/rejected": -0.0331321656703949, + "step": 641 + }, + { + "epoch": 0.2677231025854879, + "grad_norm": 3.1287119388580322, + "learning_rate": 7.435719249478806e-07, + "logits/chosen": -0.054623425006866455, + "logits/rejected": -0.7327274084091187, + "logps/chosen": -1036.7576904296875, + "logps/rejected": -666.3532104492188, + "loss": 0.1007, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1151413917541504, + "rewards/margins": 3.3082356452941895, + "rewards/rejected": -0.19309425354003906, + "step": 642 + }, + { + "epoch": 0.26814011676397, + "grad_norm": 3.5025298595428467, + "learning_rate": 7.447301366689831e-07, + "logits/chosen": -1.5353213548660278, + "logits/rejected": NaN, + "logps/chosen": -1033.089599609375, + "logps/rejected": -408.560791015625, + "loss": 0.2033, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3098394870758057, + "rewards/margins": 2.235577344894409, + "rewards/rejected": 0.07426223158836365, + "step": 643 + }, + { + "epoch": 0.26855713094245204, + "grad_norm": 8.659988403320312, + "learning_rate": 7.458883483900858e-07, + "logits/chosen": -1.0440795421600342, + "logits/rejected": -0.6693610548973083, + "logps/chosen": -998.2140502929688, + "logps/rejected": -524.8956909179688, + "loss": 0.1075, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1965155601501465, + "rewards/margins": 3.189103603363037, + "rewards/rejected": 0.007411971688270569, + "step": 644 + }, + { + "epoch": 0.2689741451209341, + "grad_norm": 2.4823145866394043, + "learning_rate": 7.470465601111885e-07, + "logits/chosen": -0.5478352904319763, + "logits/rejected": -1.026309847831726, + "logps/chosen": -981.5814208984375, + "logps/rejected": -547.611083984375, + "loss": 0.0956, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6101837158203125, + "rewards/margins": 2.898257255554199, + "rewards/rejected": -0.2880733609199524, + "step": 645 + }, + { + "epoch": 0.2693911592994162, + "grad_norm": 2.9276905059814453, + "learning_rate": 7.48204771832291e-07, + "logits/chosen": -0.3042536675930023, + "logits/rejected": -0.8953933119773865, + "logps/chosen": -857.6451416015625, + "logps/rejected": -551.00537109375, + "loss": 0.0738, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.047858715057373, + "rewards/margins": 3.3290319442749023, + "rewards/rejected": -0.28117331862449646, + "step": 646 + }, + { + "epoch": 0.26980817347789826, + "grad_norm": 8.345758438110352, + "learning_rate": 7.493629835533936e-07, + "logits/chosen": -0.9025275707244873, + "logits/rejected": -0.9312137365341187, + "logps/chosen": -926.4627075195312, + "logps/rejected": -439.72259521484375, + "loss": 0.2735, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9198803901672363, + "rewards/margins": 2.1003036499023438, + "rewards/rejected": -0.18042317032814026, + "step": 647 + }, + { + "epoch": 0.2702251876563803, + "grad_norm": 1.7529091835021973, + "learning_rate": 7.505211952744961e-07, + "logits/chosen": -0.26978495717048645, + "logits/rejected": -0.902770459651947, + "logps/chosen": -1069.1846923828125, + "logps/rejected": -606.6519775390625, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3645193576812744, + "rewards/margins": 3.651594400405884, + "rewards/rejected": -0.2870750427246094, + "step": 648 + }, + { + "epoch": 0.2706422018348624, + "grad_norm": 1.5432116985321045, + "learning_rate": 7.516794069955989e-07, + "logits/chosen": -1.311213493347168, + "logits/rejected": NaN, + "logps/chosen": -962.08935546875, + "logps/rejected": -313.3416748046875, + "loss": 0.1196, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.204906463623047, + "rewards/margins": 3.3752660751342773, + "rewards/rejected": -0.17035941779613495, + "step": 649 + }, + { + "epoch": 0.27105921601334443, + "grad_norm": 2.0998928546905518, + "learning_rate": 7.528376187167015e-07, + "logits/chosen": -0.7778588533401489, + "logits/rejected": -0.9963641166687012, + "logps/chosen": -993.2413330078125, + "logps/rejected": -504.355712890625, + "loss": 0.0552, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.286280870437622, + "rewards/margins": 3.7770938873291016, + "rewards/rejected": -0.49081307649612427, + "step": 650 + }, + { + "epoch": 0.27147623019182654, + "grad_norm": 1.919900894165039, + "learning_rate": 7.539958304378041e-07, + "logits/chosen": -0.3267011046409607, + "logits/rejected": -0.9513661861419678, + "logps/chosen": -1116.4066162109375, + "logps/rejected": -612.569580078125, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3686208724975586, + "rewards/margins": 3.9149997234344482, + "rewards/rejected": -0.5463787317276001, + "step": 651 + }, + { + "epoch": 0.2718932443703086, + "grad_norm": 4.157155990600586, + "learning_rate": 7.551540421589067e-07, + "logits/chosen": -0.5755597949028015, + "logits/rejected": -1.1028141975402832, + "logps/chosen": -886.7738037109375, + "logps/rejected": -455.74609375, + "loss": 0.1399, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0556139945983887, + "rewards/margins": 2.728410482406616, + "rewards/rejected": -0.6727964878082275, + "step": 652 + }, + { + "epoch": 0.27231025854879065, + "grad_norm": 1.130822777748108, + "learning_rate": 7.563122538800094e-07, + "logits/chosen": -0.23697136342525482, + "logits/rejected": -1.0186610221862793, + "logps/chosen": -918.81982421875, + "logps/rejected": -488.01043701171875, + "loss": 0.0377, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.248232364654541, + "rewards/margins": 3.4116125106811523, + "rewards/rejected": -0.16338026523590088, + "step": 653 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 1.6221712827682495, + "learning_rate": 7.57470465601112e-07, + "logits/chosen": -0.17793306708335876, + "logits/rejected": -0.7149175405502319, + "logps/chosen": -926.912841796875, + "logps/rejected": -570.6372680664062, + "loss": 0.064, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7409508228302, + "rewards/margins": 3.0978012084960938, + "rewards/rejected": -0.35685041546821594, + "step": 654 + }, + { + "epoch": 0.2731442869057548, + "grad_norm": 1.7620303630828857, + "learning_rate": 7.586286773222146e-07, + "logits/chosen": 0.029685012996196747, + "logits/rejected": -1.0535125732421875, + "logps/chosen": -968.8851318359375, + "logps/rejected": -380.34869384765625, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.663417100906372, + "rewards/margins": 4.228096008300781, + "rewards/rejected": -0.5646785497665405, + "step": 655 + }, + { + "epoch": 0.2735613010842369, + "grad_norm": 1.9110461473464966, + "learning_rate": 7.597868890433172e-07, + "logits/chosen": -1.2838466167449951, + "logits/rejected": -0.7231037616729736, + "logps/chosen": -850.8618774414062, + "logps/rejected": -469.94024658203125, + "loss": 0.0697, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.921802520751953, + "rewards/margins": 3.062375783920288, + "rewards/rejected": -0.14057330787181854, + "step": 656 + }, + { + "epoch": 0.27397831526271893, + "grad_norm": 4.188403606414795, + "learning_rate": 7.609451007644197e-07, + "logits/chosen": -0.7908676862716675, + "logits/rejected": -0.6276238560676575, + "logps/chosen": -948.8115234375, + "logps/rejected": -617.05126953125, + "loss": 0.101, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8360581398010254, + "rewards/margins": 3.1596131324768066, + "rewards/rejected": -0.32355502247810364, + "step": 657 + }, + { + "epoch": 0.274395329441201, + "grad_norm": 5.276146411895752, + "learning_rate": 7.621033124855225e-07, + "logits/chosen": -0.8148835897445679, + "logits/rejected": -0.9488002061843872, + "logps/chosen": -958.51611328125, + "logps/rejected": -389.41290283203125, + "loss": 0.1091, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.757282257080078, + "rewards/margins": 3.3732452392578125, + "rewards/rejected": -0.6159630417823792, + "step": 658 + }, + { + "epoch": 0.27481234361968304, + "grad_norm": 1.8217800855636597, + "learning_rate": 7.63261524206625e-07, + "logits/chosen": -0.7798799276351929, + "logits/rejected": -0.8922791481018066, + "logps/chosen": -808.9055786132812, + "logps/rejected": -478.9110107421875, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7093400955200195, + "rewards/margins": 3.101158857345581, + "rewards/rejected": -0.3918190002441406, + "step": 659 + }, + { + "epoch": 0.27522935779816515, + "grad_norm": 4.446613311767578, + "learning_rate": 7.644197359277276e-07, + "logits/chosen": -1.3124439716339111, + "logits/rejected": -0.8156660795211792, + "logps/chosen": -1074.635498046875, + "logps/rejected": -547.177978515625, + "loss": 0.1578, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.430936336517334, + "rewards/margins": 2.6298398971557617, + "rewards/rejected": -0.19890347123146057, + "step": 660 + }, + { + "epoch": 0.2756463719766472, + "grad_norm": 1.8931450843811035, + "learning_rate": 7.655779476488303e-07, + "logits/chosen": -0.10071955621242523, + "logits/rejected": -0.9861562252044678, + "logps/chosen": -1004.8538208007812, + "logps/rejected": -606.2979125976562, + "loss": 0.0472, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.853541374206543, + "rewards/margins": 3.103257179260254, + "rewards/rejected": -0.2497154325246811, + "step": 661 + }, + { + "epoch": 0.27606338615512926, + "grad_norm": 1.9569365978240967, + "learning_rate": 7.667361593699329e-07, + "logits/chosen": -0.0710374116897583, + "logits/rejected": -0.8563084006309509, + "logps/chosen": -779.7251586914062, + "logps/rejected": -446.2644348144531, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8926262855529785, + "rewards/margins": 3.2478365898132324, + "rewards/rejected": -0.3552105128765106, + "step": 662 + }, + { + "epoch": 0.2764804003336113, + "grad_norm": 6.9746222496032715, + "learning_rate": 7.678943710910355e-07, + "logits/chosen": -0.9494689702987671, + "logits/rejected": -0.6199932098388672, + "logps/chosen": -1155.0352783203125, + "logps/rejected": -831.6119384765625, + "loss": 0.1605, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5138635635375977, + "rewards/margins": 2.4570164680480957, + "rewards/rejected": 0.05684681236743927, + "step": 663 + }, + { + "epoch": 0.27689741451209343, + "grad_norm": 19.132970809936523, + "learning_rate": 7.690525828121382e-07, + "logits/chosen": -0.8272172808647156, + "logits/rejected": -0.803703784942627, + "logps/chosen": -891.4105224609375, + "logps/rejected": -451.93743896484375, + "loss": 0.1529, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.868781328201294, + "rewards/margins": 2.2544689178466797, + "rewards/rejected": -0.38568761944770813, + "step": 664 + }, + { + "epoch": 0.2773144286905755, + "grad_norm": 8.334583282470703, + "learning_rate": 7.702107945332407e-07, + "logits/chosen": -1.299774408340454, + "logits/rejected": NaN, + "logps/chosen": -1235.95751953125, + "logps/rejected": -502.58258056640625, + "loss": 0.1814, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.773240804672241, + "rewards/margins": 3.2850892543792725, + "rewards/rejected": -0.5118484497070312, + "step": 665 + }, + { + "epoch": 0.27773144286905754, + "grad_norm": 4.325191020965576, + "learning_rate": 7.713690062543433e-07, + "logits/chosen": -0.8903163075447083, + "logits/rejected": -0.8252299427986145, + "logps/chosen": -922.1923217773438, + "logps/rejected": -427.3809814453125, + "loss": 0.1299, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4221014976501465, + "rewards/margins": 3.0962209701538086, + "rewards/rejected": -0.6741191744804382, + "step": 666 + }, + { + "epoch": 0.2781484570475396, + "grad_norm": 1.6816037893295288, + "learning_rate": 7.725272179754461e-07, + "logits/chosen": -0.23599767684936523, + "logits/rejected": -0.9079434871673584, + "logps/chosen": -829.1220703125, + "logps/rejected": -531.5292358398438, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1087546348571777, + "rewards/margins": 3.7729506492614746, + "rewards/rejected": -0.6641956567764282, + "step": 667 + }, + { + "epoch": 0.2785654712260217, + "grad_norm": 3.0065712928771973, + "learning_rate": 7.736854296965486e-07, + "logits/chosen": -1.7088561058044434, + "logits/rejected": -0.5686511397361755, + "logps/chosen": -1115.936279296875, + "logps/rejected": -474.3078308105469, + "loss": 0.1933, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7226858139038086, + "rewards/margins": 2.602246046066284, + "rewards/rejected": 0.1204402968287468, + "step": 668 + }, + { + "epoch": 0.27898248540450377, + "grad_norm": 4.5065388679504395, + "learning_rate": 7.748436414176512e-07, + "logits/chosen": -0.09065871685743332, + "logits/rejected": -0.7773456573486328, + "logps/chosen": -1023.1198120117188, + "logps/rejected": -643.0634765625, + "loss": 0.1515, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5717170238494873, + "rewards/margins": 2.665637493133545, + "rewards/rejected": -0.09392015635967255, + "step": 669 + }, + { + "epoch": 0.2793994995829858, + "grad_norm": 2.9850385189056396, + "learning_rate": 7.760018531387537e-07, + "logits/chosen": -0.7854663133621216, + "logits/rejected": -0.8607598543167114, + "logps/chosen": -1063.158935546875, + "logps/rejected": -605.2032470703125, + "loss": 0.1114, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5820908546447754, + "rewards/margins": 3.0234179496765137, + "rewards/rejected": -0.4413270950317383, + "step": 670 + }, + { + "epoch": 0.2798165137614679, + "grad_norm": 6.388740062713623, + "learning_rate": 7.771600648598565e-07, + "logits/chosen": -0.6397448778152466, + "logits/rejected": -0.6939932703971863, + "logps/chosen": -913.4097290039062, + "logps/rejected": -537.0264282226562, + "loss": 0.1454, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5907554626464844, + "rewards/margins": 2.5999560356140137, + "rewards/rejected": -0.009200677275657654, + "step": 671 + }, + { + "epoch": 0.28023352793994993, + "grad_norm": 1.1794698238372803, + "learning_rate": 7.783182765809591e-07, + "logits/chosen": -0.3239077627658844, + "logits/rejected": -0.9997440576553345, + "logps/chosen": -864.90771484375, + "logps/rejected": -418.0434265136719, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3542513847351074, + "rewards/margins": 3.9451379776000977, + "rewards/rejected": -0.5908867120742798, + "step": 672 + }, + { + "epoch": 0.28065054211843204, + "grad_norm": 1.6917502880096436, + "learning_rate": 7.794764883020616e-07, + "logits/chosen": -0.4452937841415405, + "logits/rejected": -1.1055946350097656, + "logps/chosen": -934.9141235351562, + "logps/rejected": -442.1134033203125, + "loss": 0.0384, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1753227710723877, + "rewards/margins": 3.6317028999328613, + "rewards/rejected": -0.4563799202442169, + "step": 673 + }, + { + "epoch": 0.2810675562969141, + "grad_norm": 2.924044609069824, + "learning_rate": 7.806347000231643e-07, + "logits/chosen": -1.871835708618164, + "logits/rejected": -0.2696525454521179, + "logps/chosen": -1480.7890625, + "logps/rejected": -583.851318359375, + "loss": 0.0901, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.037710666656494, + "rewards/margins": 2.7692651748657227, + "rewards/rejected": 0.2684454023838043, + "step": 674 + }, + { + "epoch": 0.28148457047539616, + "grad_norm": 2.355971574783325, + "learning_rate": 7.817929117442669e-07, + "logits/chosen": -0.9192445278167725, + "logits/rejected": -1.0934759378433228, + "logps/chosen": -884.8728637695312, + "logps/rejected": -426.6981201171875, + "loss": 0.0798, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2455408573150635, + "rewards/margins": 3.2321510314941406, + "rewards/rejected": -0.9866101741790771, + "step": 675 + }, + { + "epoch": 0.2819015846538782, + "grad_norm": 1.8769727945327759, + "learning_rate": 7.829511234653695e-07, + "logits/chosen": -0.7166193127632141, + "logits/rejected": -1.0846006870269775, + "logps/chosen": -766.2208251953125, + "logps/rejected": -415.28240966796875, + "loss": 0.0648, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2667829990386963, + "rewards/margins": 4.438577175140381, + "rewards/rejected": -1.1717946529388428, + "step": 676 + }, + { + "epoch": 0.2823185988323603, + "grad_norm": 2.1219003200531006, + "learning_rate": 7.841093351864722e-07, + "logits/chosen": -0.21094398200511932, + "logits/rejected": -0.8582535982131958, + "logps/chosen": -807.8544921875, + "logps/rejected": -522.2545166015625, + "loss": 0.0744, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7805588245391846, + "rewards/margins": 3.1414308547973633, + "rewards/rejected": -0.3608722984790802, + "step": 677 + }, + { + "epoch": 0.2827356130108424, + "grad_norm": 1.2818228006362915, + "learning_rate": 7.852675469075748e-07, + "logits/chosen": -0.566182553768158, + "logits/rejected": -0.8257555961608887, + "logps/chosen": -1085.4267578125, + "logps/rejected": -603.5076293945312, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5303125381469727, + "rewards/margins": 4.109659671783447, + "rewards/rejected": -0.5793472528457642, + "step": 678 + }, + { + "epoch": 0.28315262718932444, + "grad_norm": 9.401461601257324, + "learning_rate": 7.864257586286773e-07, + "logits/chosen": -1.328122854232788, + "logits/rejected": NaN, + "logps/chosen": -1199.766357421875, + "logps/rejected": -496.22442626953125, + "loss": 0.1612, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5032546520233154, + "rewards/margins": 4.129695415496826, + "rewards/rejected": -0.6264407634735107, + "step": 679 + }, + { + "epoch": 0.2835696413678065, + "grad_norm": 1.502197265625, + "learning_rate": 7.875839703497801e-07, + "logits/chosen": -0.43498438596725464, + "logits/rejected": -1.1827187538146973, + "logps/chosen": -730.7311401367188, + "logps/rejected": -481.34783935546875, + "loss": 0.0476, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.360247850418091, + "rewards/margins": 3.355269432067871, + "rewards/rejected": -0.9950214624404907, + "step": 680 + }, + { + "epoch": 0.28398665554628855, + "grad_norm": 0.8643062114715576, + "learning_rate": 7.887421820708827e-07, + "logits/chosen": -0.5096925497055054, + "logits/rejected": -0.851090669631958, + "logps/chosen": -791.5926513671875, + "logps/rejected": -517.8757934570312, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5895133018493652, + "rewards/margins": 4.313060760498047, + "rewards/rejected": -0.7235476970672607, + "step": 681 + }, + { + "epoch": 0.28440366972477066, + "grad_norm": 7.837177276611328, + "learning_rate": 7.899003937919852e-07, + "logits/chosen": -2.1451871395111084, + "logits/rejected": NaN, + "logps/chosen": -926.6055908203125, + "logps/rejected": -340.2315673828125, + "loss": 0.2703, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9631454944610596, + "rewards/margins": 2.0916168689727783, + "rewards/rejected": -0.12847137451171875, + "step": 682 + }, + { + "epoch": 0.2848206839032527, + "grad_norm": 2.6071720123291016, + "learning_rate": 7.910586055130878e-07, + "logits/chosen": -1.2867193222045898, + "logits/rejected": -0.8418951630592346, + "logps/chosen": -1076.8138427734375, + "logps/rejected": -602.5552978515625, + "loss": 0.1466, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1427931785583496, + "rewards/margins": 3.0005064010620117, + "rewards/rejected": -0.857712984085083, + "step": 683 + }, + { + "epoch": 0.28523769808173477, + "grad_norm": 1.7058625221252441, + "learning_rate": 7.922168172341905e-07, + "logits/chosen": -0.19488556683063507, + "logits/rejected": -1.0552797317504883, + "logps/chosen": -733.0067749023438, + "logps/rejected": -418.456787109375, + "loss": 0.0654, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1535425186157227, + "rewards/margins": 3.106667995452881, + "rewards/rejected": -0.9531255960464478, + "step": 684 + }, + { + "epoch": 0.2856547122602168, + "grad_norm": 11.146920204162598, + "learning_rate": 7.933750289552931e-07, + "logits/chosen": -0.3163211941719055, + "logits/rejected": -1.055586814880371, + "logps/chosen": -1127.7052001953125, + "logps/rejected": -566.1273803710938, + "loss": 0.1442, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9608155488967896, + "rewards/margins": 3.134819507598877, + "rewards/rejected": -1.1740039587020874, + "step": 685 + }, + { + "epoch": 0.28607172643869894, + "grad_norm": 11.973971366882324, + "learning_rate": 7.945332406763957e-07, + "logits/chosen": -0.6567438840866089, + "logits/rejected": -0.8161460161209106, + "logps/chosen": -895.21728515625, + "logps/rejected": -461.9906005859375, + "loss": 0.1005, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6188125610351562, + "rewards/margins": 3.7202296257019043, + "rewards/rejected": -1.101416826248169, + "step": 686 + }, + { + "epoch": 0.286488740617181, + "grad_norm": 4.5819902420043945, + "learning_rate": 7.956914523974983e-07, + "logits/chosen": -0.843708336353302, + "logits/rejected": -0.8347323536872864, + "logps/chosen": -902.4332885742188, + "logps/rejected": -522.321044921875, + "loss": 0.1567, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4726815223693848, + "rewards/margins": 2.9824066162109375, + "rewards/rejected": -0.5097252130508423, + "step": 687 + }, + { + "epoch": 0.28690575479566305, + "grad_norm": 1.642796277999878, + "learning_rate": 7.968496641186009e-07, + "logits/chosen": -0.7697095274925232, + "logits/rejected": -0.9155693054199219, + "logps/chosen": -899.1764526367188, + "logps/rejected": -519.9954833984375, + "loss": 0.054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2575902938842773, + "rewards/margins": 3.228018283843994, + "rewards/rejected": -0.9704281091690063, + "step": 688 + }, + { + "epoch": 0.2873227689741451, + "grad_norm": 0.9376092553138733, + "learning_rate": 7.980078758397036e-07, + "logits/chosen": -1.006415605545044, + "logits/rejected": NaN, + "logps/chosen": -709.4091796875, + "logps/rejected": -349.53973388671875, + "loss": 0.116, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7578554153442383, + "rewards/margins": 3.378726005554199, + "rewards/rejected": -0.6208705902099609, + "step": 689 + }, + { + "epoch": 0.2877397831526272, + "grad_norm": 1.5091439485549927, + "learning_rate": 7.991660875608062e-07, + "logits/chosen": -0.5751785635948181, + "logits/rejected": -0.8447078466415405, + "logps/chosen": -838.064208984375, + "logps/rejected": -471.8148193359375, + "loss": 0.0443, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0456974506378174, + "rewards/margins": 4.0821685791015625, + "rewards/rejected": -1.0364713668823242, + "step": 690 + }, + { + "epoch": 0.28815679733110927, + "grad_norm": 3.3090641498565674, + "learning_rate": 8.003242992819088e-07, + "logits/chosen": -0.7243960499763489, + "logits/rejected": NaN, + "logps/chosen": -984.9652709960938, + "logps/rejected": -317.9057922363281, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.376939535140991, + "rewards/margins": 4.38753604888916, + "rewards/rejected": -1.0105968713760376, + "step": 691 + }, + { + "epoch": 0.2885738115095913, + "grad_norm": 1.6954480409622192, + "learning_rate": 8.014825110030114e-07, + "logits/chosen": -0.021316327154636383, + "logits/rejected": -0.8633032441139221, + "logps/chosen": -954.0582275390625, + "logps/rejected": -534.59619140625, + "loss": 0.056, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9286065101623535, + "rewards/margins": 3.950291633605957, + "rewards/rejected": -1.0216851234436035, + "step": 692 + }, + { + "epoch": 0.2889908256880734, + "grad_norm": 10.30211353302002, + "learning_rate": 8.026407227241139e-07, + "logits/chosen": -0.8956800699234009, + "logits/rejected": -0.8521243333816528, + "logps/chosen": -907.3406982421875, + "logps/rejected": -502.7787780761719, + "loss": 0.0998, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.012655735015869, + "rewards/margins": 3.700611114501953, + "rewards/rejected": -0.6879553198814392, + "step": 693 + }, + { + "epoch": 0.28940783986655544, + "grad_norm": 2.5135855674743652, + "learning_rate": 8.037989344452167e-07, + "logits/chosen": -0.5596056580543518, + "logits/rejected": -0.7565231323242188, + "logps/chosen": -1133.857177734375, + "logps/rejected": -662.1094360351562, + "loss": 0.0635, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9697577953338623, + "rewards/margins": 3.591014862060547, + "rewards/rejected": -0.6212570667266846, + "step": 694 + }, + { + "epoch": 0.28982485404503755, + "grad_norm": 1.3021043539047241, + "learning_rate": 8.049571461663193e-07, + "logits/chosen": -0.2754555940628052, + "logits/rejected": -1.0242847204208374, + "logps/chosen": -725.990234375, + "logps/rejected": -450.32147216796875, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.436117649078369, + "rewards/margins": 3.674661636352539, + "rewards/rejected": -1.238544225692749, + "step": 695 + }, + { + "epoch": 0.2902418682235196, + "grad_norm": 7.527068138122559, + "learning_rate": 8.061153578874218e-07, + "logits/chosen": -0.7449747323989868, + "logits/rejected": -0.7965424060821533, + "logps/chosen": -919.323974609375, + "logps/rejected": -603.3015747070312, + "loss": 0.2012, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.453085422515869, + "rewards/margins": 2.7063755989074707, + "rewards/rejected": -0.25329023599624634, + "step": 696 + }, + { + "epoch": 0.29065888240200166, + "grad_norm": 0.9044526219367981, + "learning_rate": 8.072735696085245e-07, + "logits/chosen": -1.041505217552185, + "logits/rejected": -0.6419574022293091, + "logps/chosen": -844.154052734375, + "logps/rejected": -456.30731201171875, + "loss": 0.1107, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7331080436706543, + "rewards/margins": 3.5984268188476562, + "rewards/rejected": -0.8653184771537781, + "step": 697 + }, + { + "epoch": 0.2910758965804837, + "grad_norm": 1.140866756439209, + "learning_rate": 8.084317813296272e-07, + "logits/chosen": -0.329473078250885, + "logits/rejected": -0.9282429218292236, + "logps/chosen": -1045.888671875, + "logps/rejected": -633.0956420898438, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.135223865509033, + "rewards/margins": 4.227619647979736, + "rewards/rejected": -1.0923957824707031, + "step": 698 + }, + { + "epoch": 0.29149291075896583, + "grad_norm": 2.917125940322876, + "learning_rate": 8.095899930507298e-07, + "logits/chosen": -1.2221717834472656, + "logits/rejected": -0.7714150547981262, + "logps/chosen": -845.21630859375, + "logps/rejected": -421.2784423828125, + "loss": 0.0941, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8793387413024902, + "rewards/margins": 3.3897602558135986, + "rewards/rejected": -0.5104213953018188, + "step": 699 + }, + { + "epoch": 0.2919099249374479, + "grad_norm": 1.153072714805603, + "learning_rate": 8.107482047718324e-07, + "logits/chosen": -0.3664035201072693, + "logits/rejected": -0.9467067718505859, + "logps/chosen": -842.8419799804688, + "logps/rejected": -510.69110107421875, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.115661144256592, + "rewards/margins": 4.083563804626465, + "rewards/rejected": -0.9679027795791626, + "step": 700 + }, + { + "epoch": 0.29232693911592994, + "grad_norm": 1.5465978384017944, + "learning_rate": 8.119064164929349e-07, + "logits/chosen": -0.8243260979652405, + "logits/rejected": -0.9247199892997742, + "logps/chosen": -1019.0362548828125, + "logps/rejected": -561.4317016601562, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6038451194763184, + "rewards/margins": 4.387948036193848, + "rewards/rejected": -0.7841022610664368, + "step": 701 + }, + { + "epoch": 0.292743953294412, + "grad_norm": 33.15315628051758, + "learning_rate": 8.130646282140375e-07, + "logits/chosen": -0.9549875855445862, + "logits/rejected": -0.8348655700683594, + "logps/chosen": -789.5423583984375, + "logps/rejected": -424.3368225097656, + "loss": 0.1064, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6147851943969727, + "rewards/margins": 3.520475387573242, + "rewards/rejected": -0.9056901931762695, + "step": 702 + }, + { + "epoch": 0.29316096747289405, + "grad_norm": 1.0500364303588867, + "learning_rate": 8.142228399351403e-07, + "logits/chosen": -0.9478426575660706, + "logits/rejected": -0.9047337174415588, + "logps/chosen": -910.130859375, + "logps/rejected": -350.6429443359375, + "loss": 0.1151, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.821021795272827, + "rewards/margins": 3.8222744464874268, + "rewards/rejected": -1.0012527704238892, + "step": 703 + }, + { + "epoch": 0.29357798165137616, + "grad_norm": 1.0743695497512817, + "learning_rate": 8.153810516562428e-07, + "logits/chosen": -0.33147692680358887, + "logits/rejected": -1.0827728509902954, + "logps/chosen": -732.991943359375, + "logps/rejected": -498.92938232421875, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9223170280456543, + "rewards/margins": 4.524954795837402, + "rewards/rejected": -1.602637767791748, + "step": 704 + }, + { + "epoch": 0.2939949958298582, + "grad_norm": 1.6860175132751465, + "learning_rate": 8.165392633773454e-07, + "logits/chosen": -0.6071344614028931, + "logits/rejected": -0.7867093682289124, + "logps/chosen": -905.5206298828125, + "logps/rejected": -558.619140625, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.452101230621338, + "rewards/margins": 4.571294784545898, + "rewards/rejected": -1.1191933155059814, + "step": 705 + }, + { + "epoch": 0.2944120100083403, + "grad_norm": 9.112780570983887, + "learning_rate": 8.176974750984481e-07, + "logits/chosen": -1.2493171691894531, + "logits/rejected": -0.32148629426956177, + "logps/chosen": -1088.8299560546875, + "logps/rejected": -566.4210815429688, + "loss": 0.162, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.645113706588745, + "rewards/margins": 3.487278699874878, + "rewards/rejected": -0.8421649932861328, + "step": 706 + }, + { + "epoch": 0.29482902418682233, + "grad_norm": 1.6253480911254883, + "learning_rate": 8.188556868195507e-07, + "logits/chosen": -0.8237992525100708, + "logits/rejected": -1.107295036315918, + "logps/chosen": -772.7175903320312, + "logps/rejected": -426.8038330078125, + "loss": 0.052, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.321528434753418, + "rewards/margins": 4.244507312774658, + "rewards/rejected": -0.922978937625885, + "step": 707 + }, + { + "epoch": 0.29524603836530444, + "grad_norm": 0.6611251831054688, + "learning_rate": 8.200138985406533e-07, + "logits/chosen": -0.3046683669090271, + "logits/rejected": -0.7843575477600098, + "logps/chosen": -791.3919677734375, + "logps/rejected": -468.6018981933594, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3735854625701904, + "rewards/margins": 4.448781967163086, + "rewards/rejected": -1.0751965045928955, + "step": 708 + }, + { + "epoch": 0.2956630525437865, + "grad_norm": 0.9583315253257751, + "learning_rate": 8.21172110261756e-07, + "logits/chosen": -0.9268222451210022, + "logits/rejected": NaN, + "logps/chosen": -890.0192260742188, + "logps/rejected": -269.09210205078125, + "loss": 0.1186, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.469109058380127, + "rewards/margins": 3.315596342086792, + "rewards/rejected": -0.8464872241020203, + "step": 709 + }, + { + "epoch": 0.29608006672226855, + "grad_norm": 1.8770065307617188, + "learning_rate": 8.223303219828585e-07, + "logits/chosen": -0.4588617980480194, + "logits/rejected": -1.0426967144012451, + "logps/chosen": -1027.719482421875, + "logps/rejected": -544.4545288085938, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4681992530822754, + "rewards/margins": 4.842948913574219, + "rewards/rejected": -1.374750018119812, + "step": 710 + }, + { + "epoch": 0.2964970809007506, + "grad_norm": 4.557132244110107, + "learning_rate": 8.234885337039611e-07, + "logits/chosen": -1.1600285768508911, + "logits/rejected": -0.71152663230896, + "logps/chosen": -1055.401611328125, + "logps/rejected": -671.0565185546875, + "loss": 0.1411, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.601924180984497, + "rewards/margins": 3.0121521949768066, + "rewards/rejected": -0.4102281630039215, + "step": 711 + }, + { + "epoch": 0.2969140950792327, + "grad_norm": 11.238059043884277, + "learning_rate": 8.246467454250639e-07, + "logits/chosen": -0.8595491647720337, + "logits/rejected": -0.617124617099762, + "logps/chosen": -977.273681640625, + "logps/rejected": -677.8731689453125, + "loss": 0.107, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.049149990081787, + "rewards/margins": 3.499211072921753, + "rewards/rejected": -0.4500613510608673, + "step": 712 + }, + { + "epoch": 0.2973311092577148, + "grad_norm": 2.4737396240234375, + "learning_rate": 8.258049571461664e-07, + "logits/chosen": -0.7158533334732056, + "logits/rejected": -0.7006683349609375, + "logps/chosen": -1030.0084228515625, + "logps/rejected": -464.9330749511719, + "loss": 0.0651, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3193588256835938, + "rewards/margins": 4.168511390686035, + "rewards/rejected": -0.8491523861885071, + "step": 713 + }, + { + "epoch": 0.29774812343619683, + "grad_norm": 87.66568756103516, + "learning_rate": 8.26963168867269e-07, + "logits/chosen": -0.2351238876581192, + "logits/rejected": -0.7958122491836548, + "logps/chosen": -937.675048828125, + "logps/rejected": -650.682861328125, + "loss": 0.0957, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5706076622009277, + "rewards/margins": 3.6522862911224365, + "rewards/rejected": -1.0816783905029297, + "step": 714 + }, + { + "epoch": 0.2981651376146789, + "grad_norm": 0.8579132556915283, + "learning_rate": 8.281213805883715e-07, + "logits/chosen": -0.41273605823516846, + "logits/rejected": -1.1658073663711548, + "logps/chosen": -719.7332153320312, + "logps/rejected": -469.5301513671875, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.558349132537842, + "rewards/margins": 4.438024520874023, + "rewards/rejected": -1.8796753883361816, + "step": 715 + }, + { + "epoch": 0.29858215179316094, + "grad_norm": 1.3919587135314941, + "learning_rate": 8.292795923094743e-07, + "logits/chosen": -0.7100697755813599, + "logits/rejected": -0.8709607124328613, + "logps/chosen": -1054.79541015625, + "logps/rejected": -569.888671875, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.698131561279297, + "rewards/margins": 3.7573909759521484, + "rewards/rejected": -1.0592594146728516, + "step": 716 + }, + { + "epoch": 0.29899916597164305, + "grad_norm": 1.7069827318191528, + "learning_rate": 8.304378040305769e-07, + "logits/chosen": -0.5586727857589722, + "logits/rejected": -0.8886260390281677, + "logps/chosen": -868.09033203125, + "logps/rejected": -420.48223876953125, + "loss": 0.0371, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.102052688598633, + "rewards/margins": 4.115756511688232, + "rewards/rejected": -1.01370370388031, + "step": 717 + }, + { + "epoch": 0.2994161801501251, + "grad_norm": 1.5603508949279785, + "learning_rate": 8.315960157516794e-07, + "logits/chosen": -0.31372588872909546, + "logits/rejected": -1.2044098377227783, + "logps/chosen": -733.4849853515625, + "logps/rejected": -391.7495422363281, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8157477378845215, + "rewards/margins": 4.6537933349609375, + "rewards/rejected": -1.838045597076416, + "step": 718 + }, + { + "epoch": 0.29983319432860717, + "grad_norm": 0.856756329536438, + "learning_rate": 8.327542274727821e-07, + "logits/chosen": -0.872563362121582, + "logits/rejected": NaN, + "logps/chosen": -813.7835693359375, + "logps/rejected": -406.3406982421875, + "loss": 0.1065, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1190552711486816, + "rewards/margins": 4.12025785446167, + "rewards/rejected": -1.0012023448944092, + "step": 719 + }, + { + "epoch": 0.3002502085070892, + "grad_norm": 0.8562890291213989, + "learning_rate": 8.339124391938847e-07, + "logits/chosen": -0.8587760329246521, + "logits/rejected": -0.7040319442749023, + "logps/chosen": -1017.5399169921875, + "logps/rejected": -501.9783630371094, + "loss": 0.1017, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.4728240966796875, + "rewards/margins": 4.822940349578857, + "rewards/rejected": -1.3501158952713013, + "step": 720 + }, + { + "epoch": 0.30066722268557133, + "grad_norm": 18.233171463012695, + "learning_rate": 8.350706509149873e-07, + "logits/chosen": -0.18597012758255005, + "logits/rejected": -1.0359680652618408, + "logps/chosen": -1010.93505859375, + "logps/rejected": -529.88134765625, + "loss": 0.0604, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.943120002746582, + "rewards/margins": 4.378110885620117, + "rewards/rejected": -1.4349911212921143, + "step": 721 + }, + { + "epoch": 0.3010842368640534, + "grad_norm": 8.974459648132324, + "learning_rate": 8.3622886263609e-07, + "logits/chosen": -0.6468971967697144, + "logits/rejected": -0.6100889444351196, + "logps/chosen": -1042.210693359375, + "logps/rejected": -557.0152587890625, + "loss": 0.1069, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6096105575561523, + "rewards/margins": 3.681804656982422, + "rewards/rejected": -1.0721943378448486, + "step": 722 + }, + { + "epoch": 0.30150125104253545, + "grad_norm": 4.212851047515869, + "learning_rate": 8.373870743571926e-07, + "logits/chosen": -1.238958477973938, + "logits/rejected": NaN, + "logps/chosen": -886.9922485351562, + "logps/rejected": -285.79095458984375, + "loss": 0.1488, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0844039916992188, + "rewards/margins": 2.7447173595428467, + "rewards/rejected": -0.6603134274482727, + "step": 723 + }, + { + "epoch": 0.3019182652210175, + "grad_norm": 0.8870864510536194, + "learning_rate": 8.385452860782951e-07, + "logits/chosen": -0.9098430275917053, + "logits/rejected": -1.124593734741211, + "logps/chosen": -1033.290771484375, + "logps/rejected": -475.55615234375, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8770546913146973, + "rewards/margins": 5.452757835388184, + "rewards/rejected": -1.5757033824920654, + "step": 724 + }, + { + "epoch": 0.30233527939949956, + "grad_norm": 0.793158769607544, + "learning_rate": 8.397034977993979e-07, + "logits/chosen": -0.9199268817901611, + "logits/rejected": -0.9038375616073608, + "logps/chosen": -763.145263671875, + "logps/rejected": -321.6328430175781, + "loss": 0.0989, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.420822858810425, + "rewards/margins": 4.1555562019348145, + "rewards/rejected": -1.734732985496521, + "step": 725 + }, + { + "epoch": 0.30275229357798167, + "grad_norm": 2.0529580116271973, + "learning_rate": 8.408617095205005e-07, + "logits/chosen": -1.2549889087677002, + "logits/rejected": -0.7337185144424438, + "logps/chosen": -955.87890625, + "logps/rejected": -513.1775512695312, + "loss": 0.0737, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8253564834594727, + "rewards/margins": 4.067018032073975, + "rewards/rejected": -0.24166183173656464, + "step": 726 + }, + { + "epoch": 0.3031693077564637, + "grad_norm": 2.911007881164551, + "learning_rate": 8.42019921241603e-07, + "logits/chosen": -0.7778558135032654, + "logits/rejected": -0.848534345626831, + "logps/chosen": -758.051513671875, + "logps/rejected": -530.8515625, + "loss": 0.0911, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2914395332336426, + "rewards/margins": 3.543973684310913, + "rewards/rejected": -1.2525341510772705, + "step": 727 + }, + { + "epoch": 0.3035863219349458, + "grad_norm": 2.225968599319458, + "learning_rate": 8.431781329627056e-07, + "logits/chosen": -1.6987321376800537, + "logits/rejected": NaN, + "logps/chosen": -915.2614135742188, + "logps/rejected": -419.60699462890625, + "loss": 0.1717, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.498532295227051, + "rewards/margins": 2.835531234741211, + "rewards/rejected": -0.33699914813041687, + "step": 728 + }, + { + "epoch": 0.30400333611342784, + "grad_norm": 30.329219818115234, + "learning_rate": 8.443363446838082e-07, + "logits/chosen": -0.6849836707115173, + "logits/rejected": -0.49754762649536133, + "logps/chosen": -950.8651733398438, + "logps/rejected": -540.7772216796875, + "loss": 0.1699, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2793078422546387, + "rewards/margins": 2.6704554557800293, + "rewards/rejected": -0.39114782214164734, + "step": 729 + }, + { + "epoch": 0.30442035029190995, + "grad_norm": 15.711411476135254, + "learning_rate": 8.454945564049109e-07, + "logits/chosen": -0.6410975456237793, + "logits/rejected": -0.7780447006225586, + "logps/chosen": -1219.5904541015625, + "logps/rejected": -644.7037963867188, + "loss": 0.1193, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.33735990524292, + "rewards/margins": 3.9567136764526367, + "rewards/rejected": -1.619354009628296, + "step": 730 + }, + { + "epoch": 0.304837364470392, + "grad_norm": 0.6789073944091797, + "learning_rate": 8.466527681260135e-07, + "logits/chosen": -0.47173693776130676, + "logits/rejected": -1.1109914779663086, + "logps/chosen": -804.7252197265625, + "logps/rejected": -451.7302551269531, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.422912836074829, + "rewards/margins": 5.3976826667785645, + "rewards/rejected": -1.9747698307037354, + "step": 731 + }, + { + "epoch": 0.30525437864887406, + "grad_norm": 1.671883225440979, + "learning_rate": 8.478109798471161e-07, + "logits/chosen": -0.4184814989566803, + "logits/rejected": -0.8088828921318054, + "logps/chosen": -1043.605712890625, + "logps/rejected": -527.428466796875, + "loss": 0.0476, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5700318813323975, + "rewards/margins": 3.9027812480926514, + "rewards/rejected": -1.332749366760254, + "step": 732 + }, + { + "epoch": 0.3056713928273561, + "grad_norm": 1.1642073392868042, + "learning_rate": 8.489691915682187e-07, + "logits/chosen": -0.08837386965751648, + "logits/rejected": -0.9661691784858704, + "logps/chosen": -680.7479248046875, + "logps/rejected": -397.3768310546875, + "loss": 0.0376, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3555893898010254, + "rewards/margins": 4.408527374267578, + "rewards/rejected": -2.052938222885132, + "step": 733 + }, + { + "epoch": 0.3060884070058382, + "grad_norm": 0.9694619178771973, + "learning_rate": 8.501274032893214e-07, + "logits/chosen": -0.2194950133562088, + "logits/rejected": -1.1086682081222534, + "logps/chosen": -678.1859741210938, + "logps/rejected": -411.7095642089844, + "loss": 0.0233, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6407124996185303, + "rewards/margins": 4.775157928466797, + "rewards/rejected": -2.134445905685425, + "step": 734 + }, + { + "epoch": 0.3065054211843203, + "grad_norm": 0.7924447059631348, + "learning_rate": 8.51285615010424e-07, + "logits/chosen": -0.2842704653739929, + "logits/rejected": -1.1426997184753418, + "logps/chosen": -838.1646728515625, + "logps/rejected": -440.19183349609375, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.830138683319092, + "rewards/margins": 5.258157253265381, + "rewards/rejected": -2.4280190467834473, + "step": 735 + }, + { + "epoch": 0.30692243536280234, + "grad_norm": 1.8849058151245117, + "learning_rate": 8.524438267315266e-07, + "logits/chosen": -0.609952449798584, + "logits/rejected": -0.9074808359146118, + "logps/chosen": -940.3871459960938, + "logps/rejected": -576.6096801757812, + "loss": 0.0477, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1706016063690186, + "rewards/margins": 4.707330703735352, + "rewards/rejected": -1.5367287397384644, + "step": 736 + }, + { + "epoch": 0.3073394495412844, + "grad_norm": 0.9441521763801575, + "learning_rate": 8.536020384526292e-07, + "logits/chosen": -0.6756967306137085, + "logits/rejected": -0.9428300857543945, + "logps/chosen": -1034.5308837890625, + "logps/rejected": -516.3929443359375, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2282631397247314, + "rewards/margins": 4.7183146476745605, + "rewards/rejected": -1.4900516271591187, + "step": 737 + }, + { + "epoch": 0.30775646371976645, + "grad_norm": 1.0409454107284546, + "learning_rate": 8.547602501737317e-07, + "logits/chosen": -0.7238202095031738, + "logits/rejected": NaN, + "logps/chosen": -810.3989868164062, + "logps/rejected": -283.22052001953125, + "loss": 0.1121, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9177191257476807, + "rewards/margins": 4.142874717712402, + "rewards/rejected": -1.2251554727554321, + "step": 738 + }, + { + "epoch": 0.30817347789824856, + "grad_norm": 1.5766559839248657, + "learning_rate": 8.559184618948345e-07, + "logits/chosen": -0.3796844482421875, + "logits/rejected": -1.0110608339309692, + "logps/chosen": -907.6445922851562, + "logps/rejected": -535.6018676757812, + "loss": 0.0437, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1750571727752686, + "rewards/margins": 5.00908899307251, + "rewards/rejected": -1.8340318202972412, + "step": 739 + }, + { + "epoch": 0.3085904920767306, + "grad_norm": 2.399120330810547, + "learning_rate": 8.57076673615937e-07, + "logits/chosen": -0.5681876540184021, + "logits/rejected": -0.6075429916381836, + "logps/chosen": -1230.56591796875, + "logps/rejected": -773.8480224609375, + "loss": 0.0451, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4133706092834473, + "rewards/margins": 4.190212249755859, + "rewards/rejected": -0.776841402053833, + "step": 740 + }, + { + "epoch": 0.30900750625521267, + "grad_norm": 0.6514347791671753, + "learning_rate": 8.582348853370396e-07, + "logits/chosen": -0.16138088703155518, + "logits/rejected": -0.8874399662017822, + "logps/chosen": -839.1268310546875, + "logps/rejected": -561.13525390625, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9273126125335693, + "rewards/margins": 5.208553314208984, + "rewards/rejected": -2.281240940093994, + "step": 741 + }, + { + "epoch": 0.3094245204336947, + "grad_norm": 1.2707401514053345, + "learning_rate": 8.593930970581423e-07, + "logits/chosen": -0.3596419095993042, + "logits/rejected": -1.2172266244888306, + "logps/chosen": -775.1124267578125, + "logps/rejected": -456.1535339355469, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1152820587158203, + "rewards/margins": 5.258018970489502, + "rewards/rejected": -2.1427366733551025, + "step": 742 + }, + { + "epoch": 0.30984153461217684, + "grad_norm": 34.6884765625, + "learning_rate": 8.605513087792449e-07, + "logits/chosen": -0.6243309378623962, + "logits/rejected": -0.8001326322555542, + "logps/chosen": -966.7570190429688, + "logps/rejected": -592.6263427734375, + "loss": 0.0879, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8731980323791504, + "rewards/margins": 4.92562198638916, + "rewards/rejected": -2.0524239540100098, + "step": 743 + }, + { + "epoch": 0.3102585487906589, + "grad_norm": 5.627908706665039, + "learning_rate": 8.617095205003476e-07, + "logits/chosen": -1.15355384349823, + "logits/rejected": -0.7155618667602539, + "logps/chosen": -964.3158569335938, + "logps/rejected": -627.7289428710938, + "loss": 0.104, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7810988426208496, + "rewards/margins": 4.481237411499023, + "rewards/rejected": -1.7001385688781738, + "step": 744 + }, + { + "epoch": 0.31067556296914095, + "grad_norm": 0.5163549780845642, + "learning_rate": 8.628677322214502e-07, + "logits/chosen": -0.23586276173591614, + "logits/rejected": -1.0350360870361328, + "logps/chosen": -810.0313110351562, + "logps/rejected": -428.1730651855469, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.662163257598877, + "rewards/margins": 5.3498640060424805, + "rewards/rejected": -1.6877011060714722, + "step": 745 + }, + { + "epoch": 0.311092577147623, + "grad_norm": 21.183578491210938, + "learning_rate": 8.640259439425527e-07, + "logits/chosen": -0.853206217288971, + "logits/rejected": -0.7464024424552917, + "logps/chosen": -924.4681396484375, + "logps/rejected": -572.5872802734375, + "loss": 0.069, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.385301113128662, + "rewards/margins": 4.678724765777588, + "rewards/rejected": -1.2934238910675049, + "step": 746 + }, + { + "epoch": 0.31150959132610506, + "grad_norm": 2.186302423477173, + "learning_rate": 8.651841556636553e-07, + "logits/chosen": -0.49565574526786804, + "logits/rejected": -0.6336132287979126, + "logps/chosen": -856.80859375, + "logps/rejected": -570.90087890625, + "loss": 0.0697, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.523777961730957, + "rewards/margins": 3.808321475982666, + "rewards/rejected": -1.2845436334609985, + "step": 747 + }, + { + "epoch": 0.3119266055045872, + "grad_norm": 1.1452693939208984, + "learning_rate": 8.663423673847581e-07, + "logits/chosen": 0.026082012802362442, + "logits/rejected": -0.9391926527023315, + "logps/chosen": -960.1495361328125, + "logps/rejected": -539.18896484375, + "loss": 0.0223, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4152636528015137, + "rewards/margins": 4.7094316482543945, + "rewards/rejected": -2.29416823387146, + "step": 748 + }, + { + "epoch": 0.31234361968306923, + "grad_norm": 0.45655345916748047, + "learning_rate": 8.675005791058606e-07, + "logits/chosen": -1.2876330614089966, + "logits/rejected": NaN, + "logps/chosen": -1095.7806396484375, + "logps/rejected": -335.44622802734375, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.965546131134033, + "rewards/margins": 5.623880863189697, + "rewards/rejected": -1.658334732055664, + "step": 749 + }, + { + "epoch": 0.3127606338615513, + "grad_norm": 1.161063313484192, + "learning_rate": 8.686587908269632e-07, + "logits/chosen": -0.583557665348053, + "logits/rejected": -0.8800214529037476, + "logps/chosen": -907.5145874023438, + "logps/rejected": -523.6499633789062, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1288018226623535, + "rewards/margins": 4.397703647613525, + "rewards/rejected": -1.2689017057418823, + "step": 750 + }, + { + "epoch": 0.31317764804003334, + "grad_norm": 1.025225281715393, + "learning_rate": 8.698170025480659e-07, + "logits/chosen": -0.7228721976280212, + "logits/rejected": -0.8188115954399109, + "logps/chosen": -911.5072021484375, + "logps/rejected": -494.3170166015625, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.573184013366699, + "rewards/margins": 5.554881572723389, + "rewards/rejected": -1.9816970825195312, + "step": 751 + }, + { + "epoch": 0.31359466221851545, + "grad_norm": 0.6657102704048157, + "learning_rate": 8.709752142691685e-07, + "logits/chosen": -0.3925830125808716, + "logits/rejected": -1.365187644958496, + "logps/chosen": -672.3681640625, + "logps/rejected": -376.28155517578125, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.441612958908081, + "rewards/margins": 4.376472473144531, + "rewards/rejected": -1.9348597526550293, + "step": 752 + }, + { + "epoch": 0.3140116763969975, + "grad_norm": 1.4375520944595337, + "learning_rate": 8.721334259902711e-07, + "logits/chosen": -0.5138831734657288, + "logits/rejected": -0.7106859683990479, + "logps/chosen": -986.78076171875, + "logps/rejected": -626.512939453125, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.89927077293396, + "rewards/margins": 5.019356727600098, + "rewards/rejected": -2.1200859546661377, + "step": 753 + }, + { + "epoch": 0.31442869057547956, + "grad_norm": 17.653810501098633, + "learning_rate": 8.732916377113738e-07, + "logits/chosen": -1.090025544166565, + "logits/rejected": -0.4062599241733551, + "logps/chosen": -996.791259765625, + "logps/rejected": -588.6845703125, + "loss": 0.164, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3073768615722656, + "rewards/margins": 3.973219394683838, + "rewards/rejected": -0.6658426523208618, + "step": 754 + }, + { + "epoch": 0.3148457047539616, + "grad_norm": 0.9972743391990662, + "learning_rate": 8.744498494324763e-07, + "logits/chosen": -0.7229777574539185, + "logits/rejected": -0.5977667570114136, + "logps/chosen": -840.2335205078125, + "logps/rejected": -380.05926513671875, + "loss": 0.1089, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8339929580688477, + "rewards/margins": 4.260758399963379, + "rewards/rejected": -1.4267654418945312, + "step": 755 + }, + { + "epoch": 0.31526271893244373, + "grad_norm": 1.5003228187561035, + "learning_rate": 8.756080611535789e-07, + "logits/chosen": -0.417793333530426, + "logits/rejected": -1.3243563175201416, + "logps/chosen": -798.0966186523438, + "logps/rejected": -355.4105529785156, + "loss": 0.0241, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8057780265808105, + "rewards/margins": 7.029495716094971, + "rewards/rejected": -3.22371768951416, + "step": 756 + }, + { + "epoch": 0.3156797331109258, + "grad_norm": 1.1993805170059204, + "learning_rate": 8.767662728746816e-07, + "logits/chosen": -0.6332541704177856, + "logits/rejected": -1.0612430572509766, + "logps/chosen": -836.6159057617188, + "logps/rejected": -445.05047607421875, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8510141372680664, + "rewards/margins": 4.391094207763672, + "rewards/rejected": -1.5400803089141846, + "step": 757 + }, + { + "epoch": 0.31609674728940784, + "grad_norm": 0.8597452044487, + "learning_rate": 8.779244845957842e-07, + "logits/chosen": -0.5041981935501099, + "logits/rejected": -0.7996268272399902, + "logps/chosen": -1048.6082763671875, + "logps/rejected": -432.5263671875, + "loss": 0.1001, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.811117649078369, + "rewards/margins": 5.287683486938477, + "rewards/rejected": -2.4765655994415283, + "step": 758 + }, + { + "epoch": 0.3165137614678899, + "grad_norm": 1.584093689918518, + "learning_rate": 8.790826963168868e-07, + "logits/chosen": -0.9893640279769897, + "logits/rejected": -0.9058552980422974, + "logps/chosen": -901.2923583984375, + "logps/rejected": -507.4869079589844, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.016448497772217, + "rewards/margins": 4.432081699371338, + "rewards/rejected": -1.415632963180542, + "step": 759 + }, + { + "epoch": 0.31693077564637195, + "grad_norm": 0.5010586977005005, + "learning_rate": 8.802409080379893e-07, + "logits/chosen": -0.8955177664756775, + "logits/rejected": -0.9155671000480652, + "logps/chosen": -898.803955078125, + "logps/rejected": -475.0334777832031, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.193852424621582, + "rewards/margins": 5.6018524169921875, + "rewards/rejected": -1.4080003499984741, + "step": 760 + }, + { + "epoch": 0.31734778982485407, + "grad_norm": 4.9953765869140625, + "learning_rate": 8.813991197590921e-07, + "logits/chosen": -1.1544028520584106, + "logits/rejected": -0.7323976755142212, + "logps/chosen": -1048.6640625, + "logps/rejected": -688.2535400390625, + "loss": 0.0729, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3619885444641113, + "rewards/margins": 4.034290790557861, + "rewards/rejected": -0.6723023056983948, + "step": 761 + }, + { + "epoch": 0.3177648040033361, + "grad_norm": 0.5207752585411072, + "learning_rate": 8.825573314801947e-07, + "logits/chosen": -0.07623571157455444, + "logits/rejected": -1.1575932502746582, + "logps/chosen": -917.3660888671875, + "logps/rejected": -466.0910949707031, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1788742542266846, + "rewards/margins": 5.730085849761963, + "rewards/rejected": -2.551211357116699, + "step": 762 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 19.263742446899414, + "learning_rate": 8.837155432012972e-07, + "logits/chosen": -1.2915318012237549, + "logits/rejected": -0.5202928185462952, + "logps/chosen": -1049.53759765625, + "logps/rejected": -558.7322387695312, + "loss": 0.1347, + "rewards/accuracies": 0.875, + "rewards/chosen": 4.263453006744385, + "rewards/margins": 5.353590488433838, + "rewards/rejected": -1.0901377201080322, + "step": 763 + }, + { + "epoch": 0.31859883236030023, + "grad_norm": 9.030179023742676, + "learning_rate": 8.848737549223999e-07, + "logits/chosen": -1.3576257228851318, + "logits/rejected": NaN, + "logps/chosen": -808.0367431640625, + "logps/rejected": -360.1929931640625, + "loss": 0.2426, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.1691887378692627, + "rewards/margins": 3.1205127239227295, + "rewards/rejected": -0.9513238668441772, + "step": 764 + }, + { + "epoch": 0.31901584653878234, + "grad_norm": 2.6086552143096924, + "learning_rate": 8.860319666435025e-07, + "logits/chosen": -0.3046741485595703, + "logits/rejected": -0.7356085777282715, + "logps/chosen": -799.2251586914062, + "logps/rejected": -524.0379638671875, + "loss": 0.0484, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3506011962890625, + "rewards/margins": 4.7549543380737305, + "rewards/rejected": -2.404353141784668, + "step": 765 + }, + { + "epoch": 0.3194328607172644, + "grad_norm": 1.7161815166473389, + "learning_rate": 8.871901783646051e-07, + "logits/chosen": -0.23001635074615479, + "logits/rejected": -0.9949241876602173, + "logps/chosen": -782.0280151367188, + "logps/rejected": -489.9432373046875, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3813774585723877, + "rewards/margins": 4.0297346115112305, + "rewards/rejected": -1.6483572721481323, + "step": 766 + }, + { + "epoch": 0.31984987489574646, + "grad_norm": 0.6653962135314941, + "learning_rate": 8.883483900857078e-07, + "logits/chosen": -0.39989084005355835, + "logits/rejected": -1.0176637172698975, + "logps/chosen": -822.5221557617188, + "logps/rejected": -561.1429443359375, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5314626693725586, + "rewards/margins": 4.8563032150268555, + "rewards/rejected": -2.324840545654297, + "step": 767 + }, + { + "epoch": 0.3202668890742285, + "grad_norm": 11.978507995605469, + "learning_rate": 8.895066018068104e-07, + "logits/chosen": -1.0183825492858887, + "logits/rejected": -0.750690221786499, + "logps/chosen": -1284.5303955078125, + "logps/rejected": -727.1544189453125, + "loss": 0.0568, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.403820037841797, + "rewards/margins": 5.443722724914551, + "rewards/rejected": -2.039902925491333, + "step": 768 + }, + { + "epoch": 0.32068390325271057, + "grad_norm": 56.441036224365234, + "learning_rate": 8.906648135279129e-07, + "logits/chosen": -1.8849568367004395, + "logits/rejected": NaN, + "logps/chosen": -1048.628173828125, + "logps/rejected": -506.686279296875, + "loss": 0.2323, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.14269757270813, + "rewards/margins": 3.689361333847046, + "rewards/rejected": -0.5466636419296265, + "step": 769 + }, + { + "epoch": 0.3211009174311927, + "grad_norm": 1.8240649700164795, + "learning_rate": 8.918230252490157e-07, + "logits/chosen": -0.5134289860725403, + "logits/rejected": -0.883049488067627, + "logps/chosen": -1134.1602783203125, + "logps/rejected": -603.7987670898438, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.684987783432007, + "rewards/margins": 4.161540985107422, + "rewards/rejected": -1.4765533208847046, + "step": 770 + }, + { + "epoch": 0.32151793160967473, + "grad_norm": 1.1158115863800049, + "learning_rate": 8.929812369701182e-07, + "logits/chosen": -0.6384787559509277, + "logits/rejected": -0.788710355758667, + "logps/chosen": -1021.4607543945312, + "logps/rejected": -574.1055908203125, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.73626708984375, + "rewards/margins": 6.041558742523193, + "rewards/rejected": -2.3052916526794434, + "step": 771 + }, + { + "epoch": 0.3219349457881568, + "grad_norm": 0.5189290642738342, + "learning_rate": 8.941394486912208e-07, + "logits/chosen": -0.6092037558555603, + "logits/rejected": -0.9890626668930054, + "logps/chosen": -780.1041259765625, + "logps/rejected": -487.48370361328125, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.421586036682129, + "rewards/margins": 5.341172218322754, + "rewards/rejected": -1.919586420059204, + "step": 772 + }, + { + "epoch": 0.32235195996663885, + "grad_norm": 1.6071184873580933, + "learning_rate": 8.952976604123234e-07, + "logits/chosen": -0.7495828866958618, + "logits/rejected": -1.1717936992645264, + "logps/chosen": -654.9177856445312, + "logps/rejected": -373.14385986328125, + "loss": 0.0405, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.927735447883606, + "rewards/margins": 4.852870941162109, + "rewards/rejected": -2.925135612487793, + "step": 773 + }, + { + "epoch": 0.32276897414512096, + "grad_norm": 6.41335916519165, + "learning_rate": 8.96455872133426e-07, + "logits/chosen": -0.8931018710136414, + "logits/rejected": -0.6783514022827148, + "logps/chosen": -939.7801513671875, + "logps/rejected": -566.1155395507812, + "loss": 0.0855, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3349251747131348, + "rewards/margins": 5.128380298614502, + "rewards/rejected": -1.7934552431106567, + "step": 774 + }, + { + "epoch": 0.323185988323603, + "grad_norm": 0.978864312171936, + "learning_rate": 8.976140838545287e-07, + "logits/chosen": -0.3537243902683258, + "logits/rejected": -0.7187613844871521, + "logps/chosen": -1024.04931640625, + "logps/rejected": -688.1065673828125, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2155704498291016, + "rewards/margins": 4.533926010131836, + "rewards/rejected": -1.3183555603027344, + "step": 775 + }, + { + "epoch": 0.32360300250208507, + "grad_norm": 0.9938160181045532, + "learning_rate": 8.987722955756313e-07, + "logits/chosen": -0.28136032819747925, + "logits/rejected": -0.9997270703315735, + "logps/chosen": -1042.78271484375, + "logps/rejected": -545.0120849609375, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.327178955078125, + "rewards/margins": 6.580321311950684, + "rewards/rejected": -3.2531425952911377, + "step": 776 + }, + { + "epoch": 0.3240200166805671, + "grad_norm": 1.3336081504821777, + "learning_rate": 8.999305072967339e-07, + "logits/chosen": -0.2922314703464508, + "logits/rejected": -0.7008571028709412, + "logps/chosen": -1089.712646484375, + "logps/rejected": -697.3277587890625, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3788812160491943, + "rewards/margins": 5.18662166595459, + "rewards/rejected": -1.8077404499053955, + "step": 777 + }, + { + "epoch": 0.3244370308590492, + "grad_norm": 1.7028926610946655, + "learning_rate": 9.010887190178365e-07, + "logits/chosen": -0.6164554357528687, + "logits/rejected": -0.7555915713310242, + "logps/chosen": -1000.7896118164062, + "logps/rejected": -622.096923828125, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.285421848297119, + "rewards/margins": 3.433682441711426, + "rewards/rejected": -1.1482603549957275, + "step": 778 + }, + { + "epoch": 0.3248540450375313, + "grad_norm": 117.65081787109375, + "learning_rate": 9.022469307389392e-07, + "logits/chosen": -0.7185012698173523, + "logits/rejected": -0.7287086248397827, + "logps/chosen": -987.4775390625, + "logps/rejected": -661.9227905273438, + "loss": 0.0962, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.032565116882324, + "rewards/margins": 4.357406139373779, + "rewards/rejected": -1.3248409032821655, + "step": 779 + }, + { + "epoch": 0.32527105921601335, + "grad_norm": 5.027904033660889, + "learning_rate": 9.034051424600418e-07, + "logits/chosen": -0.6866406798362732, + "logits/rejected": -0.798186719417572, + "logps/chosen": -1117.471923828125, + "logps/rejected": -654.20654296875, + "loss": 0.0726, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.601243495941162, + "rewards/margins": 6.253108978271484, + "rewards/rejected": -2.651865005493164, + "step": 780 + }, + { + "epoch": 0.3256880733944954, + "grad_norm": 2.9056148529052734, + "learning_rate": 9.045633541811444e-07, + "logits/chosen": -1.0973331928253174, + "logits/rejected": -0.792086124420166, + "logps/chosen": -909.6199951171875, + "logps/rejected": -499.38812255859375, + "loss": 0.091, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9113333225250244, + "rewards/margins": 5.026923656463623, + "rewards/rejected": -1.1155906915664673, + "step": 781 + }, + { + "epoch": 0.32610508757297746, + "grad_norm": 1.8610360622406006, + "learning_rate": 9.05721565902247e-07, + "logits/chosen": -0.5788887143135071, + "logits/rejected": -0.893035888671875, + "logps/chosen": -905.3232421875, + "logps/rejected": -543.7236938476562, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5703351497650146, + "rewards/margins": 4.506134510040283, + "rewards/rejected": -1.9357993602752686, + "step": 782 + }, + { + "epoch": 0.32652210175145957, + "grad_norm": 1.4102493524551392, + "learning_rate": 9.068797776233495e-07, + "logits/chosen": -0.643222987651825, + "logits/rejected": -0.9339867830276489, + "logps/chosen": -1004.9195556640625, + "logps/rejected": -573.5598754882812, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1915841102600098, + "rewards/margins": 5.904484748840332, + "rewards/rejected": -2.7129006385803223, + "step": 783 + }, + { + "epoch": 0.3269391159299416, + "grad_norm": 2.5425479412078857, + "learning_rate": 9.080379893444523e-07, + "logits/chosen": -0.8167959451675415, + "logits/rejected": -1.0551121234893799, + "logps/chosen": -756.0317993164062, + "logps/rejected": -430.5360107421875, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.934839963912964, + "rewards/margins": 4.7681803703308105, + "rewards/rejected": -1.8333404064178467, + "step": 784 + }, + { + "epoch": 0.3273561301084237, + "grad_norm": 0.42624422907829285, + "learning_rate": 9.091962010655548e-07, + "logits/chosen": -0.48641759157180786, + "logits/rejected": -1.1949657201766968, + "logps/chosen": -848.6430053710938, + "logps/rejected": -471.8353576660156, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.902517080307007, + "rewards/margins": 6.00065279006958, + "rewards/rejected": -3.098135471343994, + "step": 785 + }, + { + "epoch": 0.32777314428690574, + "grad_norm": 0.6933255195617676, + "learning_rate": 9.103544127866574e-07, + "logits/chosen": -0.7129107713699341, + "logits/rejected": -0.9817930459976196, + "logps/chosen": -1253.135009765625, + "logps/rejected": -668.2852783203125, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.249969482421875, + "rewards/margins": 6.744891166687012, + "rewards/rejected": -2.4949216842651367, + "step": 786 + }, + { + "epoch": 0.32819015846538785, + "grad_norm": 0.7722011208534241, + "learning_rate": 9.115126245077601e-07, + "logits/chosen": -0.1022772490978241, + "logits/rejected": -1.0910495519638062, + "logps/chosen": -777.2555541992188, + "logps/rejected": -466.14501953125, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2668368816375732, + "rewards/margins": 5.477089881896973, + "rewards/rejected": -2.2102534770965576, + "step": 787 + }, + { + "epoch": 0.3286071726438699, + "grad_norm": 1.0695912837982178, + "learning_rate": 9.126708362288627e-07, + "logits/chosen": -0.2192329466342926, + "logits/rejected": -1.0688375234603882, + "logps/chosen": -864.9066162109375, + "logps/rejected": -499.8095397949219, + "loss": 0.082, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9469490051269531, + "rewards/margins": 5.130863666534424, + "rewards/rejected": -3.18391489982605, + "step": 788 + }, + { + "epoch": 0.32902418682235196, + "grad_norm": 1.7828694581985474, + "learning_rate": 9.138290479499654e-07, + "logits/chosen": -0.9653950333595276, + "logits/rejected": -0.5897153615951538, + "logps/chosen": -1012.6570434570312, + "logps/rejected": -564.6229858398438, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.256986141204834, + "rewards/margins": 5.876895904541016, + "rewards/rejected": -1.6199092864990234, + "step": 789 + }, + { + "epoch": 0.329441201000834, + "grad_norm": 0.1979788988828659, + "learning_rate": 9.14987259671068e-07, + "logits/chosen": -0.35013940930366516, + "logits/rejected": -1.054707646369934, + "logps/chosen": -870.935791015625, + "logps/rejected": -564.0841674804688, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2198565006256104, + "rewards/margins": 6.592498779296875, + "rewards/rejected": -3.3726425170898438, + "step": 790 + }, + { + "epoch": 0.32985821517931607, + "grad_norm": 0.2599593698978424, + "learning_rate": 9.161454713921705e-07, + "logits/chosen": -1.25644850730896, + "logits/rejected": NaN, + "logps/chosen": -956.0512084960938, + "logps/rejected": -302.0899658203125, + "loss": 0.1757, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.171377420425415, + "rewards/margins": 4.955552101135254, + "rewards/rejected": -1.7841743230819702, + "step": 791 + }, + { + "epoch": 0.3302752293577982, + "grad_norm": 2.2945258617401123, + "learning_rate": 9.173036831132731e-07, + "logits/chosen": -0.7138734459877014, + "logits/rejected": -0.899302065372467, + "logps/chosen": -925.9655151367188, + "logps/rejected": -612.1957397460938, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5079712867736816, + "rewards/margins": 6.027793884277344, + "rewards/rejected": -2.519822597503662, + "step": 792 + }, + { + "epoch": 0.33069224353628024, + "grad_norm": 131.70701599121094, + "learning_rate": 9.184618948343759e-07, + "logits/chosen": -1.5362334251403809, + "logits/rejected": -0.6502158641815186, + "logps/chosen": -1089.036376953125, + "logps/rejected": -588.7554321289062, + "loss": 0.1317, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.171149492263794, + "rewards/margins": 3.9477829933166504, + "rewards/rejected": -0.7766334414482117, + "step": 793 + }, + { + "epoch": 0.3311092577147623, + "grad_norm": 35.41465377807617, + "learning_rate": 9.196201065554784e-07, + "logits/chosen": -0.661848783493042, + "logits/rejected": -0.6477712392807007, + "logps/chosen": -965.2376708984375, + "logps/rejected": -562.81298828125, + "loss": 0.1625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.987924098968506, + "rewards/margins": 5.306061744689941, + "rewards/rejected": -2.3181376457214355, + "step": 794 + }, + { + "epoch": 0.33152627189324435, + "grad_norm": 0.1958695352077484, + "learning_rate": 9.20778318276581e-07, + "logits/chosen": -0.45575064420700073, + "logits/rejected": -1.4286383390426636, + "logps/chosen": -757.6196899414062, + "logps/rejected": -424.5440368652344, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.502028465270996, + "rewards/margins": 6.849234104156494, + "rewards/rejected": -4.347206115722656, + "step": 795 + }, + { + "epoch": 0.33194328607172646, + "grad_norm": 0.8136610984802246, + "learning_rate": 9.219365299976837e-07, + "logits/chosen": -0.044421710073947906, + "logits/rejected": -1.1629042625427246, + "logps/chosen": -753.831298828125, + "logps/rejected": -400.03857421875, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0912253856658936, + "rewards/margins": 5.463101387023926, + "rewards/rejected": -3.371875762939453, + "step": 796 + }, + { + "epoch": 0.3323603002502085, + "grad_norm": 0.6434449553489685, + "learning_rate": 9.230947417187863e-07, + "logits/chosen": -0.1026993840932846, + "logits/rejected": -0.7977926731109619, + "logps/chosen": -995.4454956054688, + "logps/rejected": -621.8523559570312, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3399484157562256, + "rewards/margins": 5.772470474243164, + "rewards/rejected": -2.4325218200683594, + "step": 797 + }, + { + "epoch": 0.3327773144286906, + "grad_norm": 8.965337753295898, + "learning_rate": 9.242529534398889e-07, + "logits/chosen": -1.535846471786499, + "logits/rejected": -0.6008101105690002, + "logps/chosen": -1248.24609375, + "logps/rejected": -762.42724609375, + "loss": 0.1648, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.460240364074707, + "rewards/margins": 3.539628505706787, + "rewards/rejected": -1.0793880224227905, + "step": 798 + }, + { + "epoch": 0.33319432860717263, + "grad_norm": 4.700278282165527, + "learning_rate": 9.254111651609915e-07, + "logits/chosen": -0.9412198662757874, + "logits/rejected": -0.7176629900932312, + "logps/chosen": -1192.577392578125, + "logps/rejected": -702.593994140625, + "loss": 0.0827, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.104191303253174, + "rewards/margins": 4.991684913635254, + "rewards/rejected": -1.88749361038208, + "step": 799 + }, + { + "epoch": 0.3336113427856547, + "grad_norm": 0.23061853647232056, + "learning_rate": 9.265693768820941e-07, + "logits/chosen": -0.28676533699035645, + "logits/rejected": -1.204689860343933, + "logps/chosen": -872.7047119140625, + "logps/rejected": -562.8673095703125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.927873373031616, + "rewards/margins": 6.644464492797852, + "rewards/rejected": -3.7165915966033936, + "step": 800 + } + ], + "logging_steps": 1, + "max_steps": 7194, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}