diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7521 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.925925925925926, + "eval_steps": 1, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011851851851851851, + "grad_norm": 30.633439415390697, + "learning_rate": 7.352941176470588e-09, + "logits/chosen": -1.1390000581741333, + "logits/rejected": -1.004213571548462, + "logps/chosen": -27.46249008178711, + "logps/rejected": -40.97970962524414, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.023703703703703703, + "grad_norm": 30.560266192737977, + "learning_rate": 1.4705882352941176e-08, + "logits/chosen": -0.9409990310668945, + "logits/rejected": -1.0981616973876953, + "logps/chosen": -25.160219192504883, + "logps/rejected": -37.994651794433594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 29.12506200104364, + "learning_rate": 2.2058823529411764e-08, + "logits/chosen": -1.0592122077941895, + "logits/rejected": -1.023957371711731, + "logps/chosen": -24.85056495666504, + "logps/rejected": -33.17691421508789, + "loss": 0.6958, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.015119694173336029, + "rewards/margins": 0.03308585658669472, + "rewards/rejected": -0.01796616055071354, + "step": 3 + }, + { + "epoch": 0.047407407407407405, + "grad_norm": 30.746154156018388, + "learning_rate": 2.941176470588235e-08, + "logits/chosen": -1.0859633684158325, + "logits/rejected": -0.9216127991676331, + "logps/chosen": -27.081607818603516, + "logps/rejected": -31.82309913635254, + "loss": 0.6924, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00960695743560791, + "rewards/margins": 0.043169185519218445, + "rewards/rejected": -0.05277615040540695, + "step": 4 + }, + { + "epoch": 0.05925925925925926, + "grad_norm": 29.519287927141388, + "learning_rate": 3.676470588235294e-08, + "logits/chosen": -1.049912452697754, + "logits/rejected": -1.1279696226119995, + "logps/chosen": -28.21110725402832, + "logps/rejected": -31.672449111938477, + "loss": 0.6936, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.010185590013861656, + "rewards/margins": 0.022015150636434555, + "rewards/rejected": -0.011829562485218048, + "step": 5 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 30.815268471480024, + "learning_rate": 4.411764705882353e-08, + "logits/chosen": -0.9929622411727905, + "logits/rejected": -0.8766285181045532, + "logps/chosen": -33.8680419921875, + "logps/rejected": -32.97846221923828, + "loss": 0.6901, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.01366226002573967, + "rewards/margins": -0.027053195983171463, + "rewards/rejected": 0.013390939682722092, + "step": 6 + }, + { + "epoch": 0.08296296296296296, + "grad_norm": 28.066993035093688, + "learning_rate": 5.147058823529411e-08, + "logits/chosen": -1.0458980798721313, + "logits/rejected": -0.99763023853302, + "logps/chosen": -27.67081069946289, + "logps/rejected": -32.387882232666016, + "loss": 0.6937, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.004842352122068405, + "rewards/margins": -0.027380306273698807, + "rewards/rejected": 0.03222266212105751, + "step": 7 + }, + { + "epoch": 0.09481481481481481, + "grad_norm": 30.97687763376115, + "learning_rate": 5.88235294117647e-08, + "logits/chosen": -0.7906761169433594, + "logits/rejected": -0.9265250563621521, + "logps/chosen": -21.296993255615234, + "logps/rejected": -30.3665771484375, + "loss": 0.6966, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.011183989234268665, + "rewards/margins": -0.038471613079309464, + "rewards/rejected": 0.027287624776363373, + "step": 8 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 30.839253271730197, + "learning_rate": 6.617647058823529e-08, + "logits/chosen": -1.0798628330230713, + "logits/rejected": -0.8085466623306274, + "logps/chosen": -27.84103012084961, + "logps/rejected": -27.858829498291016, + "loss": 0.697, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.00032351166009902954, + "rewards/margins": 0.0009455680847167969, + "rewards/rejected": -0.0006220571231096983, + "step": 9 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 30.290928737637028, + "learning_rate": 7.352941176470588e-08, + "logits/chosen": -0.9101401567459106, + "logits/rejected": -0.8849160671234131, + "logps/chosen": -28.90041160583496, + "logps/rejected": -36.99686050415039, + "loss": 0.7062, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.002118426375091076, + "rewards/margins": -0.00936745386570692, + "rewards/rejected": 0.011485882103443146, + "step": 10 + }, + { + "epoch": 0.13037037037037036, + "grad_norm": 29.947407184192368, + "learning_rate": 8.088235294117647e-08, + "logits/chosen": -0.6956688761711121, + "logits/rejected": -0.6927211284637451, + "logps/chosen": -26.63684844970703, + "logps/rejected": -32.870521545410156, + "loss": 0.6938, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.005509059876203537, + "rewards/margins": -0.01887032575905323, + "rewards/rejected": 0.013361264020204544, + "step": 11 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 28.661495862175222, + "learning_rate": 8.823529411764706e-08, + "logits/chosen": -0.9000818133354187, + "logits/rejected": -0.8827647566795349, + "logps/chosen": -30.33894157409668, + "logps/rejected": -39.22317886352539, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.582557059824467e-05, + "rewards/margins": -0.02260081097483635, + "rewards/rejected": 0.02254498563706875, + "step": 12 + }, + { + "epoch": 0.15407407407407409, + "grad_norm": 28.65418832324272, + "learning_rate": 9.558823529411763e-08, + "logits/chosen": -1.002519130706787, + "logits/rejected": -0.8338276147842407, + "logps/chosen": -22.69075584411621, + "logps/rejected": -28.403766632080078, + "loss": 0.6928, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02947317063808441, + "rewards/margins": 0.025819525122642517, + "rewards/rejected": 0.003653643187135458, + "step": 13 + }, + { + "epoch": 0.16592592592592592, + "grad_norm": 32.991997579379785, + "learning_rate": 1.0294117647058822e-07, + "logits/chosen": -1.103955864906311, + "logits/rejected": -0.9304717183113098, + "logps/chosen": -30.54917335510254, + "logps/rejected": -35.995635986328125, + "loss": 0.6951, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.029121514409780502, + "rewards/margins": -0.03852158039808273, + "rewards/rejected": 0.00940006971359253, + "step": 14 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 28.604312545703518, + "learning_rate": 1.1029411764705881e-07, + "logits/chosen": -0.9514233469963074, + "logits/rejected": -1.0262576341629028, + "logps/chosen": -24.50589942932129, + "logps/rejected": -35.90400314331055, + "loss": 0.6846, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0038123312406241894, + "rewards/margins": -0.002235441468656063, + "rewards/rejected": -0.001576889306306839, + "step": 15 + }, + { + "epoch": 0.18962962962962962, + "grad_norm": 29.400959002049447, + "learning_rate": 1.176470588235294e-07, + "logits/chosen": -1.0083125829696655, + "logits/rejected": -1.054337739944458, + "logps/chosen": -25.947620391845703, + "logps/rejected": -34.74080276489258, + "loss": 0.6858, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.018293071538209915, + "rewards/margins": 0.003816458163782954, + "rewards/rejected": 0.014476614072918892, + "step": 16 + }, + { + "epoch": 0.20148148148148148, + "grad_norm": 28.668497058092704, + "learning_rate": 1.25e-07, + "logits/chosen": -1.2847508192062378, + "logits/rejected": -1.1500571966171265, + "logps/chosen": -24.377540588378906, + "logps/rejected": -24.707080841064453, + "loss": 0.6955, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.020369501784443855, + "rewards/margins": -0.04808269441127777, + "rewards/rejected": 0.027713194489479065, + "step": 17 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 28.83942987795299, + "learning_rate": 1.3235294117647057e-07, + "logits/chosen": -0.7682641744613647, + "logits/rejected": -0.7946673035621643, + "logps/chosen": -30.430973052978516, + "logps/rejected": -38.10691833496094, + "loss": 0.6911, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008304210379719734, + "rewards/margins": 0.025393059477210045, + "rewards/rejected": -0.03369727358222008, + "step": 18 + }, + { + "epoch": 0.22518518518518518, + "grad_norm": 30.269138451288548, + "learning_rate": 1.3970588235294117e-07, + "logits/chosen": -0.9091902375221252, + "logits/rejected": -0.8255650997161865, + "logps/chosen": -23.93052864074707, + "logps/rejected": -31.06536865234375, + "loss": 0.6875, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.02613237127661705, + "rewards/margins": 0.029419327154755592, + "rewards/rejected": -0.0032869577407836914, + "step": 19 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 28.790853669800715, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": -0.8767175078392029, + "logits/rejected": -0.7814630270004272, + "logps/chosen": -24.231098175048828, + "logps/rejected": -31.490211486816406, + "loss": 0.6895, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.003405546071007848, + "rewards/margins": -0.02764531597495079, + "rewards/rejected": 0.03105086088180542, + "step": 20 + }, + { + "epoch": 0.24888888888888888, + "grad_norm": 28.60949101677147, + "learning_rate": 1.5441176470588236e-07, + "logits/chosen": -0.7832755446434021, + "logits/rejected": -0.6420150995254517, + "logps/chosen": -30.503726959228516, + "logps/rejected": -37.48731231689453, + "loss": 0.6849, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.015105541795492172, + "rewards/margins": -0.011576179414987564, + "rewards/rejected": 0.026681719347834587, + "step": 21 + }, + { + "epoch": 0.2607407407407407, + "grad_norm": 30.79843186614584, + "learning_rate": 1.6176470588235293e-07, + "logits/chosen": -0.838936984539032, + "logits/rejected": -0.890478789806366, + "logps/chosen": -23.474294662475586, + "logps/rejected": -27.067607879638672, + "loss": 0.69, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.00017893756739795208, + "rewards/margins": 0.007045186124742031, + "rewards/rejected": -0.00686624925583601, + "step": 22 + }, + { + "epoch": 0.2725925925925926, + "grad_norm": 27.887885485403874, + "learning_rate": 1.6911764705882354e-07, + "logits/chosen": -1.018220067024231, + "logits/rejected": -0.9071334600448608, + "logps/chosen": -31.536334991455078, + "logps/rejected": -39.14314270019531, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.034499503672122955, + "rewards/margins": -0.007689610123634338, + "rewards/rejected": -0.026809897273778915, + "step": 23 + }, + { + "epoch": 0.28444444444444444, + "grad_norm": 28.284778127262, + "learning_rate": 1.764705882352941e-07, + "logits/chosen": -0.9286752939224243, + "logits/rejected": -0.8299227952957153, + "logps/chosen": -31.92042350769043, + "logps/rejected": -35.5419921875, + "loss": 0.6982, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.020512927323579788, + "rewards/margins": -0.028788220137357712, + "rewards/rejected": 0.008275296539068222, + "step": 24 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 30.25505597249182, + "learning_rate": 1.8382352941176472e-07, + "logits/chosen": -1.0533839464187622, + "logits/rejected": -1.1014021635055542, + "logps/chosen": -19.149333953857422, + "logps/rejected": -31.527441024780273, + "loss": 0.6788, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0033930009230971336, + "rewards/margins": 0.015363391488790512, + "rewards/rejected": -0.01875639334321022, + "step": 25 + }, + { + "epoch": 0.30814814814814817, + "grad_norm": 27.757686456876737, + "learning_rate": 1.9117647058823527e-07, + "logits/chosen": -0.9570952653884888, + "logits/rejected": -1.2151724100112915, + "logps/chosen": -24.398239135742188, + "logps/rejected": -33.41880416870117, + "loss": 0.6772, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0002640171442180872, + "rewards/margins": 0.0642121285200119, + "rewards/rejected": -0.06394810229539871, + "step": 26 + }, + { + "epoch": 0.32, + "grad_norm": 26.596692102946548, + "learning_rate": 1.9852941176470587e-07, + "logits/chosen": -1.2522035837173462, + "logits/rejected": -1.0859918594360352, + "logps/chosen": -29.689172744750977, + "logps/rejected": -29.878040313720703, + "loss": 0.6767, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013057002797722816, + "rewards/margins": 0.011676701717078686, + "rewards/rejected": -0.024733707308769226, + "step": 27 + }, + { + "epoch": 0.33185185185185184, + "grad_norm": 30.447771442461562, + "learning_rate": 2.0588235294117645e-07, + "logits/chosen": -1.0458675622940063, + "logits/rejected": -1.011217474937439, + "logps/chosen": -33.10105895996094, + "logps/rejected": -36.27530288696289, + "loss": 0.677, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.022874630987644196, + "rewards/margins": 0.024510394781827927, + "rewards/rejected": -0.04738502576947212, + "step": 28 + }, + { + "epoch": 0.3437037037037037, + "grad_norm": 28.662596504192038, + "learning_rate": 2.1323529411764705e-07, + "logits/chosen": -1.0796473026275635, + "logits/rejected": -0.9194013476371765, + "logps/chosen": -26.835773468017578, + "logps/rejected": -32.48525619506836, + "loss": 0.6772, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.020335160195827484, + "rewards/margins": -0.010507296770811081, + "rewards/rejected": -0.009827865287661552, + "step": 29 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 26.531589778840697, + "learning_rate": 2.2058823529411763e-07, + "logits/chosen": -1.2273566722869873, + "logits/rejected": -1.1912459135055542, + "logps/chosen": -25.345335006713867, + "logps/rejected": -30.677719116210938, + "loss": 0.6825, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.01930350251495838, + "rewards/margins": 0.0014111557975411415, + "rewards/rejected": -0.0207146555185318, + "step": 30 + }, + { + "epoch": 0.3674074074074074, + "grad_norm": 28.447408639251734, + "learning_rate": 2.2794117647058823e-07, + "logits/chosen": -0.9077786207199097, + "logits/rejected": -0.8582189083099365, + "logps/chosen": -21.32311248779297, + "logps/rejected": -30.76042366027832, + "loss": 0.6718, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.023572970181703568, + "rewards/margins": 0.055517010390758514, + "rewards/rejected": -0.07908996939659119, + "step": 31 + }, + { + "epoch": 0.37925925925925924, + "grad_norm": 28.62654390398505, + "learning_rate": 2.352941176470588e-07, + "logits/chosen": -0.7928012609481812, + "logits/rejected": -0.9217997193336487, + "logps/chosen": -25.130535125732422, + "logps/rejected": -40.558807373046875, + "loss": 0.6755, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.011272218078374863, + "rewards/margins": 0.00860257912427187, + "rewards/rejected": -0.01987479254603386, + "step": 32 + }, + { + "epoch": 0.39111111111111113, + "grad_norm": 28.128072104623097, + "learning_rate": 2.426470588235294e-07, + "logits/chosen": -0.6893962025642395, + "logits/rejected": -0.7390108704566956, + "logps/chosen": -26.575319290161133, + "logps/rejected": -35.0635986328125, + "loss": 0.6622, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.027767837047576904, + "rewards/margins": 0.07502231001853943, + "rewards/rejected": -0.04725448414683342, + "step": 33 + }, + { + "epoch": 0.40296296296296297, + "grad_norm": 30.211985121803504, + "learning_rate": 2.5e-07, + "logits/chosen": -1.1355931758880615, + "logits/rejected": -0.9662358164787292, + "logps/chosen": -23.865161895751953, + "logps/rejected": -31.000974655151367, + "loss": 0.6793, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.012976722791790962, + "rewards/margins": 0.09340134263038635, + "rewards/rejected": -0.08042460680007935, + "step": 34 + }, + { + "epoch": 0.4148148148148148, + "grad_norm": 28.052145828737455, + "learning_rate": 2.5735294117647057e-07, + "logits/chosen": -1.0929094552993774, + "logits/rejected": -1.000644564628601, + "logps/chosen": -29.99643325805664, + "logps/rejected": -35.5587272644043, + "loss": 0.667, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.009614755399525166, + "rewards/margins": 0.09663048386573792, + "rewards/rejected": -0.10624523460865021, + "step": 35 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 28.13933829077562, + "learning_rate": 2.6470588235294114e-07, + "logits/chosen": -0.9170397520065308, + "logits/rejected": -0.8025334477424622, + "logps/chosen": -35.03669357299805, + "logps/rejected": -38.15205001831055, + "loss": 0.6579, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.028206665068864822, + "rewards/margins": 0.06581413000822067, + "rewards/rejected": -0.0940207988023758, + "step": 36 + }, + { + "epoch": 0.43851851851851853, + "grad_norm": 26.573245255968228, + "learning_rate": 2.720588235294117e-07, + "logits/chosen": -0.8840937614440918, + "logits/rejected": -1.154779076576233, + "logps/chosen": -21.523990631103516, + "logps/rejected": -37.12339401245117, + "loss": 0.6507, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03301115334033966, + "rewards/margins": 0.07873430848121643, + "rewards/rejected": -0.11174546182155609, + "step": 37 + }, + { + "epoch": 0.45037037037037037, + "grad_norm": 27.415699326968735, + "learning_rate": 2.7941176470588235e-07, + "logits/chosen": -0.851655125617981, + "logits/rejected": -0.848722517490387, + "logps/chosen": -27.6691837310791, + "logps/rejected": -35.29108810424805, + "loss": 0.6516, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.030980991199612617, + "rewards/margins": 0.12533767521381378, + "rewards/rejected": -0.15631866455078125, + "step": 38 + }, + { + "epoch": 0.4622222222222222, + "grad_norm": 25.808526954893544, + "learning_rate": 2.8676470588235293e-07, + "logits/chosen": -1.019010066986084, + "logits/rejected": -1.0574215650558472, + "logps/chosen": -28.815650939941406, + "logps/rejected": -38.8148193359375, + "loss": 0.6379, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.03295973688364029, + "rewards/margins": 0.17373906075954437, + "rewards/rejected": -0.20669879019260406, + "step": 39 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 26.057899144470028, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -1.0933586359024048, + "logits/rejected": -1.029239535331726, + "logps/chosen": -29.62429428100586, + "logps/rejected": -36.406829833984375, + "loss": 0.6446, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03471359610557556, + "rewards/margins": 0.10683241486549377, + "rewards/rejected": -0.14154601097106934, + "step": 40 + }, + { + "epoch": 0.48592592592592593, + "grad_norm": 26.768031824352317, + "learning_rate": 3.014705882352941e-07, + "logits/chosen": -1.01728093624115, + "logits/rejected": -0.958576500415802, + "logps/chosen": -27.431272506713867, + "logps/rejected": -35.526336669921875, + "loss": 0.6315, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.002386711537837982, + "rewards/margins": 0.16936060786247253, + "rewards/rejected": -0.17174731194972992, + "step": 41 + }, + { + "epoch": 0.49777777777777776, + "grad_norm": 26.609263347631167, + "learning_rate": 3.088235294117647e-07, + "logits/chosen": -0.8449506759643555, + "logits/rejected": -0.8115115761756897, + "logps/chosen": -30.90084457397461, + "logps/rejected": -41.651222229003906, + "loss": 0.6449, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03619501739740372, + "rewards/margins": 0.07611033320426941, + "rewards/rejected": -0.11230535060167313, + "step": 42 + }, + { + "epoch": 0.5096296296296297, + "grad_norm": 26.602574402864892, + "learning_rate": 3.161764705882353e-07, + "logits/chosen": -0.9452661871910095, + "logits/rejected": -1.1044366359710693, + "logps/chosen": -34.610469818115234, + "logps/rejected": -46.29701232910156, + "loss": 0.6076, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03930632770061493, + "rewards/margins": 0.31100231409072876, + "rewards/rejected": -0.3503086566925049, + "step": 43 + }, + { + "epoch": 0.5214814814814814, + "grad_norm": 26.4843286159413, + "learning_rate": 3.2352941176470586e-07, + "logits/chosen": -1.0949482917785645, + "logits/rejected": -1.2503392696380615, + "logps/chosen": -26.57402992248535, + "logps/rejected": -37.58720016479492, + "loss": 0.6344, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06539441645145416, + "rewards/margins": 0.10176797211170197, + "rewards/rejected": -0.16716240346431732, + "step": 44 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 25.709202508798676, + "learning_rate": 3.3088235294117644e-07, + "logits/chosen": -0.9102402329444885, + "logits/rejected": -0.6524286270141602, + "logps/chosen": -33.4108772277832, + "logps/rejected": -34.84819030761719, + "loss": 0.5999, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.006676537916064262, + "rewards/margins": 0.23027826845645905, + "rewards/rejected": -0.23695479333400726, + "step": 45 + }, + { + "epoch": 0.5451851851851852, + "grad_norm": 24.273462665693376, + "learning_rate": 3.3823529411764707e-07, + "logits/chosen": -1.174068808555603, + "logits/rejected": -1.0645579099655151, + "logps/chosen": -27.4698429107666, + "logps/rejected": -40.68760681152344, + "loss": 0.6215, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0283779539167881, + "rewards/margins": 0.4031476676464081, + "rewards/rejected": -0.37476974725723267, + "step": 46 + }, + { + "epoch": 0.557037037037037, + "grad_norm": 26.030778079202978, + "learning_rate": 3.4558823529411765e-07, + "logits/chosen": -1.0018541812896729, + "logits/rejected": -0.9415028095245361, + "logps/chosen": -31.507640838623047, + "logps/rejected": -30.727527618408203, + "loss": 0.6199, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.034904323518276215, + "rewards/margins": 0.14491260051727295, + "rewards/rejected": -0.17981691658496857, + "step": 47 + }, + { + "epoch": 0.5688888888888889, + "grad_norm": 25.162457677575347, + "learning_rate": 3.529411764705882e-07, + "logits/chosen": -1.079911708831787, + "logits/rejected": -0.8244823217391968, + "logps/chosen": -26.54584503173828, + "logps/rejected": -32.15207290649414, + "loss": 0.6022, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.045158352702856064, + "rewards/margins": 0.24844199419021606, + "rewards/rejected": -0.29360032081604004, + "step": 48 + }, + { + "epoch": 0.5807407407407408, + "grad_norm": 26.364792800100354, + "learning_rate": 3.602941176470588e-07, + "logits/chosen": -1.0118129253387451, + "logits/rejected": -1.0024306774139404, + "logps/chosen": -30.346418380737305, + "logps/rejected": -36.94379425048828, + "loss": 0.5945, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.004303845576941967, + "rewards/margins": 0.29810506105422974, + "rewards/rejected": -0.302408903837204, + "step": 49 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 25.49611713846103, + "learning_rate": 3.6764705882352943e-07, + "logits/chosen": -1.1100342273712158, + "logits/rejected": -1.2106306552886963, + "logps/chosen": -27.509366989135742, + "logps/rejected": -38.91688919067383, + "loss": 0.6128, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02782953716814518, + "rewards/margins": 0.22366918623447418, + "rewards/rejected": -0.2514986991882324, + "step": 50 + }, + { + "epoch": 0.6044444444444445, + "grad_norm": 25.714654341441545, + "learning_rate": 3.75e-07, + "logits/chosen": -1.0252456665039062, + "logits/rejected": -0.9075378775596619, + "logps/chosen": -22.085193634033203, + "logps/rejected": -30.892152786254883, + "loss": 0.5875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.005728382617235184, + "rewards/margins": 0.43889015913009644, + "rewards/rejected": -0.4446185231208801, + "step": 51 + }, + { + "epoch": 0.6162962962962963, + "grad_norm": 23.576042719128466, + "learning_rate": 3.8235294117647053e-07, + "logits/chosen": -1.3855410814285278, + "logits/rejected": -1.2717393636703491, + "logps/chosen": -26.286727905273438, + "logps/rejected": -31.481882095336914, + "loss": 0.5709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08100883662700653, + "rewards/margins": 0.31634223461151123, + "rewards/rejected": -0.3973510265350342, + "step": 52 + }, + { + "epoch": 0.6281481481481481, + "grad_norm": 24.575982948079687, + "learning_rate": 3.8970588235294116e-07, + "logits/chosen": -0.6433981657028198, + "logits/rejected": -0.9913661479949951, + "logps/chosen": -27.564329147338867, + "logps/rejected": -41.453670501708984, + "loss": 0.5978, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.019089514389634132, + "rewards/margins": 0.37892332673072815, + "rewards/rejected": -0.3980128765106201, + "step": 53 + }, + { + "epoch": 0.64, + "grad_norm": 23.450435346962628, + "learning_rate": 3.9705882352941174e-07, + "logits/chosen": -0.7840040326118469, + "logits/rejected": -0.5774534344673157, + "logps/chosen": -34.3664665222168, + "logps/rejected": -39.58485412597656, + "loss": 0.5313, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06814204156398773, + "rewards/margins": 0.5044265389442444, + "rewards/rejected": -0.5725685954093933, + "step": 54 + }, + { + "epoch": 0.6518518518518519, + "grad_norm": 24.822846411492073, + "learning_rate": 4.044117647058823e-07, + "logits/chosen": -1.0658819675445557, + "logits/rejected": -1.2129803895950317, + "logps/chosen": -29.57436752319336, + "logps/rejected": -47.35321807861328, + "loss": 0.5771, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.10051199793815613, + "rewards/margins": 0.5884958505630493, + "rewards/rejected": -0.6890078186988831, + "step": 55 + }, + { + "epoch": 0.6637037037037037, + "grad_norm": 25.00801264031218, + "learning_rate": 4.117647058823529e-07, + "logits/chosen": -1.0222803354263306, + "logits/rejected": -1.0347211360931396, + "logps/chosen": -30.616716384887695, + "logps/rejected": -36.40394592285156, + "loss": 0.5714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.018573857843875885, + "rewards/margins": 0.5196735262870789, + "rewards/rejected": -0.5382473468780518, + "step": 56 + }, + { + "epoch": 0.6755555555555556, + "grad_norm": 25.502073225889802, + "learning_rate": 4.191176470588235e-07, + "logits/chosen": -0.9514663219451904, + "logits/rejected": -0.796308696269989, + "logps/chosen": -30.12722396850586, + "logps/rejected": -33.41557312011719, + "loss": 0.5685, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.14312461018562317, + "rewards/margins": 0.16847191751003265, + "rewards/rejected": -0.31159651279449463, + "step": 57 + }, + { + "epoch": 0.6874074074074074, + "grad_norm": 22.66345454008422, + "learning_rate": 4.264705882352941e-07, + "logits/chosen": -0.7501423358917236, + "logits/rejected": -0.7819620370864868, + "logps/chosen": -31.42731475830078, + "logps/rejected": -47.55392074584961, + "loss": 0.5282, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06128763407468796, + "rewards/margins": 0.8270988464355469, + "rewards/rejected": -0.888386607170105, + "step": 58 + }, + { + "epoch": 0.6992592592592592, + "grad_norm": 22.074290348194058, + "learning_rate": 4.338235294117647e-07, + "logits/chosen": -0.9507533311843872, + "logits/rejected": -0.9616032838821411, + "logps/chosen": -28.218658447265625, + "logps/rejected": -43.016136169433594, + "loss": 0.5125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.12597304582595825, + "rewards/margins": 0.6684972643852234, + "rewards/rejected": -0.7944703102111816, + "step": 59 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 23.08318692263043, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": -1.2129336595535278, + "logits/rejected": -1.283077359199524, + "logps/chosen": -25.175594329833984, + "logps/rejected": -30.427568435668945, + "loss": 0.5012, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10316593945026398, + "rewards/margins": 0.6034290194511414, + "rewards/rejected": -0.7065950036048889, + "step": 60 + }, + { + "epoch": 0.7229629629629629, + "grad_norm": 22.04689507649114, + "learning_rate": 4.485294117647059e-07, + "logits/chosen": -1.275814175605774, + "logits/rejected": -1.2445783615112305, + "logps/chosen": -24.249958038330078, + "logps/rejected": -31.738033294677734, + "loss": 0.508, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.13786821067333221, + "rewards/margins": 0.44874635338783264, + "rewards/rejected": -0.5866145491600037, + "step": 61 + }, + { + "epoch": 0.7348148148148148, + "grad_norm": 21.718511918361607, + "learning_rate": 4.5588235294117646e-07, + "logits/chosen": -0.931826651096344, + "logits/rejected": -1.0001661777496338, + "logps/chosen": -25.03376007080078, + "logps/rejected": -40.26024627685547, + "loss": 0.5085, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06334442645311356, + "rewards/margins": 0.8100509643554688, + "rewards/rejected": -0.8733953237533569, + "step": 62 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 24.701982218388043, + "learning_rate": 4.6323529411764704e-07, + "logits/chosen": -1.1187516450881958, + "logits/rejected": -0.9583245515823364, + "logps/chosen": -29.71410369873047, + "logps/rejected": -37.30023956298828, + "loss": 0.5318, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0184539332985878, + "rewards/margins": 0.7952549457550049, + "rewards/rejected": -0.8137089014053345, + "step": 63 + }, + { + "epoch": 0.7585185185185185, + "grad_norm": 23.712296041316314, + "learning_rate": 4.705882352941176e-07, + "logits/chosen": -0.8638966083526611, + "logits/rejected": -0.7655866146087646, + "logps/chosen": -23.166181564331055, + "logps/rejected": -33.395484924316406, + "loss": 0.523, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.020204465836286545, + "rewards/margins": 0.6352415680885315, + "rewards/rejected": -0.6554459929466248, + "step": 64 + }, + { + "epoch": 0.7703703703703704, + "grad_norm": 24.60132344003472, + "learning_rate": 4.779411764705882e-07, + "logits/chosen": -0.726274311542511, + "logits/rejected": -0.8394519686698914, + "logps/chosen": -32.45426559448242, + "logps/rejected": -38.104736328125, + "loss": 0.5137, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1299125850200653, + "rewards/margins": 0.2771841883659363, + "rewards/rejected": -0.4070967733860016, + "step": 65 + }, + { + "epoch": 0.7822222222222223, + "grad_norm": 22.34792389998632, + "learning_rate": 4.852941176470588e-07, + "logits/chosen": -0.9895652532577515, + "logits/rejected": -0.8180733919143677, + "logps/chosen": -29.497535705566406, + "logps/rejected": -34.33931350708008, + "loss": 0.5037, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.03963299095630646, + "rewards/margins": 1.122789978981018, + "rewards/rejected": -1.162423014640808, + "step": 66 + }, + { + "epoch": 0.794074074074074, + "grad_norm": 22.03941990218751, + "learning_rate": 4.926470588235295e-07, + "logits/chosen": -1.2204893827438354, + "logits/rejected": -0.9625455737113953, + "logps/chosen": -24.71898651123047, + "logps/rejected": -29.869098663330078, + "loss": 0.4825, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01972150057554245, + "rewards/margins": 1.2066266536712646, + "rewards/rejected": -1.2263481616973877, + "step": 67 + }, + { + "epoch": 0.8059259259259259, + "grad_norm": 24.106320190723114, + "learning_rate": 5e-07, + "logits/chosen": -1.2090452909469604, + "logits/rejected": -0.8567001819610596, + "logps/chosen": -40.8047981262207, + "logps/rejected": -41.501075744628906, + "loss": 0.5534, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23644912242889404, + "rewards/margins": 0.4782140851020813, + "rewards/rejected": -0.7146631479263306, + "step": 68 + }, + { + "epoch": 0.8177777777777778, + "grad_norm": 21.771823138835973, + "learning_rate": 4.999966183013662e-07, + "logits/chosen": -0.8472105860710144, + "logits/rejected": -0.74596107006073, + "logps/chosen": -29.863178253173828, + "logps/rejected": -41.79008483886719, + "loss": 0.4941, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.031317003071308136, + "rewards/margins": 0.7321377396583557, + "rewards/rejected": -0.763454794883728, + "step": 69 + }, + { + "epoch": 0.8296296296296296, + "grad_norm": 24.376274282207536, + "learning_rate": 4.999864732969518e-07, + "logits/chosen": -1.0997436046600342, + "logits/rejected": -0.9688754081726074, + "logps/chosen": -25.0852108001709, + "logps/rejected": -30.06993865966797, + "loss": 0.5104, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1426914781332016, + "rewards/margins": 0.9001794457435608, + "rewards/rejected": -1.0428708791732788, + "step": 70 + }, + { + "epoch": 0.8414814814814815, + "grad_norm": 22.70502304759575, + "learning_rate": 4.999695652612155e-07, + "logits/chosen": -0.9267873167991638, + "logits/rejected": -0.7994822263717651, + "logps/chosen": -24.70890998840332, + "logps/rejected": -32.99658203125, + "loss": 0.5115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08305026590824127, + "rewards/margins": 0.9199594855308533, + "rewards/rejected": -1.0030097961425781, + "step": 71 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 24.314232282779354, + "learning_rate": 4.999458946515807e-07, + "logits/chosen": -1.0226339101791382, + "logits/rejected": -0.8611736297607422, + "logps/chosen": -24.211002349853516, + "logps/rejected": -35.78715896606445, + "loss": 0.5429, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05914217233657837, + "rewards/margins": 0.8107442855834961, + "rewards/rejected": -0.8698864579200745, + "step": 72 + }, + { + "epoch": 0.8651851851851852, + "grad_norm": 23.657900907591774, + "learning_rate": 4.999154621084221e-07, + "logits/chosen": -0.9246867895126343, + "logits/rejected": -0.8785493969917297, + "logps/chosen": -30.946041107177734, + "logps/rejected": -33.3994255065918, + "loss": 0.5232, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11669294536113739, + "rewards/margins": 0.31690362095832825, + "rewards/rejected": -0.43359655141830444, + "step": 73 + }, + { + "epoch": 0.8770370370370371, + "grad_norm": 22.551868004831768, + "learning_rate": 4.998782684550491e-07, + "logits/chosen": -0.9841519594192505, + "logits/rejected": -1.094799518585205, + "logps/chosen": -28.580307006835938, + "logps/rejected": -43.50556182861328, + "loss": 0.499, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.09129568934440613, + "rewards/margins": 0.9107706546783447, + "rewards/rejected": -1.0020663738250732, + "step": 74 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 22.142782690350444, + "learning_rate": 4.998343146976837e-07, + "logits/chosen": -1.3637927770614624, + "logits/rejected": -1.2565383911132812, + "logps/chosen": -22.514596939086914, + "logps/rejected": -34.200462341308594, + "loss": 0.5102, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12911996245384216, + "rewards/margins": 0.8493518233299255, + "rewards/rejected": -0.9784718751907349, + "step": 75 + }, + { + "epoch": 0.9007407407407407, + "grad_norm": 23.23826317747192, + "learning_rate": 4.997836020254328e-07, + "logits/chosen": -0.8813817501068115, + "logits/rejected": -0.672915518283844, + "logps/chosen": -29.698516845703125, + "logps/rejected": -37.84008026123047, + "loss": 0.4984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13888058066368103, + "rewards/margins": 0.8507511615753174, + "rewards/rejected": -0.9896316528320312, + "step": 76 + }, + { + "epoch": 0.9125925925925926, + "grad_norm": 21.893171367899452, + "learning_rate": 4.99726131810256e-07, + "logits/chosen": -0.9941626191139221, + "logits/rejected": -0.8581103682518005, + "logps/chosen": -25.82787322998047, + "logps/rejected": -26.007766723632812, + "loss": 0.4861, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.016278021037578583, + "rewards/margins": 0.5648348331451416, + "rewards/rejected": -0.5811129212379456, + "step": 77 + }, + { + "epoch": 0.9244444444444444, + "grad_norm": 21.62182885137629, + "learning_rate": 4.996619056069291e-07, + "logits/chosen": -0.7729543447494507, + "logits/rejected": -0.4389303922653198, + "logps/chosen": -30.95619010925293, + "logps/rejected": -32.129966735839844, + "loss": 0.4933, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08845072984695435, + "rewards/margins": 0.7277066111564636, + "rewards/rejected": -0.8161574602127075, + "step": 78 + }, + { + "epoch": 0.9362962962962963, + "grad_norm": 23.213502848073986, + "learning_rate": 4.995909251530013e-07, + "logits/chosen": -0.94273841381073, + "logits/rejected": -0.8963134288787842, + "logps/chosen": -28.92119598388672, + "logps/rejected": -32.17072296142578, + "loss": 0.511, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11542296409606934, + "rewards/margins": 0.23704171180725098, + "rewards/rejected": -0.3524646759033203, + "step": 79 + }, + { + "epoch": 0.9481481481481482, + "grad_norm": 22.2887907095195, + "learning_rate": 4.995131923687487e-07, + "logits/chosen": -0.8467817902565002, + "logits/rejected": -0.9369296431541443, + "logps/chosen": -34.357215881347656, + "logps/rejected": -45.66539001464844, + "loss": 0.4817, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1713842898607254, + "rewards/margins": 0.5076985955238342, + "rewards/rejected": -0.6790828108787537, + "step": 80 + }, + { + "epoch": 0.96, + "grad_norm": 22.96182494835123, + "learning_rate": 4.994287093571221e-07, + "logits/chosen": -1.0453802347183228, + "logits/rejected": -0.796444296836853, + "logps/chosen": -32.47832107543945, + "logps/rejected": -33.27830123901367, + "loss": 0.4837, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07759351283311844, + "rewards/margins": 0.7651993632316589, + "rewards/rejected": -0.8427927494049072, + "step": 81 + }, + { + "epoch": 0.9718518518518519, + "grad_norm": 21.57718887204458, + "learning_rate": 4.993374784036901e-07, + "logits/chosen": -0.8520927429199219, + "logits/rejected": -0.7505441904067993, + "logps/chosen": -31.684307098388672, + "logps/rejected": -36.8294677734375, + "loss": 0.438, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28118762373924255, + "rewards/margins": 0.9425290822982788, + "rewards/rejected": -1.2237166166305542, + "step": 82 + }, + { + "epoch": 0.9837037037037037, + "grad_norm": 19.78935688611565, + "learning_rate": 4.992395019765775e-07, + "logits/chosen": -1.0383273363113403, + "logits/rejected": -0.9351356625556946, + "logps/chosen": -23.624879837036133, + "logps/rejected": -39.410057067871094, + "loss": 0.4564, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.16446058452129364, + "rewards/margins": 1.0133116245269775, + "rewards/rejected": -1.1777722835540771, + "step": 83 + }, + { + "epoch": 0.9955555555555555, + "grad_norm": 20.85888712023128, + "learning_rate": 4.991347827263982e-07, + "logits/chosen": -0.8024070262908936, + "logits/rejected": -0.9107375741004944, + "logps/chosen": -23.73459815979004, + "logps/rejected": -36.912132263183594, + "loss": 0.4675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15683181583881378, + "rewards/margins": 0.8828233480453491, + "rewards/rejected": -1.0396552085876465, + "step": 84 + }, + { + "epoch": 1.0074074074074073, + "grad_norm": 21.201555546786057, + "learning_rate": 4.990233234861839e-07, + "logits/chosen": -1.1527302265167236, + "logits/rejected": -0.7363994121551514, + "logps/chosen": -33.276859283447266, + "logps/rejected": -40.38740539550781, + "loss": 0.4559, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.10876104235649109, + "rewards/margins": 1.702566385269165, + "rewards/rejected": -1.8113272190093994, + "step": 85 + }, + { + "epoch": 1.0192592592592593, + "grad_norm": 18.56352446519458, + "learning_rate": 4.989051272713069e-07, + "logits/chosen": -1.0480620861053467, + "logits/rejected": -0.8963486552238464, + "logps/chosen": -24.252384185791016, + "logps/rejected": -33.39715576171875, + "loss": 0.4078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0004123076796531677, + "rewards/margins": 1.1432445049285889, + "rewards/rejected": -1.1436569690704346, + "step": 86 + }, + { + "epoch": 1.031111111111111, + "grad_norm": 19.689301826887924, + "learning_rate": 4.987801972793993e-07, + "logits/chosen": -1.2163270711898804, + "logits/rejected": -1.0421555042266846, + "logps/chosen": -29.977680206298828, + "logps/rejected": -45.905757904052734, + "loss": 0.4003, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1577407568693161, + "rewards/margins": 1.4759492874145508, + "rewards/rejected": -1.6336898803710938, + "step": 87 + }, + { + "epoch": 1.0429629629629629, + "grad_norm": 18.31837515764729, + "learning_rate": 4.986485368902656e-07, + "logits/chosen": -0.9305309653282166, + "logits/rejected": -0.7790694832801819, + "logps/chosen": -28.83968734741211, + "logps/rejected": -45.16325378417969, + "loss": 0.3903, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06216466426849365, + "rewards/margins": 1.9154176712036133, + "rewards/rejected": -1.977582335472107, + "step": 88 + }, + { + "epoch": 1.0548148148148149, + "grad_norm": 20.64062540717056, + "learning_rate": 4.985101496657918e-07, + "logits/chosen": -0.9884095191955566, + "logits/rejected": -0.9845250844955444, + "logps/chosen": -26.488367080688477, + "logps/rejected": -39.72848892211914, + "loss": 0.4059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2092103362083435, + "rewards/margins": 1.6819262504577637, + "rewards/rejected": -1.8911365270614624, + "step": 89 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 20.58350959300496, + "learning_rate": 4.983650393498489e-07, + "logits/chosen": -0.9459998607635498, + "logits/rejected": -1.0477434396743774, + "logps/chosen": -28.036460876464844, + "logps/rejected": -47.1868782043457, + "loss": 0.4087, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07500169426202774, + "rewards/margins": 1.858386516571045, + "rewards/rejected": -1.933388113975525, + "step": 90 + }, + { + "epoch": 1.0785185185185184, + "grad_norm": 20.268639694693597, + "learning_rate": 4.982132098681923e-07, + "logits/chosen": -0.9951722621917725, + "logits/rejected": -1.0074284076690674, + "logps/chosen": -27.722091674804688, + "logps/rejected": -34.781681060791016, + "loss": 0.4277, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0997595265507698, + "rewards/margins": 0.8821978569030762, + "rewards/rejected": -0.9819574356079102, + "step": 91 + }, + { + "epoch": 1.0903703703703704, + "grad_norm": 18.542515385326787, + "learning_rate": 4.980546653283537e-07, + "logits/chosen": -0.8079184889793396, + "logits/rejected": -0.7196828126907349, + "logps/chosen": -26.553253173828125, + "logps/rejected": -42.56141662597656, + "loss": 0.3864, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.15712840855121613, + "rewards/margins": 1.236411690711975, + "rewards/rejected": -1.3935401439666748, + "step": 92 + }, + { + "epoch": 1.1022222222222222, + "grad_norm": 19.264634270110978, + "learning_rate": 4.978894100195324e-07, + "logits/chosen": -0.8741264343261719, + "logits/rejected": -0.7673499584197998, + "logps/chosen": -30.112987518310547, + "logps/rejected": -40.45325469970703, + "loss": 0.3594, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09071722626686096, + "rewards/margins": 1.353488802909851, + "rewards/rejected": -1.4442059993743896, + "step": 93 + }, + { + "epoch": 1.114074074074074, + "grad_norm": 19.800071465779904, + "learning_rate": 4.977174484124775e-07, + "logits/chosen": -0.9135103225708008, + "logits/rejected": -0.9411644339561462, + "logps/chosen": -31.76198959350586, + "logps/rejected": -36.60845947265625, + "loss": 0.3919, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11135930567979813, + "rewards/margins": 1.0709939002990723, + "rewards/rejected": -1.1823533773422241, + "step": 94 + }, + { + "epoch": 1.125925925925926, + "grad_norm": 17.895705636922976, + "learning_rate": 4.975387851593676e-07, + "logits/chosen": -1.1691641807556152, + "logits/rejected": -1.1903026103973389, + "logps/chosen": -27.760555267333984, + "logps/rejected": -39.08766555786133, + "loss": 0.3879, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.03780115395784378, + "rewards/margins": 0.8077431321144104, + "rewards/rejected": -0.769942045211792, + "step": 95 + }, + { + "epoch": 1.1377777777777778, + "grad_norm": 20.599609019101436, + "learning_rate": 4.97353425093685e-07, + "logits/chosen": -0.7190616726875305, + "logits/rejected": -0.7100368142127991, + "logps/chosen": -25.934661865234375, + "logps/rejected": -34.01348876953125, + "loss": 0.4321, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.10089172422885895, + "rewards/margins": 0.6498050689697266, + "rewards/rejected": -0.7506968379020691, + "step": 96 + }, + { + "epoch": 1.1496296296296296, + "grad_norm": 21.095071119686782, + "learning_rate": 4.971613732300848e-07, + "logits/chosen": -0.9941329956054688, + "logits/rejected": -0.9379687309265137, + "logps/chosen": -27.864826202392578, + "logps/rejected": -40.54108810424805, + "loss": 0.3575, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0676848441362381, + "rewards/margins": 1.5730375051498413, + "rewards/rejected": -1.640722393989563, + "step": 97 + }, + { + "epoch": 1.1614814814814816, + "grad_norm": 17.403612874481382, + "learning_rate": 4.96962634764259e-07, + "logits/chosen": -1.2904139757156372, + "logits/rejected": -1.0033934116363525, + "logps/chosen": -30.47223472595215, + "logps/rejected": -40.54103088378906, + "loss": 0.3457, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.018950080499053, + "rewards/margins": 1.4018616676330566, + "rewards/rejected": -1.3829115629196167, + "step": 98 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 17.611474070304414, + "learning_rate": 4.967572150727964e-07, + "logits/chosen": -1.046346664428711, + "logits/rejected": -0.7940016388893127, + "logps/chosen": -32.027244567871094, + "logps/rejected": -32.67860412597656, + "loss": 0.3469, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13017599284648895, + "rewards/margins": 0.9814402461051941, + "rewards/rejected": -1.1116162538528442, + "step": 99 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 18.404945181563157, + "learning_rate": 4.965451197130372e-07, + "logits/chosen": -1.1877573728561401, + "logits/rejected": -0.7191611528396606, + "logps/chosen": -28.89226531982422, + "logps/rejected": -38.4400634765625, + "loss": 0.3818, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14333172142505646, + "rewards/margins": 1.7097716331481934, + "rewards/rejected": -1.5664398670196533, + "step": 100 + }, + { + "epoch": 1.1970370370370371, + "grad_norm": 19.263820617064404, + "learning_rate": 4.963263544229219e-07, + "logits/chosen": -0.9433082938194275, + "logits/rejected": -0.8548566699028015, + "logps/chosen": -31.192596435546875, + "logps/rejected": -43.11643600463867, + "loss": 0.3602, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05815482884645462, + "rewards/margins": 2.0110859870910645, + "rewards/rejected": -2.0692405700683594, + "step": 101 + }, + { + "epoch": 1.208888888888889, + "grad_norm": 19.438323567719507, + "learning_rate": 4.961009251208367e-07, + "logits/chosen": -0.847985565662384, + "logits/rejected": -0.7285029292106628, + "logps/chosen": -27.112821578979492, + "logps/rejected": -36.1579475402832, + "loss": 0.3828, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.09437136352062225, + "rewards/margins": 1.5882363319396973, + "rewards/rejected": -1.4938650131225586, + "step": 102 + }, + { + "epoch": 1.2207407407407407, + "grad_norm": 20.89714425336938, + "learning_rate": 4.958688379054535e-07, + "logits/chosen": -0.7281448841094971, + "logits/rejected": -0.7819719314575195, + "logps/chosen": -30.29817771911621, + "logps/rejected": -40.67383575439453, + "loss": 0.41, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1488310694694519, + "rewards/margins": 1.0787588357925415, + "rewards/rejected": -1.2275900840759277, + "step": 103 + }, + { + "epoch": 1.2325925925925927, + "grad_norm": 18.51161455832253, + "learning_rate": 4.956300990555643e-07, + "logits/chosen": -1.1979477405548096, + "logits/rejected": -1.2166452407836914, + "logps/chosen": -24.593063354492188, + "logps/rejected": -36.1388053894043, + "loss": 0.3529, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.10680893063545227, + "rewards/margins": 1.3273341655731201, + "rewards/rejected": -1.4341431856155396, + "step": 104 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 18.348198834543016, + "learning_rate": 4.953847150299118e-07, + "logits/chosen": -1.2016291618347168, + "logits/rejected": -1.0083853006362915, + "logps/chosen": -26.028261184692383, + "logps/rejected": -31.78951644897461, + "loss": 0.3515, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.017123635858297348, + "rewards/margins": 1.5143089294433594, + "rewards/rejected": -1.497185230255127, + "step": 105 + }, + { + "epoch": 1.2562962962962962, + "grad_norm": 18.885448615868953, + "learning_rate": 4.951326924670147e-07, + "logits/chosen": -1.356000304222107, + "logits/rejected": -1.3348772525787354, + "logps/chosen": -31.302087783813477, + "logps/rejected": -37.98714828491211, + "loss": 0.3631, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.029658418148756027, + "rewards/margins": 1.4642913341522217, + "rewards/rejected": -1.4939496517181396, + "step": 106 + }, + { + "epoch": 1.268148148148148, + "grad_norm": 19.10542072499911, + "learning_rate": 4.948740381849879e-07, + "logits/chosen": -0.999729335308075, + "logits/rejected": -0.707848846912384, + "logps/chosen": -26.364988327026367, + "logps/rejected": -24.70641326904297, + "loss": 0.3761, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.035974204540252686, + "rewards/margins": 1.0942362546920776, + "rewards/rejected": -1.0582619905471802, + "step": 107 + }, + { + "epoch": 1.28, + "grad_norm": 20.66299414009225, + "learning_rate": 4.94608759181358e-07, + "logits/chosen": -1.053645133972168, + "logits/rejected": -0.8790953755378723, + "logps/chosen": -41.80066680908203, + "logps/rejected": -44.09773254394531, + "loss": 0.3575, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2904718518257141, + "rewards/margins": 1.806175708770752, + "rewards/rejected": -2.0966477394104004, + "step": 108 + }, + { + "epoch": 1.2918518518518518, + "grad_norm": 17.54860479077947, + "learning_rate": 4.943368626328741e-07, + "logits/chosen": -0.9635467529296875, + "logits/rejected": -0.7424119710922241, + "logps/chosen": -30.89739990234375, + "logps/rejected": -41.76382827758789, + "loss": 0.3285, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05257820338010788, + "rewards/margins": 2.4511446952819824, + "rewards/rejected": -2.398566484451294, + "step": 109 + }, + { + "epoch": 1.3037037037037038, + "grad_norm": 18.829241299282575, + "learning_rate": 4.940583558953137e-07, + "logits/chosen": -0.9188340306282043, + "logits/rejected": -1.0607569217681885, + "logps/chosen": -22.153308868408203, + "logps/rejected": -44.96165466308594, + "loss": 0.3263, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0348847322165966, + "rewards/margins": 2.250445604324341, + "rewards/rejected": -2.285330057144165, + "step": 110 + }, + { + "epoch": 1.3155555555555556, + "grad_norm": 18.93995233274985, + "learning_rate": 4.937732465032838e-07, + "logits/chosen": -0.7081223726272583, + "logits/rejected": -0.8990048766136169, + "logps/chosen": -26.534879684448242, + "logps/rejected": -44.0599365234375, + "loss": 0.3816, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27098196744918823, + "rewards/margins": 1.7641065120697021, + "rewards/rejected": -2.035088539123535, + "step": 111 + }, + { + "epoch": 1.3274074074074074, + "grad_norm": 20.79198255104679, + "learning_rate": 4.934815421700164e-07, + "logits/chosen": -1.372730016708374, + "logits/rejected": -1.3816195726394653, + "logps/chosen": -24.733922958374023, + "logps/rejected": -37.37799835205078, + "loss": 0.3793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2344232201576233, + "rewards/margins": 1.4653730392456055, + "rewards/rejected": -1.6997960805892944, + "step": 112 + }, + { + "epoch": 1.3392592592592591, + "grad_norm": 17.64638457869075, + "learning_rate": 4.93183250787161e-07, + "logits/chosen": -0.9247840642929077, + "logits/rejected": -1.0818623304367065, + "logps/chosen": -28.89555549621582, + "logps/rejected": -39.97201919555664, + "loss": 0.3466, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08911335468292236, + "rewards/margins": 0.9882626533508301, + "rewards/rejected": -1.0773760080337524, + "step": 113 + }, + { + "epoch": 1.3511111111111112, + "grad_norm": 19.44771569200443, + "learning_rate": 4.928783804245699e-07, + "logits/chosen": -0.9975395202636719, + "logits/rejected": -0.9134284257888794, + "logps/chosen": -31.9511775970459, + "logps/rejected": -44.12687683105469, + "loss": 0.3613, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.28740745782852173, + "rewards/margins": 1.532348394393921, + "rewards/rejected": -1.8197555541992188, + "step": 114 + }, + { + "epoch": 1.362962962962963, + "grad_norm": 18.822336901951047, + "learning_rate": 4.925669393300807e-07, + "logits/chosen": -0.9019233584403992, + "logits/rejected": -0.8816910982131958, + "logps/chosen": -21.266921997070312, + "logps/rejected": -38.15643310546875, + "loss": 0.3558, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09768679738044739, + "rewards/margins": 2.402815103530884, + "rewards/rejected": -2.3051280975341797, + "step": 115 + }, + { + "epoch": 1.374814814814815, + "grad_norm": 18.274649629189174, + "learning_rate": 4.922489359292927e-07, + "logits/chosen": -1.1270387172698975, + "logits/rejected": -1.1085700988769531, + "logps/chosen": -27.17832374572754, + "logps/rejected": -37.22381591796875, + "loss": 0.373, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.028047073632478714, + "rewards/margins": 1.7937216758728027, + "rewards/rejected": -1.8217687606811523, + "step": 116 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 16.688764844778802, + "learning_rate": 4.919243788253393e-07, + "logits/chosen": -1.0447142124176025, + "logits/rejected": -1.042763590812683, + "logps/chosen": -22.125022888183594, + "logps/rejected": -39.07143020629883, + "loss": 0.2971, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03231469914317131, + "rewards/margins": 2.012996196746826, + "rewards/rejected": -2.0453107357025146, + "step": 117 + }, + { + "epoch": 1.3985185185185185, + "grad_norm": 18.17869075719721, + "learning_rate": 4.915932767986551e-07, + "logits/chosen": -1.2121999263763428, + "logits/rejected": -1.1531389951705933, + "logps/chosen": -24.337661743164062, + "logps/rejected": -36.68294143676758, + "loss": 0.3297, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.051829662173986435, + "rewards/margins": 1.766758680343628, + "rewards/rejected": -1.8185884952545166, + "step": 118 + }, + { + "epoch": 1.4103703703703703, + "grad_norm": 19.449877146869795, + "learning_rate": 4.912556388067381e-07, + "logits/chosen": -0.8913883566856384, + "logits/rejected": -0.9280557036399841, + "logps/chosen": -24.18404769897461, + "logps/rejected": -36.907875061035156, + "loss": 0.3403, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.250654011964798, + "rewards/margins": 1.399787425994873, + "rewards/rejected": -1.6504414081573486, + "step": 119 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 20.342939608264548, + "learning_rate": 4.909114739839079e-07, + "logits/chosen": -1.1030035018920898, + "logits/rejected": -1.1965720653533936, + "logps/chosen": -23.354150772094727, + "logps/rejected": -34.74749755859375, + "loss": 0.3736, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05992694944143295, + "rewards/margins": 1.4329164028167725, + "rewards/rejected": -1.372989535331726, + "step": 120 + }, + { + "epoch": 1.434074074074074, + "grad_norm": 15.715115469684475, + "learning_rate": 4.90560791641058e-07, + "logits/chosen": -0.990738570690155, + "logits/rejected": -0.7945737838745117, + "logps/chosen": -27.087312698364258, + "logps/rejected": -49.8587646484375, + "loss": 0.2877, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19137343764305115, + "rewards/margins": 3.4916913509368896, + "rewards/rejected": -3.3003177642822266, + "step": 121 + }, + { + "epoch": 1.445925925925926, + "grad_norm": 17.221890947102303, + "learning_rate": 4.902036012654048e-07, + "logits/chosen": -1.0660780668258667, + "logits/rejected": -1.02411687374115, + "logps/chosen": -22.874664306640625, + "logps/rejected": -33.08205795288086, + "loss": 0.3072, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.06796490401029587, + "rewards/margins": 1.5805720090866089, + "rewards/rejected": -1.512607216835022, + "step": 122 + }, + { + "epoch": 1.4577777777777778, + "grad_norm": 16.005069196000893, + "learning_rate": 4.898399125202295e-07, + "logits/chosen": -1.1970123052597046, + "logits/rejected": -1.1410984992980957, + "logps/chosen": -28.59719467163086, + "logps/rejected": -42.150535583496094, + "loss": 0.2946, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3620311915874481, + "rewards/margins": 2.2089571952819824, + "rewards/rejected": -2.570988655090332, + "step": 123 + }, + { + "epoch": 1.4696296296296296, + "grad_norm": 17.717492260773117, + "learning_rate": 4.894697352446182e-07, + "logits/chosen": -0.9282441735267639, + "logits/rejected": -0.9922834038734436, + "logps/chosen": -23.3466739654541, + "logps/rejected": -40.82002258300781, + "loss": 0.3261, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03719540312886238, + "rewards/margins": 1.4782500267028809, + "rewards/rejected": -1.4410545825958252, + "step": 124 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 17.258307758545204, + "learning_rate": 4.890930794531947e-07, + "logits/chosen": -0.8717182278633118, + "logits/rejected": -0.5630895495414734, + "logps/chosen": -31.558549880981445, + "logps/rejected": -39.52922821044922, + "loss": 0.3095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1237109825015068, + "rewards/margins": 1.8089878559112549, + "rewards/rejected": -1.9326988458633423, + "step": 125 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 17.23695069915024, + "learning_rate": 4.887099553358501e-07, + "logits/chosen": -1.0558301210403442, + "logits/rejected": -1.1235636472702026, + "logps/chosen": -28.143768310546875, + "logps/rejected": -45.462162017822266, + "loss": 0.2959, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.057410500943660736, + "rewards/margins": 2.5853257179260254, + "rewards/rejected": -2.6427361965179443, + "step": 126 + }, + { + "epoch": 1.5051851851851852, + "grad_norm": 16.801429168404855, + "learning_rate": 4.883203732574667e-07, + "logits/chosen": -0.9709302186965942, + "logits/rejected": -1.0419211387634277, + "logps/chosen": -27.448535919189453, + "logps/rejected": -36.76887512207031, + "loss": 0.301, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.22691871225833893, + "rewards/margins": 1.6754447221755981, + "rewards/rejected": -1.9023634195327759, + "step": 127 + }, + { + "epoch": 1.5170370370370372, + "grad_norm": 17.682859896395517, + "learning_rate": 4.879243437576383e-07, + "logits/chosen": -1.0261518955230713, + "logits/rejected": -1.2332643270492554, + "logps/chosen": -26.014982223510742, + "logps/rejected": -44.39478302001953, + "loss": 0.3259, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.13659992814064026, + "rewards/margins": 1.5536094903945923, + "rewards/rejected": -1.6902093887329102, + "step": 128 + }, + { + "epoch": 1.528888888888889, + "grad_norm": 18.640841122291285, + "learning_rate": 4.875218775503837e-07, + "logits/chosen": -1.0689018964767456, + "logits/rejected": -0.980902373790741, + "logps/chosen": -24.02052116394043, + "logps/rejected": -46.01612854003906, + "loss": 0.315, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18941253423690796, + "rewards/margins": 2.954482078552246, + "rewards/rejected": -3.143894672393799, + "step": 129 + }, + { + "epoch": 1.5407407407407407, + "grad_norm": 17.210019096220396, + "learning_rate": 4.871129855238588e-07, + "logits/chosen": -0.8552242517471313, + "logits/rejected": -0.9273741245269775, + "logps/chosen": -25.354354858398438, + "logps/rejected": -45.11846160888672, + "loss": 0.2987, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1260746717453003, + "rewards/margins": 2.2974648475646973, + "rewards/rejected": -2.423539638519287, + "step": 130 + }, + { + "epoch": 1.5525925925925925, + "grad_norm": 19.619597361039357, + "learning_rate": 4.866976787400601e-07, + "logits/chosen": -0.9767991304397583, + "logits/rejected": -0.8945188522338867, + "logps/chosen": -21.340879440307617, + "logps/rejected": -34.753665924072266, + "loss": 0.3739, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.011123912408947945, + "rewards/margins": 1.912041187286377, + "rewards/rejected": -1.9231650829315186, + "step": 131 + }, + { + "epoch": 1.5644444444444443, + "grad_norm": 17.099351733067884, + "learning_rate": 4.862759684345269e-07, + "logits/chosen": -1.0631641149520874, + "logits/rejected": -1.111916422843933, + "logps/chosen": -22.13747787475586, + "logps/rejected": -41.837890625, + "loss": 0.3078, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3117489218711853, + "rewards/margins": 2.6900596618652344, + "rewards/rejected": -3.0018084049224854, + "step": 132 + }, + { + "epoch": 1.5762962962962963, + "grad_norm": 18.810360092932576, + "learning_rate": 4.858478660160363e-07, + "logits/chosen": -1.0127646923065186, + "logits/rejected": -1.1653410196304321, + "logps/chosen": -29.74410057067871, + "logps/rejected": -43.37088394165039, + "loss": 0.3068, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.22267529368400574, + "rewards/margins": 2.1531336307525635, + "rewards/rejected": -2.3758089542388916, + "step": 133 + }, + { + "epoch": 1.5881481481481483, + "grad_norm": 16.58595912768427, + "learning_rate": 4.854133830662955e-07, + "logits/chosen": -0.9233917593955994, + "logits/rejected": -1.0519709587097168, + "logps/chosen": -21.388568878173828, + "logps/rejected": -41.858436584472656, + "loss": 0.3253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1711762249469757, + "rewards/margins": 1.9797799587249756, + "rewards/rejected": -2.150956153869629, + "step": 134 + }, + { + "epoch": 1.6, + "grad_norm": 18.097856896364952, + "learning_rate": 4.849725313396274e-07, + "logits/chosen": -1.0138639211654663, + "logits/rejected": -1.048040747642517, + "logps/chosen": -26.68703842163086, + "logps/rejected": -38.373451232910156, + "loss": 0.3383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.31828218698501587, + "rewards/margins": 1.6841886043548584, + "rewards/rejected": -2.0024707317352295, + "step": 135 + }, + { + "epoch": 1.6118518518518519, + "grad_norm": 18.718740932131244, + "learning_rate": 4.845253227626536e-07, + "logits/chosen": -1.1698411703109741, + "logits/rejected": -0.9726054072380066, + "logps/chosen": -31.787506103515625, + "logps/rejected": -36.80691146850586, + "loss": 0.3202, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5396450757980347, + "rewards/margins": 1.1574627161026, + "rewards/rejected": -1.6971076726913452, + "step": 136 + }, + { + "epoch": 1.6237037037037036, + "grad_norm": 22.38512258408975, + "learning_rate": 4.84071769433971e-07, + "logits/chosen": -1.1697680950164795, + "logits/rejected": -0.976157009601593, + "logps/chosen": -33.117286682128906, + "logps/rejected": -37.835052490234375, + "loss": 0.3394, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3910491466522217, + "rewards/margins": 1.6805509328842163, + "rewards/rejected": -2.0715999603271484, + "step": 137 + }, + { + "epoch": 1.6355555555555554, + "grad_norm": 12.949807947370504, + "learning_rate": 4.836118836238252e-07, + "logits/chosen": -1.0753389596939087, + "logits/rejected": -1.0277228355407715, + "logps/chosen": -28.288246154785156, + "logps/rejected": -48.917877197265625, + "loss": 0.2072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13014695048332214, + "rewards/margins": 2.663167953491211, + "rewards/rejected": -2.7933151721954346, + "step": 138 + }, + { + "epoch": 1.6474074074074074, + "grad_norm": 17.386532268496858, + "learning_rate": 4.831456777737779e-07, + "logits/chosen": -1.0588066577911377, + "logits/rejected": -1.2369458675384521, + "logps/chosen": -20.888940811157227, + "logps/rejected": -32.269954681396484, + "loss": 0.2997, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.23290792107582092, + "rewards/margins": 1.4484302997589111, + "rewards/rejected": -1.6813381910324097, + "step": 139 + }, + { + "epoch": 1.6592592592592592, + "grad_norm": 17.173102470345963, + "learning_rate": 4.826731644963704e-07, + "logits/chosen": -1.2401896715164185, + "logits/rejected": -1.2012938261032104, + "logps/chosen": -26.554821014404297, + "logps/rejected": -38.8468132019043, + "loss": 0.2831, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.18287095427513123, + "rewards/margins": 1.5197136402130127, + "rewards/rejected": -1.7025846242904663, + "step": 140 + }, + { + "epoch": 1.6711111111111112, + "grad_norm": 19.479729521536182, + "learning_rate": 4.82194356574783e-07, + "logits/chosen": -0.905685544013977, + "logits/rejected": -0.8802829384803772, + "logps/chosen": -22.686073303222656, + "logps/rejected": -41.08201599121094, + "loss": 0.2982, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.32921814918518066, + "rewards/margins": 2.4480390548706055, + "rewards/rejected": -2.7772574424743652, + "step": 141 + }, + { + "epoch": 1.682962962962963, + "grad_norm": 20.199723821111718, + "learning_rate": 4.817092669624882e-07, + "logits/chosen": -0.6202248930931091, + "logits/rejected": -1.0940805673599243, + "logps/chosen": -25.580005645751953, + "logps/rejected": -48.21833419799805, + "loss": 0.3245, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22085842490196228, + "rewards/margins": 2.6502928733825684, + "rewards/rejected": -2.8711516857147217, + "step": 142 + }, + { + "epoch": 1.6948148148148148, + "grad_norm": 15.645495852904002, + "learning_rate": 4.812179087829012e-07, + "logits/chosen": -1.0544133186340332, + "logits/rejected": -1.0916544198989868, + "logps/chosen": -32.951744079589844, + "logps/rejected": -39.71953582763672, + "loss": 0.2821, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03797334432601929, + "rewards/margins": 1.9893945455551147, + "rewards/rejected": -1.9514211416244507, + "step": 143 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 17.750295917804582, + "learning_rate": 4.807202953290243e-07, + "logits/chosen": -0.7967438697814941, + "logits/rejected": -0.6681925058364868, + "logps/chosen": -31.293800354003906, + "logps/rejected": -36.21725845336914, + "loss": 0.3091, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2945789396762848, + "rewards/margins": 1.7707284688949585, + "rewards/rejected": -2.0653076171875, + "step": 144 + }, + { + "epoch": 1.7185185185185186, + "grad_norm": 19.260083896787478, + "learning_rate": 4.802164400630872e-07, + "logits/chosen": -1.1299808025360107, + "logits/rejected": -0.9482597708702087, + "logps/chosen": -32.00453567504883, + "logps/rejected": -46.12092208862305, + "loss": 0.3411, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30785343050956726, + "rewards/margins": 2.812390089035034, + "rewards/rejected": -3.1202433109283447, + "step": 145 + }, + { + "epoch": 1.7303703703703703, + "grad_norm": 17.66224210217437, + "learning_rate": 4.797063566161834e-07, + "logits/chosen": -0.6721981763839722, + "logits/rejected": -0.8360898494720459, + "logps/chosen": -24.348730087280273, + "logps/rejected": -43.98560333251953, + "loss": 0.2693, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0023769522085785866, + "rewards/margins": 2.497642993927002, + "rewards/rejected": -2.4952659606933594, + "step": 146 + }, + { + "epoch": 1.7422222222222223, + "grad_norm": 15.415156535878221, + "learning_rate": 4.791900587879009e-07, + "logits/chosen": -0.756319522857666, + "logits/rejected": -0.8637655973434448, + "logps/chosen": -23.030933380126953, + "logps/rejected": -49.56877899169922, + "loss": 0.2918, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04079633206129074, + "rewards/margins": 3.7812483310699463, + "rewards/rejected": -3.822044849395752, + "step": 147 + }, + { + "epoch": 1.7540740740740741, + "grad_norm": 16.570189118993856, + "learning_rate": 4.786675605459487e-07, + "logits/chosen": -0.9925118684768677, + "logits/rejected": -0.9853470921516418, + "logps/chosen": -30.18797492980957, + "logps/rejected": -48.92622756958008, + "loss": 0.2735, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33780062198638916, + "rewards/margins": 2.136337995529175, + "rewards/rejected": -2.4741387367248535, + "step": 148 + }, + { + "epoch": 1.765925925925926, + "grad_norm": 14.487088917255884, + "learning_rate": 4.781388760257799e-07, + "logits/chosen": -0.7950170040130615, + "logits/rejected": -1.119793176651001, + "logps/chosen": -32.499168395996094, + "logps/rejected": -43.4089469909668, + "loss": 0.2295, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3429313600063324, + "rewards/margins": 1.9538040161132812, + "rewards/rejected": -2.2967352867126465, + "step": 149 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 17.295525412903633, + "learning_rate": 4.776040195302079e-07, + "logits/chosen": -1.0716664791107178, + "logits/rejected": -0.7986952066421509, + "logps/chosen": -24.969951629638672, + "logps/rejected": -40.855533599853516, + "loss": 0.2815, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06795130670070648, + "rewards/margins": 2.534419298171997, + "rewards/rejected": -2.6023707389831543, + "step": 150 + }, + { + "epoch": 1.7896296296296297, + "grad_norm": 17.536732294132896, + "learning_rate": 4.770630055290208e-07, + "logits/chosen": -0.9179374575614929, + "logits/rejected": -1.0814074277877808, + "logps/chosen": -30.687143325805664, + "logps/rejected": -50.01094436645508, + "loss": 0.2793, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.38232100009918213, + "rewards/margins": 2.5078325271606445, + "rewards/rejected": -2.890153646469116, + "step": 151 + }, + { + "epoch": 1.8014814814814815, + "grad_norm": 18.46399583316236, + "learning_rate": 4.76515848658589e-07, + "logits/chosen": -0.9876240491867065, + "logits/rejected": -0.6405973434448242, + "logps/chosen": -27.701950073242188, + "logps/rejected": -41.30625915527344, + "loss": 0.3049, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.17839840054512024, + "rewards/margins": 2.12661075592041, + "rewards/rejected": -2.305009126663208, + "step": 152 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 15.710604254869798, + "learning_rate": 4.759625637214696e-07, + "logits/chosen": -1.0109493732452393, + "logits/rejected": -1.0782215595245361, + "logps/chosen": -24.582748413085938, + "logps/rejected": -38.63544464111328, + "loss": 0.2643, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6045380234718323, + "rewards/margins": 2.0814037322998047, + "rewards/rejected": -2.6859421730041504, + "step": 153 + }, + { + "epoch": 1.8251851851851852, + "grad_norm": 17.19159369354583, + "learning_rate": 4.754031656860059e-07, + "logits/chosen": -0.9032812118530273, + "logits/rejected": -0.9155201315879822, + "logps/chosen": -26.887935638427734, + "logps/rejected": -43.84403610229492, + "loss": 0.2882, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3690129816532135, + "rewards/margins": 2.0125298500061035, + "rewards/rejected": -2.381542921066284, + "step": 154 + }, + { + "epoch": 1.837037037037037, + "grad_norm": 17.64840405508708, + "learning_rate": 4.748376696859226e-07, + "logits/chosen": -0.8731366395950317, + "logits/rejected": -0.9406657218933105, + "logps/chosen": -35.8603401184082, + "logps/rejected": -54.6721076965332, + "loss": 0.2996, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39435121417045593, + "rewards/margins": 2.3957834243774414, + "rewards/rejected": -2.7901346683502197, + "step": 155 + }, + { + "epoch": 1.8488888888888888, + "grad_norm": 16.897806608192592, + "learning_rate": 4.74266091019916e-07, + "logits/chosen": -1.1001473665237427, + "logits/rejected": -1.0638172626495361, + "logps/chosen": -32.389549255371094, + "logps/rejected": -41.59333801269531, + "loss": 0.2627, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.47744885087013245, + "rewards/margins": 2.048444986343384, + "rewards/rejected": -2.5258936882019043, + "step": 156 + }, + { + "epoch": 1.8607407407407406, + "grad_norm": 16.755773074311023, + "learning_rate": 4.7368844515124046e-07, + "logits/chosen": -1.2288262844085693, + "logits/rejected": -1.0816912651062012, + "logps/chosen": -20.01605796813965, + "logps/rejected": -37.578102111816406, + "loss": 0.2685, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2156611680984497, + "rewards/margins": 3.439674139022827, + "rewards/rejected": -3.224012851715088, + "step": 157 + }, + { + "epoch": 1.8725925925925926, + "grad_norm": 18.096404326471813, + "learning_rate": 4.7310474770728996e-07, + "logits/chosen": -1.2636739015579224, + "logits/rejected": -1.2596609592437744, + "logps/chosen": -27.145275115966797, + "logps/rejected": -46.22590637207031, + "loss": 0.3012, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3016894459724426, + "rewards/margins": 2.962885856628418, + "rewards/rejected": -3.2645750045776367, + "step": 158 + }, + { + "epoch": 1.8844444444444446, + "grad_norm": 20.583684819002986, + "learning_rate": 4.725150144791753e-07, + "logits/chosen": -1.1442478895187378, + "logits/rejected": -1.1148111820220947, + "logps/chosen": -29.054805755615234, + "logps/rejected": -35.72052001953125, + "loss": 0.3098, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.17636916041374207, + "rewards/margins": 1.371252179145813, + "rewards/rejected": -1.547621488571167, + "step": 159 + }, + { + "epoch": 1.8962962962962964, + "grad_norm": 19.46902005864018, + "learning_rate": 4.719192614212969e-07, + "logits/chosen": -0.7011775374412537, + "logits/rejected": -1.0104095935821533, + "logps/chosen": -26.7205753326416, + "logps/rejected": -44.29551696777344, + "loss": 0.2899, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.46987485885620117, + "rewards/margins": 2.1664927005767822, + "rewards/rejected": -2.6363675594329834, + "step": 160 + }, + { + "epoch": 1.9081481481481481, + "grad_norm": 16.802592628002614, + "learning_rate": 4.713175046509131e-07, + "logits/chosen": -0.9744415283203125, + "logits/rejected": -0.8675547242164612, + "logps/chosen": -24.74662971496582, + "logps/rejected": -43.67926788330078, + "loss": 0.2793, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.15661165118217468, + "rewards/margins": 2.913247585296631, + "rewards/rejected": -3.069859266281128, + "step": 161 + }, + { + "epoch": 1.92, + "grad_norm": 17.67384446887718, + "learning_rate": 4.707097604477045e-07, + "logits/chosen": -0.8303030729293823, + "logits/rejected": -0.8715736865997314, + "logps/chosen": -27.890966415405273, + "logps/rejected": -43.261348724365234, + "loss": 0.2763, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.22982636094093323, + "rewards/margins": 3.134204149246216, + "rewards/rejected": -3.364030361175537, + "step": 162 + }, + { + "epoch": 1.9318518518518517, + "grad_norm": 19.255369598632385, + "learning_rate": 4.700960452533328e-07, + "logits/chosen": -0.9890022873878479, + "logits/rejected": -0.789630115032196, + "logps/chosen": -29.18407440185547, + "logps/rejected": -35.658119201660156, + "loss": 0.2787, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19787032902240753, + "rewards/margins": 2.666562080383301, + "rewards/rejected": -2.8644325733184814, + "step": 163 + }, + { + "epoch": 1.9437037037037037, + "grad_norm": 16.66917060987199, + "learning_rate": 4.694763756709967e-07, + "logits/chosen": -0.6940379738807678, + "logits/rejected": -0.7478067874908447, + "logps/chosen": -23.664865493774414, + "logps/rejected": -43.157230377197266, + "loss": 0.288, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.17220884561538696, + "rewards/margins": 2.759270668029785, + "rewards/rejected": -2.9314796924591064, + "step": 164 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 17.594348865094073, + "learning_rate": 4.688507684649825e-07, + "logits/chosen": -0.8272866010665894, + "logits/rejected": -0.8017688989639282, + "logps/chosen": -33.46213150024414, + "logps/rejected": -47.20207977294922, + "loss": 0.301, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6943995356559753, + "rewards/margins": 2.664170742034912, + "rewards/rejected": -3.3585705757141113, + "step": 165 + }, + { + "epoch": 1.9674074074074075, + "grad_norm": 18.253880938939563, + "learning_rate": 4.6821924056021053e-07, + "logits/chosen": -1.0482605695724487, + "logits/rejected": -0.9928416013717651, + "logps/chosen": -31.832792282104492, + "logps/rejected": -44.5983772277832, + "loss": 0.2999, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025602400302886963, + "rewards/margins": 2.2407517433166504, + "rewards/rejected": -2.2663540840148926, + "step": 166 + }, + { + "epoch": 1.9792592592592593, + "grad_norm": 18.56196744633324, + "learning_rate": 4.6758180904177715e-07, + "logits/chosen": -1.0284857749938965, + "logits/rejected": -0.9405824542045593, + "logps/chosen": -28.0195369720459, + "logps/rejected": -47.371124267578125, + "loss": 0.3204, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3122991621494293, + "rewards/margins": 2.1272220611572266, + "rewards/rejected": -2.439521312713623, + "step": 167 + }, + { + "epoch": 1.991111111111111, + "grad_norm": 13.908988351918019, + "learning_rate": 4.669384911544926e-07, + "logits/chosen": -0.8129782676696777, + "logits/rejected": -0.9865670800209045, + "logps/chosen": -22.810033798217773, + "logps/rejected": -52.896263122558594, + "loss": 0.234, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1582772433757782, + "rewards/margins": 3.5472700595855713, + "rewards/rejected": -3.705547332763672, + "step": 168 + }, + { + "epoch": 2.002962962962963, + "grad_norm": 14.484614182012795, + "learning_rate": 4.6628930430241495e-07, + "logits/chosen": -0.7469329237937927, + "logits/rejected": -0.502386212348938, + "logps/chosen": -27.284469604492188, + "logps/rejected": -38.318477630615234, + "loss": 0.2358, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07404109835624695, + "rewards/margins": 3.234562397003174, + "rewards/rejected": -3.1605215072631836, + "step": 169 + }, + { + "epoch": 2.0148148148148146, + "grad_norm": 10.915796020040675, + "learning_rate": 4.6563426604837817e-07, + "logits/chosen": -1.315629005432129, + "logits/rejected": -1.4473894834518433, + "logps/chosen": -26.924976348876953, + "logps/rejected": -51.19914627075195, + "loss": 0.2154, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.18387992680072784, + "rewards/margins": 2.322176218032837, + "rewards/rejected": -2.5060558319091797, + "step": 170 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 10.428088727132913, + "learning_rate": 4.649733941135183e-07, + "logits/chosen": -0.8857893943786621, + "logits/rejected": -0.9030505418777466, + "logps/chosen": -24.972835540771484, + "logps/rejected": -43.09310531616211, + "loss": 0.1603, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06505993753671646, + "rewards/margins": 3.5897836685180664, + "rewards/rejected": -3.654843330383301, + "step": 171 + }, + { + "epoch": 2.0385185185185186, + "grad_norm": 12.12837797479529, + "learning_rate": 4.6430670637679294e-07, + "logits/chosen": -0.7054718136787415, + "logits/rejected": -0.8474084734916687, + "logps/chosen": -21.63006019592285, + "logps/rejected": -39.9059944152832, + "loss": 0.1903, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05613371729850769, + "rewards/margins": 2.584840774536133, + "rewards/rejected": -2.640974521636963, + "step": 172 + }, + { + "epoch": 2.0503703703703704, + "grad_norm": 10.668378226356547, + "learning_rate": 4.636342208744981e-07, + "logits/chosen": -1.0311239957809448, + "logits/rejected": -1.260777473449707, + "logps/chosen": -23.770797729492188, + "logps/rejected": -47.466041564941406, + "loss": 0.1668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11173764616250992, + "rewards/margins": 3.481583595275879, + "rewards/rejected": -3.5933213233947754, + "step": 173 + }, + { + "epoch": 2.062222222222222, + "grad_norm": 10.557285470526292, + "learning_rate": 4.629559557997804e-07, + "logits/chosen": -0.9182557463645935, + "logits/rejected": -0.9343796968460083, + "logps/chosen": -22.01238441467285, + "logps/rejected": -42.540916442871094, + "loss": 0.1777, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1580294966697693, + "rewards/margins": 3.1986570358276367, + "rewards/rejected": -3.356686592102051, + "step": 174 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 11.326533415566665, + "learning_rate": 4.6227192950214435e-07, + "logits/chosen": -1.0387791395187378, + "logits/rejected": -0.9052732586860657, + "logps/chosen": -27.1229190826416, + "logps/rejected": -39.944488525390625, + "loss": 0.1671, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016452651470899582, + "rewards/margins": 2.701723575592041, + "rewards/rejected": -2.7181763648986816, + "step": 175 + }, + { + "epoch": 2.0859259259259257, + "grad_norm": 11.013716006067964, + "learning_rate": 4.615821604869563e-07, + "logits/chosen": -0.6593332290649414, + "logits/rejected": -0.6703491806983948, + "logps/chosen": -30.929479598999023, + "logps/rejected": -53.678165435791016, + "loss": 0.1575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2048792690038681, + "rewards/margins": 3.190211296081543, + "rewards/rejected": -3.3950905799865723, + "step": 176 + }, + { + "epoch": 2.097777777777778, + "grad_norm": 9.522443462876495, + "learning_rate": 4.6088666741494384e-07, + "logits/chosen": -1.015365719795227, + "logits/rejected": -1.0170753002166748, + "logps/chosen": -32.784976959228516, + "logps/rejected": -62.65760040283203, + "loss": 0.1466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45717763900756836, + "rewards/margins": 4.587207317352295, + "rewards/rejected": -5.044384956359863, + "step": 177 + }, + { + "epoch": 2.1096296296296297, + "grad_norm": 10.004257598897565, + "learning_rate": 4.6018546910169067e-07, + "logits/chosen": -0.7767499685287476, + "logits/rejected": -0.8015838861465454, + "logps/chosen": -30.55481719970703, + "logps/rejected": -54.31606674194336, + "loss": 0.1471, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2950357496738434, + "rewards/margins": 3.3505477905273438, + "rewards/rejected": -3.645583391189575, + "step": 178 + }, + { + "epoch": 2.1214814814814815, + "grad_norm": 10.418887738265333, + "learning_rate": 4.5947858451712773e-07, + "logits/chosen": -1.1200653314590454, + "logits/rejected": -1.180110216140747, + "logps/chosen": -29.036422729492188, + "logps/rejected": -50.913429260253906, + "loss": 0.1772, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10382096469402313, + "rewards/margins": 3.6935832500457764, + "rewards/rejected": -3.5897626876831055, + "step": 179 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 10.311681664393884, + "learning_rate": 4.5876603278502027e-07, + "logits/chosen": -0.6714676022529602, + "logits/rejected": -0.9245094060897827, + "logps/chosen": -24.886255264282227, + "logps/rejected": -59.40272521972656, + "loss": 0.1725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18294279277324677, + "rewards/margins": 3.529733180999756, + "rewards/rejected": -3.7126760482788086, + "step": 180 + }, + { + "epoch": 2.145185185185185, + "grad_norm": 10.598338326995862, + "learning_rate": 4.580478331824498e-07, + "logits/chosen": -0.9760642051696777, + "logits/rejected": -1.0019596815109253, + "logps/chosen": -20.441396713256836, + "logps/rejected": -28.71832275390625, + "loss": 0.1925, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18639449775218964, + "rewards/margins": 2.0053529739379883, + "rewards/rejected": -1.8189586400985718, + "step": 181 + }, + { + "epoch": 2.157037037037037, + "grad_norm": 10.273866806923419, + "learning_rate": 4.573240051392935e-07, + "logits/chosen": -0.9190815687179565, + "logits/rejected": -0.810968279838562, + "logps/chosen": -31.995441436767578, + "logps/rejected": -47.2755012512207, + "loss": 0.1488, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1086086630821228, + "rewards/margins": 2.7792186737060547, + "rewards/rejected": -2.8878276348114014, + "step": 182 + }, + { + "epoch": 2.168888888888889, + "grad_norm": 10.512121243888513, + "learning_rate": 4.565945682376977e-07, + "logits/chosen": -0.9659938216209412, + "logits/rejected": -0.9679660797119141, + "logps/chosen": -26.895111083984375, + "logps/rejected": -43.16779708862305, + "loss": 0.1553, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0222741961479187, + "rewards/margins": 2.5385921001434326, + "rewards/rejected": -2.5608668327331543, + "step": 183 + }, + { + "epoch": 2.180740740740741, + "grad_norm": 11.292043059472906, + "learning_rate": 4.5585954221154853e-07, + "logits/chosen": -0.9343494176864624, + "logits/rejected": -1.0062519311904907, + "logps/chosen": -22.46919822692871, + "logps/rejected": -44.482078552246094, + "loss": 0.1689, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1603562980890274, + "rewards/margins": 3.266386032104492, + "rewards/rejected": -3.4267418384552, + "step": 184 + }, + { + "epoch": 2.1925925925925926, + "grad_norm": 11.816672083489024, + "learning_rate": 4.551189469459382e-07, + "logits/chosen": -0.786745011806488, + "logits/rejected": -0.6139059066772461, + "logps/chosen": -29.019519805908203, + "logps/rejected": -42.548431396484375, + "loss": 0.1738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33853879570961, + "rewards/margins": 2.533534288406372, + "rewards/rejected": -2.87207293510437, + "step": 185 + }, + { + "epoch": 2.2044444444444444, + "grad_norm": 11.1270432752319, + "learning_rate": 4.5437280247662646e-07, + "logits/chosen": -1.0432802438735962, + "logits/rejected": -0.9425604939460754, + "logps/chosen": -32.64457702636719, + "logps/rejected": -47.38275909423828, + "loss": 0.1729, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1137063056230545, + "rewards/margins": 2.688105583190918, + "rewards/rejected": -2.801811933517456, + "step": 186 + }, + { + "epoch": 2.216296296296296, + "grad_norm": 9.879497457215198, + "learning_rate": 4.5362112898949947e-07, + "logits/chosen": -1.1339491605758667, + "logits/rejected": -0.8624970316886902, + "logps/chosen": -29.631816864013672, + "logps/rejected": -46.224395751953125, + "loss": 0.155, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.18955475091934204, + "rewards/margins": 3.8356757164001465, + "rewards/rejected": -3.64612078666687, + "step": 187 + }, + { + "epoch": 2.228148148148148, + "grad_norm": 11.35395445567028, + "learning_rate": 4.528639468200226e-07, + "logits/chosen": -1.0913629531860352, + "logits/rejected": -1.0643333196640015, + "logps/chosen": -26.124116897583008, + "logps/rejected": -44.2266845703125, + "loss": 0.1582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25467032194137573, + "rewards/margins": 3.40421724319458, + "rewards/rejected": -3.6588873863220215, + "step": 188 + }, + { + "epoch": 2.24, + "grad_norm": 10.798532598173606, + "learning_rate": 4.5210127645269125e-07, + "logits/chosen": -0.8021432757377625, + "logits/rejected": -0.8973668813705444, + "logps/chosen": -24.38117790222168, + "logps/rejected": -44.69050598144531, + "loss": 0.1403, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16808071732521057, + "rewards/margins": 3.162228584289551, + "rewards/rejected": -3.3303093910217285, + "step": 189 + }, + { + "epoch": 2.251851851851852, + "grad_norm": 11.00778435675744, + "learning_rate": 4.5133313852047613e-07, + "logits/chosen": -0.7498683929443359, + "logits/rejected": -0.8889190554618835, + "logps/chosen": -24.70016098022461, + "logps/rejected": -40.018638610839844, + "loss": 0.1443, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.23445232212543488, + "rewards/margins": 2.9690420627593994, + "rewards/rejected": -3.203494071960449, + "step": 190 + }, + { + "epoch": 2.2637037037037038, + "grad_norm": 9.650405141066392, + "learning_rate": 4.5055955380426514e-07, + "logits/chosen": -1.0229731798171997, + "logits/rejected": -1.1155050992965698, + "logps/chosen": -26.343704223632812, + "logps/rejected": -41.02541732788086, + "loss": 0.1544, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05786347761750221, + "rewards/margins": 3.1654956340789795, + "rewards/rejected": -3.1076321601867676, + "step": 191 + }, + { + "epoch": 2.2755555555555556, + "grad_norm": 9.921102833174835, + "learning_rate": 4.4978054323230144e-07, + "logits/chosen": -1.1550260782241821, + "logits/rejected": -1.0600630044937134, + "logps/chosen": -28.518699645996094, + "logps/rejected": -44.826168060302734, + "loss": 0.1492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3105316162109375, + "rewards/margins": 4.2448906898498535, + "rewards/rejected": -4.555422306060791, + "step": 192 + }, + { + "epoch": 2.2874074074074073, + "grad_norm": 9.450635790377559, + "learning_rate": 4.489961278796167e-07, + "logits/chosen": -1.0403023958206177, + "logits/rejected": -1.0114576816558838, + "logps/chosen": -38.044036865234375, + "logps/rejected": -50.515472412109375, + "loss": 0.1563, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5873116254806519, + "rewards/margins": 3.311967134475708, + "rewards/rejected": -3.8992786407470703, + "step": 193 + }, + { + "epoch": 2.299259259259259, + "grad_norm": 9.85110506830502, + "learning_rate": 4.482063289674618e-07, + "logits/chosen": -0.8661289215087891, + "logits/rejected": -0.8539247512817383, + "logps/chosen": -25.6925048828125, + "logps/rejected": -40.71975326538086, + "loss": 0.142, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09132147580385208, + "rewards/margins": 3.0964341163635254, + "rewards/rejected": -3.005112648010254, + "step": 194 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 9.57230358742824, + "learning_rate": 4.4741116786273176e-07, + "logits/chosen": -0.9691765308380127, + "logits/rejected": -1.184732437133789, + "logps/chosen": -26.444503784179688, + "logps/rejected": -45.394161224365234, + "loss": 0.1358, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04939919337630272, + "rewards/margins": 3.6279492378234863, + "rewards/rejected": -3.578549861907959, + "step": 195 + }, + { + "epoch": 2.322962962962963, + "grad_norm": 10.264213642082447, + "learning_rate": 4.466106660773884e-07, + "logits/chosen": -0.8523711562156677, + "logits/rejected": -0.7561182379722595, + "logps/chosen": -29.33661651611328, + "logps/rejected": -43.26116180419922, + "loss": 0.1323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02262909710407257, + "rewards/margins": 3.317958354949951, + "rewards/rejected": -3.3405871391296387, + "step": 196 + }, + { + "epoch": 2.334814814814815, + "grad_norm": 10.508283984586193, + "learning_rate": 4.4580484526787807e-07, + "logits/chosen": -0.7363325953483582, + "logits/rejected": -0.7178278565406799, + "logps/chosen": -24.33274269104004, + "logps/rejected": -34.1280403137207, + "loss": 0.1573, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.02388627827167511, + "rewards/margins": 2.7086284160614014, + "rewards/rejected": -2.6847422122955322, + "step": 197 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 9.573377777372183, + "learning_rate": 4.44993727234546e-07, + "logits/chosen": -0.8255077004432678, + "logits/rejected": -0.7383131980895996, + "logps/chosen": -20.382017135620117, + "logps/rejected": -36.610069274902344, + "loss": 0.1506, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15329806506633759, + "rewards/margins": 2.9509220123291016, + "rewards/rejected": -2.797624111175537, + "step": 198 + }, + { + "epoch": 2.3585185185185185, + "grad_norm": 9.188058087289072, + "learning_rate": 4.4417733392104585e-07, + "logits/chosen": -0.9828134775161743, + "logits/rejected": -0.9461156129837036, + "logps/chosen": -29.49842071533203, + "logps/rejected": -44.92794418334961, + "loss": 0.1324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010838674381375313, + "rewards/margins": 2.9451708793640137, + "rewards/rejected": -2.95600962638855, + "step": 199 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 9.995938278634934, + "learning_rate": 4.4335568741374695e-07, + "logits/chosen": -1.1138256788253784, + "logits/rejected": -0.9974204301834106, + "logps/chosen": -27.004098892211914, + "logps/rejected": -46.76878356933594, + "loss": 0.1287, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01464901864528656, + "rewards/margins": 3.5402395725250244, + "rewards/rejected": -3.525590181350708, + "step": 200 + }, + { + "epoch": 2.3822222222222225, + "grad_norm": 9.269425170104977, + "learning_rate": 4.425288099411364e-07, + "logits/chosen": -0.9596495628356934, + "logits/rejected": -0.781050443649292, + "logps/chosen": -36.49948501586914, + "logps/rejected": -46.57865524291992, + "loss": 0.116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34845054149627686, + "rewards/margins": 3.1367316246032715, + "rewards/rejected": -3.485182285308838, + "step": 201 + }, + { + "epoch": 2.3940740740740742, + "grad_norm": 8.391778679248985, + "learning_rate": 4.4169672387321735e-07, + "logits/chosen": -0.9714689254760742, + "logits/rejected": -1.1357210874557495, + "logps/chosen": -26.742515563964844, + "logps/rejected": -48.0218620300293, + "loss": 0.1191, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12224035710096359, + "rewards/margins": 3.619513511657715, + "rewards/rejected": -3.4972729682922363, + "step": 202 + }, + { + "epoch": 2.405925925925926, + "grad_norm": 9.299446663078193, + "learning_rate": 4.408594517209045e-07, + "logits/chosen": -1.0814424753189087, + "logits/rejected": -1.1163935661315918, + "logps/chosen": -27.21017837524414, + "logps/rejected": -46.6378288269043, + "loss": 0.1414, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4369148015975952, + "rewards/margins": 3.762479066848755, + "rewards/rejected": -4.1993937492370605, + "step": 203 + }, + { + "epoch": 2.417777777777778, + "grad_norm": 9.608908773757774, + "learning_rate": 4.4001701613541454e-07, + "logits/chosen": -1.0588593482971191, + "logits/rejected": -0.7137413024902344, + "logps/chosen": -29.38337516784668, + "logps/rejected": -40.47539520263672, + "loss": 0.1348, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1216193437576294, + "rewards/margins": 3.52174711227417, + "rewards/rejected": -3.400127649307251, + "step": 204 + }, + { + "epoch": 2.4296296296296296, + "grad_norm": 9.76537745467198, + "learning_rate": 4.391694399076536e-07, + "logits/chosen": -1.0141160488128662, + "logits/rejected": -1.006161093711853, + "logps/chosen": -18.852293014526367, + "logps/rejected": -47.88473129272461, + "loss": 0.1292, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.040902793407440186, + "rewards/margins": 4.4104509353637695, + "rewards/rejected": -4.3695478439331055, + "step": 205 + }, + { + "epoch": 2.4414814814814814, + "grad_norm": 11.151794344568627, + "learning_rate": 4.383167459676008e-07, + "logits/chosen": -1.2655813694000244, + "logits/rejected": -1.3579968214035034, + "logps/chosen": -36.24526596069336, + "logps/rejected": -64.4418716430664, + "loss": 0.1529, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33499276638031006, + "rewards/margins": 4.198886394500732, + "rewards/rejected": -4.533878803253174, + "step": 206 + }, + { + "epoch": 2.453333333333333, + "grad_norm": 9.503514745890126, + "learning_rate": 4.374589573836874e-07, + "logits/chosen": -0.9888389706611633, + "logits/rejected": -0.9190107583999634, + "logps/chosen": -25.66510772705078, + "logps/rejected": -48.77112579345703, + "loss": 0.1237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2253144383430481, + "rewards/margins": 3.847475051879883, + "rewards/rejected": -4.072790145874023, + "step": 207 + }, + { + "epoch": 2.4651851851851854, + "grad_norm": 10.350601972964697, + "learning_rate": 4.365960973621734e-07, + "logits/chosen": -0.8265293836593628, + "logits/rejected": -1.0150339603424072, + "logps/chosen": -24.15806770324707, + "logps/rejected": -58.613380432128906, + "loss": 0.1432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30442744493484497, + "rewards/margins": 4.33758544921875, + "rewards/rejected": -4.642012596130371, + "step": 208 + }, + { + "epoch": 2.477037037037037, + "grad_norm": 8.643760874362815, + "learning_rate": 4.357281892465191e-07, + "logits/chosen": -0.9911346435546875, + "logits/rejected": -1.0368643999099731, + "logps/chosen": -26.939849853515625, + "logps/rejected": -56.80491638183594, + "loss": 0.119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23226478695869446, + "rewards/margins": 3.6605029106140137, + "rewards/rejected": -3.892767906188965, + "step": 209 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 8.784181941966711, + "learning_rate": 4.348552565167542e-07, + "logits/chosen": -0.926344633102417, + "logits/rejected": -0.8433751463890076, + "logps/chosen": -28.94524574279785, + "logps/rejected": -44.438072204589844, + "loss": 0.1268, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3782831132411957, + "rewards/margins": 3.0210046768188477, + "rewards/rejected": -3.3992879390716553, + "step": 210 + }, + { + "epoch": 2.5007407407407407, + "grad_norm": 9.511335480740543, + "learning_rate": 4.3397732278884194e-07, + "logits/chosen": -0.6906044483184814, + "logits/rejected": -0.7649115324020386, + "logps/chosen": -33.39212417602539, + "logps/rejected": -48.415008544921875, + "loss": 0.1236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3532525897026062, + "rewards/margins": 2.9834935665130615, + "rewards/rejected": -3.3367464542388916, + "step": 211 + }, + { + "epoch": 2.5125925925925925, + "grad_norm": 9.170534986447953, + "learning_rate": 4.330944118140406e-07, + "logits/chosen": -1.1332037448883057, + "logits/rejected": -1.249849796295166, + "logps/chosen": -23.844146728515625, + "logps/rejected": -46.68682098388672, + "loss": 0.1202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03869599476456642, + "rewards/margins": 3.55596923828125, + "rewards/rejected": -3.59466552734375, + "step": 212 + }, + { + "epoch": 2.5244444444444447, + "grad_norm": 10.570737637064244, + "learning_rate": 4.322065474782609e-07, + "logits/chosen": -0.7816108465194702, + "logits/rejected": -0.8124703168869019, + "logps/chosen": -27.124149322509766, + "logps/rejected": -50.91386413574219, + "loss": 0.1535, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2354280799627304, + "rewards/margins": 3.9667367935180664, + "rewards/rejected": -4.202165126800537, + "step": 213 + }, + { + "epoch": 2.536296296296296, + "grad_norm": 12.000930534607912, + "learning_rate": 4.313137538014198e-07, + "logits/chosen": -1.0383765697479248, + "logits/rejected": -1.054479956626892, + "logps/chosen": -20.81098747253418, + "logps/rejected": -40.86846923828125, + "loss": 0.1474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17991256713867188, + "rewards/margins": 2.990743637084961, + "rewards/rejected": -3.170656442642212, + "step": 214 + }, + { + "epoch": 2.5481481481481483, + "grad_norm": 9.377408723940578, + "learning_rate": 4.304160549367906e-07, + "logits/chosen": -1.2460458278656006, + "logits/rejected": -1.0874885320663452, + "logps/chosen": -25.634910583496094, + "logps/rejected": -35.86411666870117, + "loss": 0.1129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2946632504463196, + "rewards/margins": 3.1012935638427734, + "rewards/rejected": -3.3959569931030273, + "step": 215 + }, + { + "epoch": 2.56, + "grad_norm": 11.87094854333206, + "learning_rate": 4.295134751703492e-07, + "logits/chosen": -0.928742527961731, + "logits/rejected": -0.9784969091415405, + "logps/chosen": -24.898561477661133, + "logps/rejected": -51.34593963623047, + "loss": 0.1659, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3665264844894409, + "rewards/margins": 4.217537879943848, + "rewards/rejected": -4.58406400680542, + "step": 216 + }, + { + "epoch": 2.571851851851852, + "grad_norm": 9.07689060873071, + "learning_rate": 4.28606038920118e-07, + "logits/chosen": -0.8964906930923462, + "logits/rejected": -0.8997987508773804, + "logps/chosen": -22.863473892211914, + "logps/rejected": -43.20418930053711, + "loss": 0.1211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36831557750701904, + "rewards/margins": 3.239995002746582, + "rewards/rejected": -3.6083106994628906, + "step": 217 + }, + { + "epoch": 2.5837037037037036, + "grad_norm": 10.096635066622941, + "learning_rate": 4.276937707355044e-07, + "logits/chosen": -0.6937326192855835, + "logits/rejected": -0.7635350823402405, + "logps/chosen": -31.77448272705078, + "logps/rejected": -58.26279830932617, + "loss": 0.1309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28910571336746216, + "rewards/margins": 4.707897186279297, + "rewards/rejected": -4.997003078460693, + "step": 218 + }, + { + "epoch": 2.5955555555555554, + "grad_norm": 8.283171585551857, + "learning_rate": 4.2677669529663686e-07, + "logits/chosen": -1.1899882555007935, + "logits/rejected": -1.0946217775344849, + "logps/chosen": -27.072429656982422, + "logps/rejected": -38.691810607910156, + "loss": 0.1152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14001135528087616, + "rewards/margins": 2.991671562194824, + "rewards/rejected": -3.131682872772217, + "step": 219 + }, + { + "epoch": 2.6074074074074076, + "grad_norm": 7.992901648965998, + "learning_rate": 4.2585483741369755e-07, + "logits/chosen": -0.7397277355194092, + "logits/rejected": -0.9631584882736206, + "logps/chosen": -23.084850311279297, + "logps/rejected": -41.13532257080078, + "loss": 0.1159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06614308059215546, + "rewards/margins": 2.895981788635254, + "rewards/rejected": -2.962125301361084, + "step": 220 + }, + { + "epoch": 2.6192592592592594, + "grad_norm": 11.310778875512614, + "learning_rate": 4.2492822202625065e-07, + "logits/chosen": -0.9721293449401855, + "logits/rejected": -1.079483151435852, + "logps/chosen": -21.184463500976562, + "logps/rejected": -46.17851638793945, + "loss": 0.1222, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06296464800834656, + "rewards/margins": 4.013933181762695, + "rewards/rejected": -4.076898097991943, + "step": 221 + }, + { + "epoch": 2.631111111111111, + "grad_norm": 8.569036120090367, + "learning_rate": 4.239968742025684e-07, + "logits/chosen": -1.535069227218628, + "logits/rejected": -1.5164189338684082, + "logps/chosen": -23.31195640563965, + "logps/rejected": -41.743492126464844, + "loss": 0.1046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2246365249156952, + "rewards/margins": 3.2981185913085938, + "rewards/rejected": -3.5227551460266113, + "step": 222 + }, + { + "epoch": 2.642962962962963, + "grad_norm": 9.763643273504174, + "learning_rate": 4.2306081913895177e-07, + "logits/chosen": -0.8104032278060913, + "logits/rejected": -0.7292832732200623, + "logps/chosen": -26.804920196533203, + "logps/rejected": -33.72347640991211, + "loss": 0.1207, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5029786229133606, + "rewards/margins": 2.10044527053833, + "rewards/rejected": -2.603423833847046, + "step": 223 + }, + { + "epoch": 2.6548148148148147, + "grad_norm": 8.110012154364268, + "learning_rate": 4.2212008215905e-07, + "logits/chosen": -0.9389030933380127, + "logits/rejected": -1.1046040058135986, + "logps/chosen": -38.54084014892578, + "logps/rejected": -60.23002243041992, + "loss": 0.1005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8747321367263794, + "rewards/margins": 4.18320369720459, + "rewards/rejected": -5.05793571472168, + "step": 224 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 7.651104704560613, + "learning_rate": 4.2117468871317465e-07, + "logits/chosen": -0.6991132497787476, + "logits/rejected": -1.011946678161621, + "logps/chosen": -25.952993392944336, + "logps/rejected": -59.70689392089844, + "loss": 0.0994, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3089551329612732, + "rewards/margins": 4.471611499786377, + "rewards/rejected": -4.780567169189453, + "step": 225 + }, + { + "epoch": 2.6785185185185183, + "grad_norm": 8.662418287718898, + "learning_rate": 4.2022466437761154e-07, + "logits/chosen": -0.7546372413635254, + "logits/rejected": -0.9424067735671997, + "logps/chosen": -23.357009887695312, + "logps/rejected": -48.18181610107422, + "loss": 0.0979, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04475121945142746, + "rewards/margins": 3.4985172748565674, + "rewards/rejected": -3.5432686805725098, + "step": 226 + }, + { + "epoch": 2.6903703703703705, + "grad_norm": 9.933414328601483, + "learning_rate": 4.1927003485392873e-07, + "logits/chosen": -0.9893782734870911, + "logits/rejected": -0.8852970004081726, + "logps/chosen": -29.632408142089844, + "logps/rejected": -43.87162399291992, + "loss": 0.1189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17188510298728943, + "rewards/margins": 3.24285626411438, + "rewards/rejected": -3.414741039276123, + "step": 227 + }, + { + "epoch": 2.7022222222222223, + "grad_norm": 9.996913313576108, + "learning_rate": 4.18310825968281e-07, + "logits/chosen": -0.6748302578926086, + "logits/rejected": -0.5683417320251465, + "logps/chosen": -29.551555633544922, + "logps/rejected": -41.79899978637695, + "loss": 0.1211, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5972310900688171, + "rewards/margins": 2.882406711578369, + "rewards/rejected": -3.47963809967041, + "step": 228 + }, + { + "epoch": 2.714074074074074, + "grad_norm": 9.705460406332598, + "learning_rate": 4.173470636707115e-07, + "logits/chosen": -0.8065865635871887, + "logits/rejected": -0.6933514475822449, + "logps/chosen": -28.950008392333984, + "logps/rejected": -50.10881042480469, + "loss": 0.1251, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5557261109352112, + "rewards/margins": 3.8404664993286133, + "rewards/rejected": -4.396193027496338, + "step": 229 + }, + { + "epoch": 2.725925925925926, + "grad_norm": 10.109073403000078, + "learning_rate": 4.1637877403444923e-07, + "logits/chosen": -1.1754989624023438, + "logits/rejected": -1.002323031425476, + "logps/chosen": -35.7281608581543, + "logps/rejected": -55.17790222167969, + "loss": 0.1276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6030625104904175, + "rewards/margins": 4.27664852142334, + "rewards/rejected": -4.879711151123047, + "step": 230 + }, + { + "epoch": 2.7377777777777776, + "grad_norm": 8.645925075071077, + "learning_rate": 4.1540598325520406e-07, + "logits/chosen": -1.399937391281128, + "logits/rejected": -1.1283848285675049, + "logps/chosen": -24.478912353515625, + "logps/rejected": -35.189491271972656, + "loss": 0.1134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45019960403442383, + "rewards/margins": 3.054943561553955, + "rewards/rejected": -3.505143165588379, + "step": 231 + }, + { + "epoch": 2.74962962962963, + "grad_norm": 10.429252502571046, + "learning_rate": 4.144287176504582e-07, + "logits/chosen": -0.7452750205993652, + "logits/rejected": -0.5704357028007507, + "logps/chosen": -36.41873550415039, + "logps/rejected": -53.760765075683594, + "loss": 0.1384, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.666602611541748, + "rewards/margins": 3.7374300956726074, + "rewards/rejected": -4.4040327072143555, + "step": 232 + }, + { + "epoch": 2.7614814814814816, + "grad_norm": 10.151446493994069, + "learning_rate": 4.1344700365875353e-07, + "logits/chosen": -1.1818780899047852, + "logits/rejected": -1.3208847045898438, + "logps/chosen": -22.36372184753418, + "logps/rejected": -48.73514938354492, + "loss": 0.1381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4263989329338074, + "rewards/margins": 3.8192248344421387, + "rewards/rejected": -4.245623588562012, + "step": 233 + }, + { + "epoch": 2.7733333333333334, + "grad_norm": 8.923699196652041, + "learning_rate": 4.1246086783897713e-07, + "logits/chosen": -1.076073169708252, + "logits/rejected": -0.8665668964385986, + "logps/chosen": -29.229549407958984, + "logps/rejected": -52.872596740722656, + "loss": 0.1257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6716349124908447, + "rewards/margins": 4.262983322143555, + "rewards/rejected": -4.934618949890137, + "step": 234 + }, + { + "epoch": 2.785185185185185, + "grad_norm": 9.216060394380802, + "learning_rate": 4.1147033686964213e-07, + "logits/chosen": -0.9747135043144226, + "logits/rejected": -0.8782777190208435, + "logps/chosen": -21.773653030395508, + "logps/rejected": -36.27745056152344, + "loss": 0.115, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3823280334472656, + "rewards/margins": 3.0221927165985107, + "rewards/rejected": -3.4045207500457764, + "step": 235 + }, + { + "epoch": 2.797037037037037, + "grad_norm": 10.290543608918464, + "learning_rate": 4.104754375481664e-07, + "logits/chosen": -1.1765553951263428, + "logits/rejected": -1.2661182880401611, + "logps/chosen": -31.247966766357422, + "logps/rejected": -58.30694580078125, + "loss": 0.1424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6638414263725281, + "rewards/margins": 5.012779235839844, + "rewards/rejected": -5.6766204833984375, + "step": 236 + }, + { + "epoch": 2.8088888888888888, + "grad_norm": 9.054189170474283, + "learning_rate": 4.0947619679014733e-07, + "logits/chosen": -0.7130009531974792, + "logits/rejected": -0.9547990560531616, + "logps/chosen": -33.134037017822266, + "logps/rejected": -51.69905471801758, + "loss": 0.1251, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8230234980583191, + "rewards/margins": 3.9294910430908203, + "rewards/rejected": -4.752514362335205, + "step": 237 + }, + { + "epoch": 2.8207407407407405, + "grad_norm": 7.909168568867212, + "learning_rate": 4.084726416286337e-07, + "logits/chosen": -0.9382051825523376, + "logits/rejected": -0.887077808380127, + "logps/chosen": -27.565048217773438, + "logps/rejected": -46.6566047668457, + "loss": 0.0897, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8664588928222656, + "rewards/margins": 3.4661688804626465, + "rewards/rejected": -4.332627773284912, + "step": 238 + }, + { + "epoch": 2.8325925925925928, + "grad_norm": 11.86775329162737, + "learning_rate": 4.0746479921339456e-07, + "logits/chosen": -1.0731661319732666, + "logits/rejected": -0.9709546566009521, + "logps/chosen": -42.645668029785156, + "logps/rejected": -56.481353759765625, + "loss": 0.126, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8624676465988159, + "rewards/margins": 3.5984742641448975, + "rewards/rejected": -4.460941791534424, + "step": 239 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 8.349623438951571, + "learning_rate": 4.0645269681018434e-07, + "logits/chosen": -0.8875783681869507, + "logits/rejected": -0.7977613210678101, + "logps/chosen": -32.80491638183594, + "logps/rejected": -38.476165771484375, + "loss": 0.1124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5840538740158081, + "rewards/margins": 2.323394298553467, + "rewards/rejected": -2.9074482917785645, + "step": 240 + }, + { + "epoch": 2.8562962962962963, + "grad_norm": 9.486189032436748, + "learning_rate": 4.054363618000057e-07, + "logits/chosen": -0.8179698586463928, + "logits/rejected": -0.8795968294143677, + "logps/chosen": -34.11067199707031, + "logps/rejected": -53.53862762451172, + "loss": 0.1261, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3519166111946106, + "rewards/margins": 4.845457553863525, + "rewards/rejected": -5.19737434387207, + "step": 241 + }, + { + "epoch": 2.868148148148148, + "grad_norm": 10.099080966219972, + "learning_rate": 4.044158216783684e-07, + "logits/chosen": -1.0476202964782715, + "logits/rejected": -1.0129348039627075, + "logps/chosen": -31.891279220581055, + "logps/rejected": -46.14010238647461, + "loss": 0.1114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42091822624206543, + "rewards/margins": 3.888162851333618, + "rewards/rejected": -4.309080600738525, + "step": 242 + }, + { + "epoch": 2.88, + "grad_norm": 10.049719054947364, + "learning_rate": 4.033911040545453e-07, + "logits/chosen": -1.2477302551269531, + "logits/rejected": -0.8980987071990967, + "logps/chosen": -41.76299285888672, + "logps/rejected": -53.11893081665039, + "loss": 0.109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1314184069633484, + "rewards/margins": 4.016972064971924, + "rewards/rejected": -4.148390769958496, + "step": 243 + }, + { + "epoch": 2.891851851851852, + "grad_norm": 9.535753969791891, + "learning_rate": 4.0236223665082605e-07, + "logits/chosen": -1.010503888130188, + "logits/rejected": -0.7471677660942078, + "logps/chosen": -34.18782043457031, + "logps/rejected": -46.442893981933594, + "loss": 0.1214, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2552086412906647, + "rewards/margins": 3.847254753112793, + "rewards/rejected": -4.102463722229004, + "step": 244 + }, + { + "epoch": 2.9037037037037035, + "grad_norm": 10.85723484411409, + "learning_rate": 4.0132924730176653e-07, + "logits/chosen": -1.1365649700164795, + "logits/rejected": -0.9161389470100403, + "logps/chosen": -24.016315460205078, + "logps/rejected": -46.270179748535156, + "loss": 0.1335, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20293915271759033, + "rewards/margins": 4.1694440841674805, + "rewards/rejected": -4.372383117675781, + "step": 245 + }, + { + "epoch": 2.9155555555555557, + "grad_norm": 8.982083345550631, + "learning_rate": 4.0029216395343617e-07, + "logits/chosen": -1.1369932889938354, + "logits/rejected": -1.1437650918960571, + "logps/chosen": -35.65043640136719, + "logps/rejected": -61.12848663330078, + "loss": 0.1136, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9092841744422913, + "rewards/margins": 4.529256343841553, + "rewards/rejected": -5.438540458679199, + "step": 246 + }, + { + "epoch": 2.9274074074074075, + "grad_norm": 8.744966472645434, + "learning_rate": 3.992510146626617e-07, + "logits/chosen": -0.8251878023147583, + "logits/rejected": -0.6449207663536072, + "logps/chosen": -33.99653625488281, + "logps/rejected": -56.43595886230469, + "loss": 0.1229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8442633152008057, + "rewards/margins": 4.416510581970215, + "rewards/rejected": -5.2607741355896, + "step": 247 + }, + { + "epoch": 2.9392592592592592, + "grad_norm": 10.793634421067168, + "learning_rate": 3.982058275962682e-07, + "logits/chosen": -1.014967918395996, + "logits/rejected": -0.7875441312789917, + "logps/chosen": -29.53582000732422, + "logps/rejected": -38.50312423706055, + "loss": 0.1438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7254617214202881, + "rewards/margins": 1.988916039466858, + "rewards/rejected": -2.7143778800964355, + "step": 248 + }, + { + "epoch": 2.951111111111111, + "grad_norm": 8.961988941988425, + "learning_rate": 3.9715663103031706e-07, + "logits/chosen": -0.7619471549987793, + "logits/rejected": -0.7742725610733032, + "logps/chosen": -34.99053192138672, + "logps/rejected": -46.08906173706055, + "loss": 0.1042, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7473491430282593, + "rewards/margins": 3.3373470306396484, + "rewards/rejected": -4.084695816040039, + "step": 249 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 8.867625542462614, + "learning_rate": 3.9610345334934094e-07, + "logits/chosen": -1.1330126523971558, + "logits/rejected": -1.1697674989700317, + "logps/chosen": -23.25260353088379, + "logps/rejected": -43.87166976928711, + "loss": 0.1069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29624325037002563, + "rewards/margins": 3.2834043502807617, + "rewards/rejected": -3.5796477794647217, + "step": 250 + }, + { + "epoch": 2.974814814814815, + "grad_norm": 9.248026100787925, + "learning_rate": 3.950463230455761e-07, + "logits/chosen": -1.2669241428375244, + "logits/rejected": -1.0569002628326416, + "logps/chosen": -32.21596145629883, + "logps/rejected": -60.97792053222656, + "loss": 0.1148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45450618863105774, + "rewards/margins": 6.268526077270508, + "rewards/rejected": -6.723032474517822, + "step": 251 + }, + { + "epoch": 2.986666666666667, + "grad_norm": 9.82476144546843, + "learning_rate": 3.939852687181915e-07, + "logits/chosen": -0.6608531475067139, + "logits/rejected": -0.8034135699272156, + "logps/chosen": -24.462186813354492, + "logps/rejected": -50.0887451171875, + "loss": 0.1354, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4931001365184784, + "rewards/margins": 4.019628047943115, + "rewards/rejected": -4.512728691101074, + "step": 252 + }, + { + "epoch": 2.9985185185185186, + "grad_norm": 9.046237713104512, + "learning_rate": 3.9292031907251464e-07, + "logits/chosen": -1.1486996412277222, + "logits/rejected": -1.2216451168060303, + "logps/chosen": -30.053659439086914, + "logps/rejected": -57.20942306518555, + "loss": 0.1104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7489092350006104, + "rewards/margins": 3.8848938941955566, + "rewards/rejected": -4.633802890777588, + "step": 253 + }, + { + "epoch": 3.0103703703703704, + "grad_norm": 6.092995378035237, + "learning_rate": 3.9185150291925585e-07, + "logits/chosen": -1.1667040586471558, + "logits/rejected": -1.2177824974060059, + "logps/chosen": -21.940555572509766, + "logps/rejected": -45.18644332885742, + "loss": 0.0754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5815523862838745, + "rewards/margins": 3.84624981880188, + "rewards/rejected": -4.427802085876465, + "step": 254 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 5.889868303476228, + "learning_rate": 3.9077884917372806e-07, + "logits/chosen": -1.2974051237106323, + "logits/rejected": -1.1213111877441406, + "logps/chosen": -24.889671325683594, + "logps/rejected": -53.81795883178711, + "loss": 0.0658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10434576869010925, + "rewards/margins": 5.804339408874512, + "rewards/rejected": -5.908684730529785, + "step": 255 + }, + { + "epoch": 3.034074074074074, + "grad_norm": 5.6665752841935895, + "learning_rate": 3.8970238685506486e-07, + "logits/chosen": -0.9169086813926697, + "logits/rejected": -0.8243510127067566, + "logps/chosen": -25.639080047607422, + "logps/rejected": -44.80521011352539, + "loss": 0.0712, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4333856403827667, + "rewards/margins": 4.004616737365723, + "rewards/rejected": -4.438002586364746, + "step": 256 + }, + { + "epoch": 3.0459259259259257, + "grad_norm": 6.195071442100622, + "learning_rate": 3.8862214508543544e-07, + "logits/chosen": -0.7453622817993164, + "logits/rejected": -0.8452744483947754, + "logps/chosen": -27.675142288208008, + "logps/rejected": -54.29957580566406, + "loss": 0.0826, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2128800004720688, + "rewards/margins": 4.886155605316162, + "rewards/rejected": -5.099035263061523, + "step": 257 + }, + { + "epoch": 3.057777777777778, + "grad_norm": 4.577785723486363, + "learning_rate": 3.8753815308925685e-07, + "logits/chosen": -0.7777894735336304, + "logits/rejected": -0.9331192970275879, + "logps/chosen": -31.157546997070312, + "logps/rejected": -64.40248107910156, + "loss": 0.0512, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4510951638221741, + "rewards/margins": 4.6957244873046875, + "rewards/rejected": -5.146819591522217, + "step": 258 + }, + { + "epoch": 3.0696296296296297, + "grad_norm": 6.4011580737866005, + "learning_rate": 3.864504401924031e-07, + "logits/chosen": -0.830951452255249, + "logits/rejected": -0.8679866790771484, + "logps/chosen": -27.256206512451172, + "logps/rejected": -55.158851623535156, + "loss": 0.0732, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7141133546829224, + "rewards/margins": 4.929049968719482, + "rewards/rejected": -5.643163204193115, + "step": 259 + }, + { + "epoch": 3.0814814814814815, + "grad_norm": 5.755747603737944, + "learning_rate": 3.8535903582141184e-07, + "logits/chosen": -1.0873059034347534, + "logits/rejected": -0.797295868396759, + "logps/chosen": -40.58049011230469, + "logps/rejected": -63.331024169921875, + "loss": 0.0671, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8855617046356201, + "rewards/margins": 5.494954586029053, + "rewards/rejected": -6.380516052246094, + "step": 260 + }, + { + "epoch": 3.0933333333333333, + "grad_norm": 5.377551785753514, + "learning_rate": 3.8426396950268846e-07, + "logits/chosen": -1.0642486810684204, + "logits/rejected": -1.0869853496551514, + "logps/chosen": -25.782983779907227, + "logps/rejected": -47.94844436645508, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5024091005325317, + "rewards/margins": 4.591217041015625, + "rewards/rejected": -5.093626022338867, + "step": 261 + }, + { + "epoch": 3.105185185185185, + "grad_norm": 5.805844710790357, + "learning_rate": 3.8316527086170727e-07, + "logits/chosen": -1.4574161767959595, + "logits/rejected": -1.214327335357666, + "logps/chosen": -37.066261291503906, + "logps/rejected": -46.37734603881836, + "loss": 0.075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3814047574996948, + "rewards/margins": 4.010954856872559, + "rewards/rejected": -4.392359733581543, + "step": 262 + }, + { + "epoch": 3.117037037037037, + "grad_norm": 5.631859361425549, + "learning_rate": 3.820629696222096e-07, + "logits/chosen": -1.037698745727539, + "logits/rejected": -1.2875399589538574, + "logps/chosen": -30.273286819458008, + "logps/rejected": -63.62456130981445, + "loss": 0.0589, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.311270534992218, + "rewards/margins": 4.6157755851745605, + "rewards/rejected": -4.927045822143555, + "step": 263 + }, + { + "epoch": 3.128888888888889, + "grad_norm": 6.4282160152251775, + "learning_rate": 3.809570956054003e-07, + "logits/chosen": -0.8523592352867126, + "logits/rejected": -1.133123517036438, + "logps/chosen": -25.40625762939453, + "logps/rejected": -51.35416030883789, + "loss": 0.0725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44330236315727234, + "rewards/margins": 4.313521862030029, + "rewards/rejected": -4.756824016571045, + "step": 264 + }, + { + "epoch": 3.140740740740741, + "grad_norm": 4.79153948993895, + "learning_rate": 3.798476787291407e-07, + "logits/chosen": -0.9867359399795532, + "logits/rejected": -0.8984818458557129, + "logps/chosen": -34.52312088012695, + "logps/rejected": -60.01667785644531, + "loss": 0.0576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7610639929771423, + "rewards/margins": 5.0160813331604, + "rewards/rejected": -5.777144908905029, + "step": 265 + }, + { + "epoch": 3.1525925925925926, + "grad_norm": 5.75718528813517, + "learning_rate": 3.787347490071389e-07, + "logits/chosen": -1.120936393737793, + "logits/rejected": -1.2262120246887207, + "logps/chosen": -23.01953125, + "logps/rejected": -61.13671112060547, + "loss": 0.0727, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14551204442977905, + "rewards/margins": 5.611021041870117, + "rewards/rejected": -5.756533622741699, + "step": 266 + }, + { + "epoch": 3.1644444444444444, + "grad_norm": 7.215037149667764, + "learning_rate": 3.776183365481385e-07, + "logits/chosen": -0.8273714780807495, + "logits/rejected": -0.6773754358291626, + "logps/chosen": -22.21053123474121, + "logps/rejected": -43.30362319946289, + "loss": 0.0778, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09048902243375778, + "rewards/margins": 4.530794620513916, + "rewards/rejected": -4.621283531188965, + "step": 267 + }, + { + "epoch": 3.176296296296296, + "grad_norm": 5.823890371998723, + "learning_rate": 3.764984715551031e-07, + "logits/chosen": -0.9349130392074585, + "logits/rejected": -0.9820988178253174, + "logps/chosen": -29.579639434814453, + "logps/rejected": -57.95941925048828, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10734206438064575, + "rewards/margins": 4.939681053161621, + "rewards/rejected": -5.047023296356201, + "step": 268 + }, + { + "epoch": 3.188148148148148, + "grad_norm": 5.996717704776933, + "learning_rate": 3.753751843244003e-07, + "logits/chosen": -0.9044997692108154, + "logits/rejected": -1.1144652366638184, + "logps/chosen": -29.467838287353516, + "logps/rejected": -60.72572326660156, + "loss": 0.0656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6497594714164734, + "rewards/margins": 5.051385402679443, + "rewards/rejected": -5.701144695281982, + "step": 269 + }, + { + "epoch": 3.2, + "grad_norm": 5.541764509927049, + "learning_rate": 3.7424850524498113e-07, + "logits/chosen": -1.0047956705093384, + "logits/rejected": -0.9436285495758057, + "logps/chosen": -45.23377227783203, + "logps/rejected": -59.85185241699219, + "loss": 0.075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6731371879577637, + "rewards/margins": 4.904725074768066, + "rewards/rejected": -5.577862739562988, + "step": 270 + }, + { + "epoch": 3.211851851851852, + "grad_norm": 5.924296644494306, + "learning_rate": 3.731184647975584e-07, + "logits/chosen": -1.0159187316894531, + "logits/rejected": -0.9188311100006104, + "logps/chosen": -21.674226760864258, + "logps/rejected": -37.375823974609375, + "loss": 0.0726, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21266776323318481, + "rewards/margins": 3.735647678375244, + "rewards/rejected": -3.522979497909546, + "step": 271 + }, + { + "epoch": 3.2237037037037037, + "grad_norm": 5.386999484278703, + "learning_rate": 3.7198509355378207e-07, + "logits/chosen": -1.1651026010513306, + "logits/rejected": -1.1258482933044434, + "logps/chosen": -26.176000595092773, + "logps/rejected": -46.100807189941406, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20522062480449677, + "rewards/margins": 4.301911354064941, + "rewards/rejected": -4.507132053375244, + "step": 272 + }, + { + "epoch": 3.2355555555555555, + "grad_norm": 5.601624961329212, + "learning_rate": 3.7084842217541196e-07, + "logits/chosen": -0.8905525803565979, + "logits/rejected": -1.1875985860824585, + "logps/chosen": -24.38876724243164, + "logps/rejected": -56.33481979370117, + "loss": 0.067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22321292757987976, + "rewards/margins": 5.481816291809082, + "rewards/rejected": -5.705029010772705, + "step": 273 + }, + { + "epoch": 3.2474074074074073, + "grad_norm": 5.796108078800707, + "learning_rate": 3.6970848141348855e-07, + "logits/chosen": -1.0142302513122559, + "logits/rejected": -1.140476107597351, + "logps/chosen": -20.314678192138672, + "logps/rejected": -47.31608963012695, + "loss": 0.0709, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08062909543514252, + "rewards/margins": 4.616469860076904, + "rewards/rejected": -4.697099685668945, + "step": 274 + }, + { + "epoch": 3.259259259259259, + "grad_norm": 5.702096262342128, + "learning_rate": 3.685653021075006e-07, + "logits/chosen": -0.8202773332595825, + "logits/rejected": -0.8724027276039124, + "logps/chosen": -33.18456268310547, + "logps/rejected": -56.54239273071289, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5080665349960327, + "rewards/margins": 4.730257987976074, + "rewards/rejected": -5.238324165344238, + "step": 275 + }, + { + "epoch": 3.2711111111111113, + "grad_norm": 5.539345867188406, + "learning_rate": 3.6741891518455146e-07, + "logits/chosen": -1.1294889450073242, + "logits/rejected": -1.100435495376587, + "logps/chosen": -41.69868087768555, + "logps/rejected": -62.35628890991211, + "loss": 0.0695, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2733755111694336, + "rewards/margins": 4.73194694519043, + "rewards/rejected": -6.005322456359863, + "step": 276 + }, + { + "epoch": 3.282962962962963, + "grad_norm": 4.99860717998791, + "learning_rate": 3.6626935165852183e-07, + "logits/chosen": -1.112581491470337, + "logits/rejected": -1.177794098854065, + "logps/chosen": -33.80670928955078, + "logps/rejected": -71.07241821289062, + "loss": 0.062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6145895719528198, + "rewards/margins": 6.081070423126221, + "rewards/rejected": -6.69566011428833, + "step": 277 + }, + { + "epoch": 3.294814814814815, + "grad_norm": 5.307021962386122, + "learning_rate": 3.6511664262923094e-07, + "logits/chosen": -1.0228497982025146, + "logits/rejected": -0.9828412532806396, + "logps/chosen": -23.93638038635254, + "logps/rejected": -44.23779296875, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12847840785980225, + "rewards/margins": 3.686278820037842, + "rewards/rejected": -3.557800054550171, + "step": 278 + }, + { + "epoch": 3.3066666666666666, + "grad_norm": 6.056323436180124, + "learning_rate": 3.639608192815951e-07, + "logits/chosen": -1.0668245553970337, + "logits/rejected": -0.879786491394043, + "logps/chosen": -31.651464462280273, + "logps/rejected": -36.51784133911133, + "loss": 0.0717, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16986359655857086, + "rewards/margins": 3.040926218032837, + "rewards/rejected": -3.210789680480957, + "step": 279 + }, + { + "epoch": 3.3185185185185184, + "grad_norm": 6.040832198855085, + "learning_rate": 3.6280191288478435e-07, + "logits/chosen": -1.143092155456543, + "logits/rejected": -0.9826564192771912, + "logps/chosen": -36.029632568359375, + "logps/rejected": -51.19088363647461, + "loss": 0.0685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4821978509426117, + "rewards/margins": 4.130788803100586, + "rewards/rejected": -4.612987041473389, + "step": 280 + }, + { + "epoch": 3.33037037037037, + "grad_norm": 4.03276871934942, + "learning_rate": 3.61639954791376e-07, + "logits/chosen": -1.117720127105713, + "logits/rejected": -1.1670171022415161, + "logps/chosen": -30.51466178894043, + "logps/rejected": -56.65740966796875, + "loss": 0.0456, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6547921895980835, + "rewards/margins": 5.172966957092285, + "rewards/rejected": -5.827759265899658, + "step": 281 + }, + { + "epoch": 3.3422222222222224, + "grad_norm": 5.926942922995324, + "learning_rate": 3.604749764365069e-07, + "logits/chosen": -1.1983726024627686, + "logits/rejected": -1.0965083837509155, + "logps/chosen": -31.945249557495117, + "logps/rejected": -42.11628341674805, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15036548674106598, + "rewards/margins": 3.8085999488830566, + "rewards/rejected": -3.958966016769409, + "step": 282 + }, + { + "epoch": 3.354074074074074, + "grad_norm": 5.746588189734936, + "learning_rate": 3.593070093370226e-07, + "logits/chosen": -1.2473244667053223, + "logits/rejected": -1.1790859699249268, + "logps/chosen": -27.693424224853516, + "logps/rejected": -55.838382720947266, + "loss": 0.0683, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5475992560386658, + "rewards/margins": 5.170629501342773, + "rewards/rejected": -5.718228816986084, + "step": 283 + }, + { + "epoch": 3.365925925925926, + "grad_norm": 4.050731197423161, + "learning_rate": 3.5813608509062526e-07, + "logits/chosen": -0.8557882308959961, + "logits/rejected": -0.9357935190200806, + "logps/chosen": -23.405031204223633, + "logps/rejected": -61.17503356933594, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4729683995246887, + "rewards/margins": 5.8998703956604, + "rewards/rejected": -6.372838020324707, + "step": 284 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 6.330628466200805, + "learning_rate": 3.569622353750181e-07, + "logits/chosen": -1.3175606727600098, + "logits/rejected": -1.1783069372177124, + "logps/chosen": -26.113262176513672, + "logps/rejected": -49.43036651611328, + "loss": 0.0793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3910313546657562, + "rewards/margins": 4.3150248527526855, + "rewards/rejected": -4.706056118011475, + "step": 285 + }, + { + "epoch": 3.3896296296296295, + "grad_norm": 4.891367910710421, + "learning_rate": 3.557854919470491e-07, + "logits/chosen": -1.0190260410308838, + "logits/rejected": -0.8906891345977783, + "logps/chosen": -30.935029983520508, + "logps/rejected": -54.49614715576172, + "loss": 0.0553, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5170705318450928, + "rewards/margins": 4.817275524139404, + "rewards/rejected": -5.334345817565918, + "step": 286 + }, + { + "epoch": 3.4014814814814813, + "grad_norm": 4.73121969828735, + "learning_rate": 3.546058866418513e-07, + "logits/chosen": -1.2499243021011353, + "logits/rejected": -1.1855663061141968, + "logps/chosen": -26.786319732666016, + "logps/rejected": -55.46076965332031, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5292675495147705, + "rewards/margins": 5.712489128112793, + "rewards/rejected": -6.241756916046143, + "step": 287 + }, + { + "epoch": 3.413333333333333, + "grad_norm": 5.279313917975766, + "learning_rate": 3.5342345137198206e-07, + "logits/chosen": -1.0446308851242065, + "logits/rejected": -0.8056778907775879, + "logps/chosen": -29.395492553710938, + "logps/rejected": -44.214805603027344, + "loss": 0.061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3937348425388336, + "rewards/margins": 4.55248498916626, + "rewards/rejected": -4.946219444274902, + "step": 288 + }, + { + "epoch": 3.4251851851851853, + "grad_norm": 5.3706152394158435, + "learning_rate": 3.5223821812655903e-07, + "logits/chosen": -1.045047640800476, + "logits/rejected": -0.9570527076721191, + "logps/chosen": -35.075313568115234, + "logps/rejected": -41.273075103759766, + "loss": 0.0693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29294437170028687, + "rewards/margins": 3.7492318153381348, + "rewards/rejected": -4.042176246643066, + "step": 289 + }, + { + "epoch": 3.437037037037037, + "grad_norm": 6.958208575648387, + "learning_rate": 3.510502189703954e-07, + "logits/chosen": -1.0246403217315674, + "logits/rejected": -1.0500332117080688, + "logps/chosen": -30.618261337280273, + "logps/rejected": -51.143402099609375, + "loss": 0.0839, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1609795093536377, + "rewards/margins": 5.519993782043457, + "rewards/rejected": -5.680973529815674, + "step": 290 + }, + { + "epoch": 3.448888888888889, + "grad_norm": 5.661919128416755, + "learning_rate": 3.4985948604313237e-07, + "logits/chosen": -0.978698194026947, + "logits/rejected": -1.042383074760437, + "logps/chosen": -34.199241638183594, + "logps/rejected": -55.27683639526367, + "loss": 0.0662, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0662914514541626, + "rewards/margins": 4.541738510131836, + "rewards/rejected": -5.608030319213867, + "step": 291 + }, + { + "epoch": 3.4607407407407407, + "grad_norm": 5.985004769770914, + "learning_rate": 3.486660515583691e-07, + "logits/chosen": -1.0405309200286865, + "logits/rejected": -1.1097911596298218, + "logps/chosen": -27.69963836669922, + "logps/rejected": -53.2136344909668, + "loss": 0.0685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.88338702917099, + "rewards/margins": 4.365427017211914, + "rewards/rejected": -5.248814105987549, + "step": 292 + }, + { + "epoch": 3.4725925925925925, + "grad_norm": 5.177960355626909, + "learning_rate": 3.474699478027918e-07, + "logits/chosen": -1.2111238241195679, + "logits/rejected": -0.9757024049758911, + "logps/chosen": -31.71354103088379, + "logps/rejected": -41.57647705078125, + "loss": 0.0559, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6544926166534424, + "rewards/margins": 3.8716635704040527, + "rewards/rejected": -4.526156425476074, + "step": 293 + }, + { + "epoch": 3.4844444444444447, + "grad_norm": 4.613155996179155, + "learning_rate": 3.4627120713529983e-07, + "logits/chosen": -0.9755435585975647, + "logits/rejected": -1.1237881183624268, + "logps/chosen": -21.237268447875977, + "logps/rejected": -54.971214294433594, + "loss": 0.0467, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2568213641643524, + "rewards/margins": 5.524716854095459, + "rewards/rejected": -5.781538486480713, + "step": 294 + }, + { + "epoch": 3.4962962962962965, + "grad_norm": 5.128029383000987, + "learning_rate": 3.4506986198613077e-07, + "logits/chosen": -0.9919889569282532, + "logits/rejected": -0.8300825357437134, + "logps/chosen": -29.0252742767334, + "logps/rejected": -47.359825134277344, + "loss": 0.0524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028458386659622192, + "rewards/margins": 5.328551292419434, + "rewards/rejected": -5.357009410858154, + "step": 295 + }, + { + "epoch": 3.5081481481481482, + "grad_norm": 4.006414442650582, + "learning_rate": 3.438659448559825e-07, + "logits/chosen": -1.0892466306686401, + "logits/rejected": -0.9891116619110107, + "logps/chosen": -27.066917419433594, + "logps/rejected": -43.91146469116211, + "loss": 0.0407, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07186317443847656, + "rewards/margins": 4.227798938751221, + "rewards/rejected": -4.155935764312744, + "step": 296 + }, + { + "epoch": 3.52, + "grad_norm": 6.127057074618686, + "learning_rate": 3.4265948831513434e-07, + "logits/chosen": -1.1602692604064941, + "logits/rejected": -0.9549179077148438, + "logps/chosen": -38.73994064331055, + "logps/rejected": -40.83653259277344, + "loss": 0.0649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6782684922218323, + "rewards/margins": 3.3154821395874023, + "rewards/rejected": -3.99375057220459, + "step": 297 + }, + { + "epoch": 3.531851851851852, + "grad_norm": 5.969524211003228, + "learning_rate": 3.414505250025659e-07, + "logits/chosen": -1.2992161512374878, + "logits/rejected": -1.39997136592865, + "logps/chosen": -25.659305572509766, + "logps/rejected": -57.98012924194336, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21638883650302887, + "rewards/margins": 4.876415252685547, + "rewards/rejected": -5.092803955078125, + "step": 298 + }, + { + "epoch": 3.5437037037037036, + "grad_norm": 5.142417577054952, + "learning_rate": 3.402390876250737e-07, + "logits/chosen": -0.9474883675575256, + "logits/rejected": -0.9179717302322388, + "logps/chosen": -22.229196548461914, + "logps/rejected": -46.438045501708984, + "loss": 0.0535, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5016873478889465, + "rewards/margins": 5.030646324157715, + "rewards/rejected": -5.5323333740234375, + "step": 299 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 4.2230056460228145, + "learning_rate": 3.390252089563867e-07, + "logits/chosen": -0.7143265008926392, + "logits/rejected": -0.7287828922271729, + "logps/chosen": -36.311458587646484, + "logps/rejected": -57.43645095825195, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6835311651229858, + "rewards/margins": 4.852078914642334, + "rewards/rejected": -5.535609722137451, + "step": 300 + }, + { + "epoch": 3.5674074074074076, + "grad_norm": 4.613790070173845, + "learning_rate": 3.3780892183627974e-07, + "logits/chosen": -0.9158973693847656, + "logits/rejected": -0.9328266978263855, + "logps/chosen": -26.87763214111328, + "logps/rejected": -55.432861328125, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9097545146942139, + "rewards/margins": 5.234354019165039, + "rewards/rejected": -6.144108295440674, + "step": 301 + }, + { + "epoch": 3.5792592592592594, + "grad_norm": 5.438641971874985, + "learning_rate": 3.3659025916968475e-07, + "logits/chosen": -0.8110693693161011, + "logits/rejected": -0.9373239278793335, + "logps/chosen": -33.212890625, + "logps/rejected": -57.53894805908203, + "loss": 0.0655, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0058021545410156, + "rewards/margins": 4.593456268310547, + "rewards/rejected": -5.599257946014404, + "step": 302 + }, + { + "epoch": 3.591111111111111, + "grad_norm": 5.85934801222626, + "learning_rate": 3.353692539258006e-07, + "logits/chosen": -1.0616428852081299, + "logits/rejected": -1.0635976791381836, + "logps/chosen": -42.99614334106445, + "logps/rejected": -59.80973815917969, + "loss": 0.0729, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5455619096755981, + "rewards/margins": 4.6160502433776855, + "rewards/rejected": -5.161612033843994, + "step": 303 + }, + { + "epoch": 3.602962962962963, + "grad_norm": 4.758016084753321, + "learning_rate": 3.3414593913720155e-07, + "logits/chosen": -1.0797550678253174, + "logits/rejected": -1.1135615110397339, + "logps/chosen": -25.05910301208496, + "logps/rejected": -48.85538864135742, + "loss": 0.0502, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7004413604736328, + "rewards/margins": 4.928831100463867, + "rewards/rejected": -5.6292724609375, + "step": 304 + }, + { + "epoch": 3.6148148148148147, + "grad_norm": 5.071406885912709, + "learning_rate": 3.329203478989431e-07, + "logits/chosen": -0.8254508972167969, + "logits/rejected": -0.8509577512741089, + "logps/chosen": -31.357934951782227, + "logps/rejected": -56.72637176513672, + "loss": 0.0517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3466971218585968, + "rewards/margins": 6.271115303039551, + "rewards/rejected": -6.617812156677246, + "step": 305 + }, + { + "epoch": 3.626666666666667, + "grad_norm": 5.745951047820061, + "learning_rate": 3.3169251336766697e-07, + "logits/chosen": -1.1452853679656982, + "logits/rejected": -0.9981801509857178, + "logps/chosen": -27.595762252807617, + "logps/rejected": -57.56399917602539, + "loss": 0.0673, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9206715822219849, + "rewards/margins": 5.363818168640137, + "rewards/rejected": -6.28449010848999, + "step": 306 + }, + { + "epoch": 3.6385185185185183, + "grad_norm": 4.425440687205678, + "learning_rate": 3.3046246876070405e-07, + "logits/chosen": -1.078284502029419, + "logits/rejected": -0.9102885127067566, + "logps/chosen": -35.90202331542969, + "logps/rejected": -54.64530944824219, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6325173377990723, + "rewards/margins": 4.867710590362549, + "rewards/rejected": -5.500227928161621, + "step": 307 + }, + { + "epoch": 3.6503703703703705, + "grad_norm": 6.4970787913514965, + "learning_rate": 3.2923024735517567e-07, + "logits/chosen": -1.1114505529403687, + "logits/rejected": -1.2475202083587646, + "logps/chosen": -31.215063095092773, + "logps/rejected": -52.50607681274414, + "loss": 0.0792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8711620569229126, + "rewards/margins": 4.3640456199646, + "rewards/rejected": -5.235208034515381, + "step": 308 + }, + { + "epoch": 3.6622222222222223, + "grad_norm": 4.671255590927646, + "learning_rate": 3.279958824870934e-07, + "logits/chosen": -1.0347408056259155, + "logits/rejected": -1.1540158987045288, + "logps/chosen": -22.052547454833984, + "logps/rejected": -37.25065994262695, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3115699291229248, + "rewards/margins": 3.4501681327819824, + "rewards/rejected": -3.7617380619049072, + "step": 309 + }, + { + "epoch": 3.674074074074074, + "grad_norm": 5.203696540638213, + "learning_rate": 3.2675940755045713e-07, + "logits/chosen": -0.8645880818367004, + "logits/rejected": -0.7690497636795044, + "logps/chosen": -27.421972274780273, + "logps/rejected": -46.4594612121582, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.294041246175766, + "rewards/margins": 5.109452724456787, + "rewards/rejected": -5.403493881225586, + "step": 310 + }, + { + "epoch": 3.685925925925926, + "grad_norm": 6.0369789648764955, + "learning_rate": 3.2552085599635167e-07, + "logits/chosen": -1.26742684841156, + "logits/rejected": -1.0950877666473389, + "logps/chosen": -29.363801956176758, + "logps/rejected": -51.036216735839844, + "loss": 0.0565, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4705730378627777, + "rewards/margins": 5.166300296783447, + "rewards/rejected": -5.636873722076416, + "step": 311 + }, + { + "epoch": 3.6977777777777776, + "grad_norm": 5.411702389421841, + "learning_rate": 3.242802613320418e-07, + "logits/chosen": -1.2446568012237549, + "logits/rejected": -1.2488821744918823, + "logps/chosen": -34.51674270629883, + "logps/rejected": -49.218116760253906, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41771867871284485, + "rewards/margins": 4.54225492477417, + "rewards/rejected": -4.9599738121032715, + "step": 312 + }, + { + "epoch": 3.70962962962963, + "grad_norm": 6.254274807160154, + "learning_rate": 3.2303765712006585e-07, + "logits/chosen": -0.8596463203430176, + "logits/rejected": -1.100625991821289, + "logps/chosen": -39.566654205322266, + "logps/rejected": -69.77198791503906, + "loss": 0.0653, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.135459542274475, + "rewards/margins": 5.502881050109863, + "rewards/rejected": -6.638340473175049, + "step": 313 + }, + { + "epoch": 3.7214814814814816, + "grad_norm": 5.503511678320754, + "learning_rate": 3.217930769773275e-07, + "logits/chosen": -0.8999834060668945, + "logits/rejected": -0.8143523931503296, + "logps/chosen": -40.45894241333008, + "logps/rejected": -51.68131637573242, + "loss": 0.0566, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.05463707447052, + "rewards/margins": 3.0763254165649414, + "rewards/rejected": -4.130962371826172, + "step": 314 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 4.936108454949577, + "learning_rate": 3.2054655457418647e-07, + "logits/chosen": -0.8420968055725098, + "logits/rejected": -1.0272884368896484, + "logps/chosen": -23.133481979370117, + "logps/rejected": -41.11859130859375, + "loss": 0.0578, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44954371452331543, + "rewards/margins": 3.8556888103485107, + "rewards/rejected": -4.305232524871826, + "step": 315 + }, + { + "epoch": 3.745185185185185, + "grad_norm": 5.389922613533766, + "learning_rate": 3.1929812363354764e-07, + "logits/chosen": -1.0261280536651611, + "logits/rejected": -0.7948772311210632, + "logps/chosen": -31.51889991760254, + "logps/rejected": -42.249786376953125, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4212304651737213, + "rewards/margins": 4.244043827056885, + "rewards/rejected": -4.665274620056152, + "step": 316 + }, + { + "epoch": 3.757037037037037, + "grad_norm": 4.6295381857080296, + "learning_rate": 3.1804781792994867e-07, + "logits/chosen": -0.8965979218482971, + "logits/rejected": -0.8617987632751465, + "logps/chosen": -42.41337585449219, + "logps/rejected": -72.1649169921875, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9737859964370728, + "rewards/margins": 6.099238395690918, + "rewards/rejected": -7.073023796081543, + "step": 317 + }, + { + "epoch": 3.7688888888888887, + "grad_norm": 5.5642008946868655, + "learning_rate": 3.167956712886463e-07, + "logits/chosen": -0.6077237129211426, + "logits/rejected": -0.7936528325080872, + "logps/chosen": -33.86177444458008, + "logps/rejected": -64.47748565673828, + "loss": 0.0663, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5910844206809998, + "rewards/margins": 5.1871795654296875, + "rewards/rejected": -5.778264045715332, + "step": 318 + }, + { + "epoch": 3.7807407407407405, + "grad_norm": 6.861996538597475, + "learning_rate": 3.155417175847011e-07, + "logits/chosen": -1.2012813091278076, + "logits/rejected": -1.1468443870544434, + "logps/chosen": -31.32352638244629, + "logps/rejected": -51.89019775390625, + "loss": 0.0784, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.28224515914917, + "rewards/margins": 4.7101054191589355, + "rewards/rejected": -5.9923505783081055, + "step": 319 + }, + { + "epoch": 3.7925925925925927, + "grad_norm": 5.30619862961812, + "learning_rate": 3.142859907420615e-07, + "logits/chosen": -1.0433729887008667, + "logits/rejected": -1.0667370557785034, + "logps/chosen": -39.072513580322266, + "logps/rejected": -58.01454544067383, + "loss": 0.0504, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.079565405845642, + "rewards/margins": 3.7631685733795166, + "rewards/rejected": -4.842733860015869, + "step": 320 + }, + { + "epoch": 3.8044444444444445, + "grad_norm": 5.765561512142667, + "learning_rate": 3.1302852473264537e-07, + "logits/chosen": -1.352389931678772, + "logits/rejected": -1.1685843467712402, + "logps/chosen": -25.97576141357422, + "logps/rejected": -41.47727584838867, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7056049108505249, + "rewards/margins": 3.5812952518463135, + "rewards/rejected": -4.286900043487549, + "step": 321 + }, + { + "epoch": 3.8162962962962963, + "grad_norm": 5.083438759014095, + "learning_rate": 3.117693535754213e-07, + "logits/chosen": -1.1177499294281006, + "logits/rejected": -0.9615808725357056, + "logps/chosen": -30.529483795166016, + "logps/rejected": -65.1328125, + "loss": 0.0545, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3329801559448242, + "rewards/margins": 6.611982345581055, + "rewards/rejected": -7.944962501525879, + "step": 322 + }, + { + "epoch": 3.828148148148148, + "grad_norm": 5.339542761344782, + "learning_rate": 3.105085113354885e-07, + "logits/chosen": -1.219346523284912, + "logits/rejected": -1.0785218477249146, + "logps/chosen": -24.757862091064453, + "logps/rejected": -44.62859344482422, + "loss": 0.0665, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.687053918838501, + "rewards/margins": 5.1285810470581055, + "rewards/rejected": -5.8156352043151855, + "step": 323 + }, + { + "epoch": 3.84, + "grad_norm": 5.388037597932235, + "learning_rate": 3.092460321231547e-07, + "logits/chosen": -0.8497295379638672, + "logits/rejected": -0.6827176809310913, + "logps/chosen": -28.461023330688477, + "logps/rejected": -40.831329345703125, + "loss": 0.0563, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3341638445854187, + "rewards/margins": 4.207818984985352, + "rewards/rejected": -4.541983127593994, + "step": 324 + }, + { + "epoch": 3.851851851851852, + "grad_norm": 6.777852590076427, + "learning_rate": 3.079819500930138e-07, + "logits/chosen": -1.1702187061309814, + "logits/rejected": -0.8639770746231079, + "logps/chosen": -33.30535888671875, + "logps/rejected": -45.3155632019043, + "loss": 0.068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7845231294631958, + "rewards/margins": 4.671426296234131, + "rewards/rejected": -5.455949783325195, + "step": 325 + }, + { + "epoch": 3.863703703703704, + "grad_norm": 4.96340034468975, + "learning_rate": 3.0671629944302164e-07, + "logits/chosen": -1.1068304777145386, + "logits/rejected": -0.8126644492149353, + "logps/chosen": -24.921886444091797, + "logps/rejected": -54.05364990234375, + "loss": 0.0548, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4166131913661957, + "rewards/margins": 6.512470722198486, + "rewards/rejected": -6.929083824157715, + "step": 326 + }, + { + "epoch": 3.8755555555555556, + "grad_norm": 5.138730208654112, + "learning_rate": 3.054491144135707e-07, + "logits/chosen": -1.1236650943756104, + "logits/rejected": -0.91242516040802, + "logps/chosen": -28.354129791259766, + "logps/rejected": -43.563880920410156, + "loss": 0.0477, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3069639801979065, + "rewards/margins": 3.786219596862793, + "rewards/rejected": -4.093183517456055, + "step": 327 + }, + { + "epoch": 3.8874074074074074, + "grad_norm": 6.099421147689264, + "learning_rate": 3.0418042928656415e-07, + "logits/chosen": -0.7698428630828857, + "logits/rejected": -0.8110767602920532, + "logps/chosen": -38.043785095214844, + "logps/rejected": -64.61091613769531, + "loss": 0.067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9435144662857056, + "rewards/margins": 5.062083721160889, + "rewards/rejected": -6.005598068237305, + "step": 328 + }, + { + "epoch": 3.899259259259259, + "grad_norm": 6.253989880304663, + "learning_rate": 3.029102783844879e-07, + "logits/chosen": -1.29204261302948, + "logits/rejected": -1.051203966140747, + "logps/chosen": -29.049633026123047, + "logps/rejected": -43.23980712890625, + "loss": 0.0655, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0605460405349731, + "rewards/margins": 4.699088096618652, + "rewards/rejected": -5.759634494781494, + "step": 329 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 6.957873933273759, + "learning_rate": 3.016386960694827e-07, + "logits/chosen": -1.1009610891342163, + "logits/rejected": -0.9418026208877563, + "logps/chosen": -27.438570022583008, + "logps/rejected": -54.82490921020508, + "loss": 0.0727, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6385165452957153, + "rewards/margins": 5.449717998504639, + "rewards/rejected": -6.088234901428223, + "step": 330 + }, + { + "epoch": 3.9229629629629628, + "grad_norm": 5.279686775444548, + "learning_rate": 3.003657167424139e-07, + "logits/chosen": -1.222791314125061, + "logits/rejected": -1.2143419981002808, + "logps/chosen": -28.141613006591797, + "logps/rejected": -51.94461441040039, + "loss": 0.0599, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.66292804479599, + "rewards/margins": 4.610944747924805, + "rewards/rejected": -5.2738728523254395, + "step": 331 + }, + { + "epoch": 3.934814814814815, + "grad_norm": 4.870201008956739, + "learning_rate": 2.990913748419411e-07, + "logits/chosen": -0.8936185836791992, + "logits/rejected": -0.5663695335388184, + "logps/chosen": -38.59070587158203, + "logps/rejected": -60.144142150878906, + "loss": 0.0503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8343983292579651, + "rewards/margins": 6.496341228485107, + "rewards/rejected": -7.330739974975586, + "step": 332 + }, + { + "epoch": 3.9466666666666668, + "grad_norm": 4.2701436405891835, + "learning_rate": 2.978157048435863e-07, + "logits/chosen": -1.0724661350250244, + "logits/rejected": -1.1288424730300903, + "logps/chosen": -35.56442642211914, + "logps/rejected": -64.08760070800781, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9496961236000061, + "rewards/margins": 5.328984260559082, + "rewards/rejected": -6.278680324554443, + "step": 333 + }, + { + "epoch": 3.9585185185185185, + "grad_norm": 5.0014798868853045, + "learning_rate": 2.9653874125880167e-07, + "logits/chosen": -0.849825918674469, + "logits/rejected": -0.798513650894165, + "logps/chosen": -24.454782485961914, + "logps/rejected": -50.63400650024414, + "loss": 0.0486, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20302775502204895, + "rewards/margins": 5.389017105102539, + "rewards/rejected": -5.592044353485107, + "step": 334 + }, + { + "epoch": 3.9703703703703703, + "grad_norm": 5.388732511269792, + "learning_rate": 2.9526051863403517e-07, + "logits/chosen": -0.9176443815231323, + "logits/rejected": -0.8973706364631653, + "logps/chosen": -27.884946823120117, + "logps/rejected": -67.28817749023438, + "loss": 0.047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4531194865703583, + "rewards/margins": 7.574563503265381, + "rewards/rejected": -8.02768325805664, + "step": 335 + }, + { + "epoch": 3.982222222222222, + "grad_norm": 6.518100338649706, + "learning_rate": 2.9398107154979634e-07, + "logits/chosen": -1.078136920928955, + "logits/rejected": -1.0960880517959595, + "logps/chosen": -33.29356384277344, + "logps/rejected": -51.3941650390625, + "loss": 0.0602, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8194246292114258, + "rewards/margins": 4.724632263183594, + "rewards/rejected": -5.5440568923950195, + "step": 336 + }, + { + "epoch": 3.9940740740740743, + "grad_norm": 5.2039964922640065, + "learning_rate": 2.9270043461972097e-07, + "logits/chosen": -1.067875623703003, + "logits/rejected": -0.8327341675758362, + "logps/chosen": -35.44911575317383, + "logps/rejected": -61.61286544799805, + "loss": 0.051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.434535264968872, + "rewards/margins": 6.484606742858887, + "rewards/rejected": -7.919142723083496, + "step": 337 + }, + { + "epoch": 4.005925925925926, + "grad_norm": 4.187898216155283, + "learning_rate": 2.9141864248963427e-07, + "logits/chosen": -1.328801155090332, + "logits/rejected": -1.3687348365783691, + "logps/chosen": -25.005905151367188, + "logps/rejected": -51.1596794128418, + "loss": 0.0362, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5326629877090454, + "rewards/margins": 5.868394374847412, + "rewards/rejected": -6.401058197021484, + "step": 338 + }, + { + "epoch": 4.017777777777778, + "grad_norm": 3.1974382675774202, + "learning_rate": 2.9013572983661375e-07, + "logits/chosen": -0.5688086152076721, + "logits/rejected": -0.5790220499038696, + "logps/chosen": -36.34614181518555, + "logps/rejected": -59.652000427246094, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.793825626373291, + "rewards/margins": 6.1709794998168945, + "rewards/rejected": -6.964805603027344, + "step": 339 + }, + { + "epoch": 4.029629629629629, + "grad_norm": 4.158358612889323, + "learning_rate": 2.8885173136805125e-07, + "logits/chosen": -1.0552133321762085, + "logits/rejected": -0.8415440320968628, + "logps/chosen": -41.49494934082031, + "logps/rejected": -57.47507858276367, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.504111409187317, + "rewards/margins": 5.205606937408447, + "rewards/rejected": -6.709719181060791, + "step": 340 + }, + { + "epoch": 4.0414814814814815, + "grad_norm": 3.365949611740308, + "learning_rate": 2.8756668182071357e-07, + "logits/chosen": -0.9030847549438477, + "logits/rejected": -1.0248281955718994, + "logps/chosen": -22.62586784362793, + "logps/rejected": -57.38360595703125, + "loss": 0.0332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8877655267715454, + "rewards/margins": 5.815491199493408, + "rewards/rejected": -6.703256607055664, + "step": 341 + }, + { + "epoch": 4.053333333333334, + "grad_norm": 3.7482771926949994, + "learning_rate": 2.862806159598032e-07, + "logits/chosen": -1.1101518869400024, + "logits/rejected": -1.2761971950531006, + "logps/chosen": -27.846036911010742, + "logps/rejected": -53.2386360168457, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8876510858535767, + "rewards/margins": 5.131702899932861, + "rewards/rejected": -6.019353866577148, + "step": 342 + }, + { + "epoch": 4.065185185185185, + "grad_norm": 3.450390240669121, + "learning_rate": 2.8499356857801744e-07, + "logits/chosen": -1.0844999551773071, + "logits/rejected": -1.003774881362915, + "logps/chosen": -23.862234115600586, + "logps/rejected": -48.92105484008789, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2901920974254608, + "rewards/margins": 5.612967491149902, + "rewards/rejected": -5.903159141540527, + "step": 343 + }, + { + "epoch": 4.077037037037037, + "grad_norm": 3.894756023928179, + "learning_rate": 2.837055744946072e-07, + "logits/chosen": -1.0588653087615967, + "logits/rejected": -0.9954587817192078, + "logps/chosen": -33.45431137084961, + "logps/rejected": -54.49903106689453, + "loss": 0.0376, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7320315837860107, + "rewards/margins": 4.859332084655762, + "rewards/rejected": -5.591363430023193, + "step": 344 + }, + { + "epoch": 4.088888888888889, + "grad_norm": 3.76201693248286, + "learning_rate": 2.8241666855443526e-07, + "logits/chosen": -1.0309534072875977, + "logits/rejected": -1.0565218925476074, + "logps/chosen": -25.159849166870117, + "logps/rejected": -55.27362823486328, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9085606336593628, + "rewards/margins": 5.737356185913086, + "rewards/rejected": -6.645916938781738, + "step": 345 + }, + { + "epoch": 4.100740740740741, + "grad_norm": 3.5741790516530005, + "learning_rate": 2.811268856270332e-07, + "logits/chosen": -0.6616291403770447, + "logits/rejected": -0.747644305229187, + "logps/chosen": -31.07400131225586, + "logps/rejected": -61.612125396728516, + "loss": 0.0322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5435906052589417, + "rewards/margins": 6.400445938110352, + "rewards/rejected": -6.944036483764648, + "step": 346 + }, + { + "epoch": 4.112592592592593, + "grad_norm": 3.704349704368933, + "learning_rate": 2.798362606056583e-07, + "logits/chosen": -1.056967854499817, + "logits/rejected": -1.1185215711593628, + "logps/chosen": -41.541934967041016, + "logps/rejected": -66.00812530517578, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4524059295654297, + "rewards/margins": 6.422599792480469, + "rewards/rejected": -7.875006198883057, + "step": 347 + }, + { + "epoch": 4.124444444444444, + "grad_norm": 4.484355122741482, + "learning_rate": 2.7854482840634965e-07, + "logits/chosen": -1.4520745277404785, + "logits/rejected": -1.226075291633606, + "logps/chosen": -31.625812530517578, + "logps/rejected": -38.356956481933594, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.793215274810791, + "rewards/margins": 3.6655571460723877, + "rewards/rejected": -4.458772659301758, + "step": 348 + }, + { + "epoch": 4.136296296296297, + "grad_norm": 3.734160255747896, + "learning_rate": 2.772526239669831e-07, + "logits/chosen": -0.7964825630187988, + "logits/rejected": -0.8359100222587585, + "logps/chosen": -25.68545150756836, + "logps/rejected": -69.7961196899414, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3613826334476471, + "rewards/margins": 6.706874370574951, + "rewards/rejected": -7.0682573318481445, + "step": 349 + }, + { + "epoch": 4.148148148148148, + "grad_norm": 3.980785661890267, + "learning_rate": 2.759596822463267e-07, + "logits/chosen": -1.2085440158843994, + "logits/rejected": -1.0254814624786377, + "logps/chosen": -29.036041259765625, + "logps/rejected": -51.60666275024414, + "loss": 0.0465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.908053457736969, + "rewards/margins": 5.615097522735596, + "rewards/rejected": -6.52315092086792, + "step": 350 + }, + { + "epoch": 4.16, + "grad_norm": 3.071416968940517, + "learning_rate": 2.746660382230944e-07, + "logits/chosen": -1.111541986465454, + "logits/rejected": -1.015653371810913, + "logps/chosen": -28.444028854370117, + "logps/rejected": -46.22932815551758, + "loss": 0.0265, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6811230182647705, + "rewards/margins": 4.413417816162109, + "rewards/rejected": -5.094541072845459, + "step": 351 + }, + { + "epoch": 4.1718518518518515, + "grad_norm": 3.379553943501446, + "learning_rate": 2.73371726895e-07, + "logits/chosen": -1.0623794794082642, + "logits/rejected": -0.7676531076431274, + "logps/chosen": -28.41644859313965, + "logps/rejected": -41.91035079956055, + "loss": 0.0358, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5011738538742065, + "rewards/margins": 4.736425876617432, + "rewards/rejected": -5.2375993728637695, + "step": 352 + }, + { + "epoch": 4.183703703703704, + "grad_norm": 3.6570535219048494, + "learning_rate": 2.7207678327781036e-07, + "logits/chosen": -0.9565964937210083, + "logits/rejected": -1.037332534790039, + "logps/chosen": -30.713109970092773, + "logps/rejected": -61.01609802246094, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.675375759601593, + "rewards/margins": 6.953289031982422, + "rewards/rejected": -7.628664970397949, + "step": 353 + }, + { + "epoch": 4.195555555555556, + "grad_norm": 3.110297173394316, + "learning_rate": 2.7078124240439793e-07, + "logits/chosen": -1.219465970993042, + "logits/rejected": -1.0699536800384521, + "logps/chosen": -27.52557373046875, + "logps/rejected": -58.77972412109375, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6273876428604126, + "rewards/margins": 6.733246803283691, + "rewards/rejected": -7.360633850097656, + "step": 354 + }, + { + "epoch": 4.207407407407407, + "grad_norm": 3.833981676241624, + "learning_rate": 2.6948513932379307e-07, + "logits/chosen": -0.987195611000061, + "logits/rejected": -1.073306918144226, + "logps/chosen": -26.008790969848633, + "logps/rejected": -57.6981315612793, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4490770399570465, + "rewards/margins": 5.844611167907715, + "rewards/rejected": -6.29368782043457, + "step": 355 + }, + { + "epoch": 4.2192592592592595, + "grad_norm": 4.554577776444402, + "learning_rate": 2.68188509100236e-07, + "logits/chosen": -1.3205063343048096, + "logits/rejected": -1.1293655633926392, + "logps/chosen": -35.74661636352539, + "logps/rejected": -57.69905471801758, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.129513144493103, + "rewards/margins": 5.85588264465332, + "rewards/rejected": -6.985395908355713, + "step": 356 + }, + { + "epoch": 4.231111111111111, + "grad_norm": 3.120753083179545, + "learning_rate": 2.668913868122279e-07, + "logits/chosen": -1.1194713115692139, + "logits/rejected": -1.103295087814331, + "logps/chosen": -24.680492401123047, + "logps/rejected": -60.474884033203125, + "loss": 0.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4337795376777649, + "rewards/margins": 6.006582260131836, + "rewards/rejected": -6.440362453460693, + "step": 357 + }, + { + "epoch": 4.242962962962963, + "grad_norm": 3.061931183232965, + "learning_rate": 2.6559380755158206e-07, + "logits/chosen": -0.8489376902580261, + "logits/rejected": -0.8107261061668396, + "logps/chosen": -34.79714584350586, + "logps/rejected": -73.919189453125, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6460020542144775, + "rewards/margins": 7.77467155456543, + "rewards/rejected": -9.420673370361328, + "step": 358 + }, + { + "epoch": 4.254814814814814, + "grad_norm": 2.8892305258233, + "learning_rate": 2.642958064224747e-07, + "logits/chosen": -1.3385485410690308, + "logits/rejected": -1.154805064201355, + "logps/chosen": -39.11994934082031, + "logps/rejected": -50.91279602050781, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9563882350921631, + "rewards/margins": 4.840402126312256, + "rewards/rejected": -5.796790599822998, + "step": 359 + }, + { + "epoch": 4.266666666666667, + "grad_norm": 3.476070988392766, + "learning_rate": 2.629974185404951e-07, + "logits/chosen": -0.8726380467414856, + "logits/rejected": -0.7995076179504395, + "logps/chosen": -36.67843246459961, + "logps/rejected": -66.06752014160156, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.965777575969696, + "rewards/margins": 6.506186485290527, + "rewards/rejected": -7.471963405609131, + "step": 360 + }, + { + "epoch": 4.278518518518519, + "grad_norm": 3.479867841295816, + "learning_rate": 2.616986790316952e-07, + "logits/chosen": -1.0837681293487549, + "logits/rejected": -1.1205651760101318, + "logps/chosen": -32.18258285522461, + "logps/rejected": -51.97950744628906, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.356218934059143, + "rewards/margins": 5.362008571624756, + "rewards/rejected": -6.718227386474609, + "step": 361 + }, + { + "epoch": 4.29037037037037, + "grad_norm": 3.836705526683735, + "learning_rate": 2.603996230316402e-07, + "logits/chosen": -1.235141396522522, + "logits/rejected": -1.0966753959655762, + "logps/chosen": -32.577171325683594, + "logps/rejected": -48.43400573730469, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0464580059051514, + "rewards/margins": 5.035365104675293, + "rewards/rejected": -6.081823348999023, + "step": 362 + }, + { + "epoch": 4.302222222222222, + "grad_norm": 4.050233783944833, + "learning_rate": 2.5910028568445716e-07, + "logits/chosen": -1.0736088752746582, + "logits/rejected": -0.9677655696868896, + "logps/chosen": -34.009925842285156, + "logps/rejected": -55.578128814697266, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9978185892105103, + "rewards/margins": 5.743760108947754, + "rewards/rejected": -6.741579055786133, + "step": 363 + }, + { + "epoch": 4.314074074074074, + "grad_norm": 3.229922787157764, + "learning_rate": 2.5780070214188474e-07, + "logits/chosen": -1.1710213422775269, + "logits/rejected": -0.8957576155662537, + "logps/chosen": -41.35856246948242, + "logps/rejected": -65.46880340576172, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9127827882766724, + "rewards/margins": 6.122142791748047, + "rewards/rejected": -8.03492546081543, + "step": 364 + }, + { + "epoch": 4.325925925925926, + "grad_norm": 3.113178376367222, + "learning_rate": 2.5650090756232226e-07, + "logits/chosen": -1.1073739528656006, + "logits/rejected": -1.2185232639312744, + "logps/chosen": -26.82334327697754, + "logps/rejected": -55.30256652832031, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8536565899848938, + "rewards/margins": 5.805755615234375, + "rewards/rejected": -6.659412384033203, + "step": 365 + }, + { + "epoch": 4.337777777777778, + "grad_norm": 3.247603709931723, + "learning_rate": 2.552009371098778e-07, + "logits/chosen": -1.2337862253189087, + "logits/rejected": -1.0264849662780762, + "logps/chosen": -32.95630645751953, + "logps/rejected": -52.31919860839844, + "loss": 0.0347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7740206122398376, + "rewards/margins": 5.251730918884277, + "rewards/rejected": -6.02575159072876, + "step": 366 + }, + { + "epoch": 4.3496296296296295, + "grad_norm": 3.3943763130724007, + "learning_rate": 2.5390082595341816e-07, + "logits/chosen": -1.2210817337036133, + "logits/rejected": -1.2176547050476074, + "logps/chosen": -26.88553810119629, + "logps/rejected": -59.887794494628906, + "loss": 0.0333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8426360487937927, + "rewards/margins": 6.936334609985352, + "rewards/rejected": -7.778970718383789, + "step": 367 + }, + { + "epoch": 4.361481481481482, + "grad_norm": 4.050869880734819, + "learning_rate": 2.5260060926561604e-07, + "logits/chosen": -0.9647274017333984, + "logits/rejected": -1.103495478630066, + "logps/chosen": -30.085792541503906, + "logps/rejected": -62.07667922973633, + "loss": 0.0403, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0259263515472412, + "rewards/margins": 6.067399024963379, + "rewards/rejected": -7.093325138092041, + "step": 368 + }, + { + "epoch": 4.373333333333333, + "grad_norm": 3.9890669950982125, + "learning_rate": 2.5130032222199954e-07, + "logits/chosen": -0.9496943950653076, + "logits/rejected": -1.1947317123413086, + "logps/chosen": -25.227035522460938, + "logps/rejected": -64.11875915527344, + "loss": 0.0348, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5789353847503662, + "rewards/margins": 5.880395889282227, + "rewards/rejected": -6.459331035614014, + "step": 369 + }, + { + "epoch": 4.385185185185185, + "grad_norm": 3.5570964159820257, + "learning_rate": 2.5e-07, + "logits/chosen": -1.3687735795974731, + "logits/rejected": -1.352237343788147, + "logps/chosen": -22.255563735961914, + "logps/rejected": -51.99371337890625, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23616445064544678, + "rewards/margins": 6.420718193054199, + "rewards/rejected": -6.6568827629089355, + "step": 370 + }, + { + "epoch": 4.397037037037037, + "grad_norm": 4.46941184264617, + "learning_rate": 2.4869967777800055e-07, + "logits/chosen": -1.0241132974624634, + "logits/rejected": -1.050920844078064, + "logps/chosen": -27.66728973388672, + "logps/rejected": -46.08578109741211, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3025878369808197, + "rewards/margins": 5.107050895690918, + "rewards/rejected": -5.40963888168335, + "step": 371 + }, + { + "epoch": 4.408888888888889, + "grad_norm": 3.4408179676361677, + "learning_rate": 2.4739939073438393e-07, + "logits/chosen": -1.1434128284454346, + "logits/rejected": -1.2375893592834473, + "logps/chosen": -27.630720138549805, + "logps/rejected": -57.22761535644531, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9288941621780396, + "rewards/margins": 6.560557842254639, + "rewards/rejected": -7.489451885223389, + "step": 372 + }, + { + "epoch": 4.420740740740741, + "grad_norm": 4.094893116865017, + "learning_rate": 2.460991740465819e-07, + "logits/chosen": -1.1973354816436768, + "logits/rejected": -0.9313310980796814, + "logps/chosen": -28.657133102416992, + "logps/rejected": -58.009613037109375, + "loss": 0.0407, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.197420358657837, + "rewards/margins": 6.4741129875183105, + "rewards/rejected": -7.671533107757568, + "step": 373 + }, + { + "epoch": 4.432592592592592, + "grad_norm": 3.817829925143414, + "learning_rate": 2.4479906289012216e-07, + "logits/chosen": -1.1140527725219727, + "logits/rejected": -0.9359537959098816, + "logps/chosen": -24.56032943725586, + "logps/rejected": -52.2829704284668, + "loss": 0.0413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37612664699554443, + "rewards/margins": 5.741087913513184, + "rewards/rejected": -6.117214679718018, + "step": 374 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 3.838687015003608, + "learning_rate": 2.434990924376778e-07, + "logits/chosen": -1.2416824102401733, + "logits/rejected": -1.3780392408370972, + "logps/chosen": -25.436866760253906, + "logps/rejected": -55.5416259765625, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0123906135559082, + "rewards/margins": 5.48138427734375, + "rewards/rejected": -6.493774890899658, + "step": 375 + }, + { + "epoch": 4.456296296296296, + "grad_norm": 3.2461595488442763, + "learning_rate": 2.421992978581152e-07, + "logits/chosen": -1.0624195337295532, + "logits/rejected": -1.0169167518615723, + "logps/chosen": -44.57556915283203, + "logps/rejected": -57.175086975097656, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.327272653579712, + "rewards/margins": 4.468148231506348, + "rewards/rejected": -5.795420169830322, + "step": 376 + }, + { + "epoch": 4.468148148148148, + "grad_norm": 3.8483104907201993, + "learning_rate": 2.4089971431554287e-07, + "logits/chosen": -1.3426092863082886, + "logits/rejected": -1.1109671592712402, + "logps/chosen": -38.246315002441406, + "logps/rejected": -52.28644561767578, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9233973026275635, + "rewards/margins": 4.656871795654297, + "rewards/rejected": -6.580268859863281, + "step": 377 + }, + { + "epoch": 4.48, + "grad_norm": 3.202841352923816, + "learning_rate": 2.3960037696835987e-07, + "logits/chosen": -1.1502716541290283, + "logits/rejected": -1.059588074684143, + "logps/chosen": -29.64549446105957, + "logps/rejected": -53.606666564941406, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0943526029586792, + "rewards/margins": 4.84756326675415, + "rewards/rejected": -5.941916465759277, + "step": 378 + }, + { + "epoch": 4.491851851851852, + "grad_norm": 3.544360039590633, + "learning_rate": 2.3830132096830475e-07, + "logits/chosen": -0.9218529462814331, + "logits/rejected": -0.9420297145843506, + "logps/chosen": -24.30426597595215, + "logps/rejected": -55.90702438354492, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09170357137918472, + "rewards/margins": 6.058189868927002, + "rewards/rejected": -6.149893760681152, + "step": 379 + }, + { + "epoch": 4.503703703703704, + "grad_norm": 3.1662180648764906, + "learning_rate": 2.3700258145950493e-07, + "logits/chosen": -1.1394635438919067, + "logits/rejected": -1.0172181129455566, + "logps/chosen": -33.90563201904297, + "logps/rejected": -55.3117790222168, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4586858749389648, + "rewards/margins": 5.450444221496582, + "rewards/rejected": -6.9091291427612305, + "step": 380 + }, + { + "epoch": 4.515555555555555, + "grad_norm": 3.735511679116059, + "learning_rate": 2.3570419357752518e-07, + "logits/chosen": -0.8611398935317993, + "logits/rejected": -0.926773190498352, + "logps/chosen": -29.910818099975586, + "logps/rejected": -64.39048767089844, + "loss": 0.0387, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.275660753250122, + "rewards/margins": 7.312250137329102, + "rewards/rejected": -8.587909698486328, + "step": 381 + }, + { + "epoch": 4.5274074074074075, + "grad_norm": 4.002410075302513, + "learning_rate": 2.3440619244841794e-07, + "logits/chosen": -0.9324372410774231, + "logits/rejected": -0.9605002403259277, + "logps/chosen": -25.845172882080078, + "logps/rejected": -57.81169891357422, + "loss": 0.0394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8365424871444702, + "rewards/margins": 6.228395938873291, + "rewards/rejected": -7.064938545227051, + "step": 382 + }, + { + "epoch": 4.539259259259259, + "grad_norm": 4.0545032191525, + "learning_rate": 2.3310861318777214e-07, + "logits/chosen": -1.2109543085098267, + "logits/rejected": -1.2952345609664917, + "logps/chosen": -24.969423294067383, + "logps/rejected": -50.565818786621094, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9535256028175354, + "rewards/margins": 5.523766994476318, + "rewards/rejected": -6.477292060852051, + "step": 383 + }, + { + "epoch": 4.551111111111111, + "grad_norm": 3.8412901445263117, + "learning_rate": 2.3181149089976404e-07, + "logits/chosen": -1.0407981872558594, + "logits/rejected": -1.0591387748718262, + "logps/chosen": -24.82103157043457, + "logps/rejected": -58.05794143676758, + "loss": 0.0334, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6855615973472595, + "rewards/margins": 6.282498836517334, + "rewards/rejected": -6.968060493469238, + "step": 384 + }, + { + "epoch": 4.562962962962963, + "grad_norm": 3.035878137954456, + "learning_rate": 2.30514860676207e-07, + "logits/chosen": -0.8315334320068359, + "logits/rejected": -0.6608816981315613, + "logps/chosen": -33.760494232177734, + "logps/rejected": -53.18769073486328, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0541050434112549, + "rewards/margins": 6.037905693054199, + "rewards/rejected": -7.092010498046875, + "step": 385 + }, + { + "epoch": 4.574814814814815, + "grad_norm": 3.046420967372995, + "learning_rate": 2.2921875759560207e-07, + "logits/chosen": -1.1959935426712036, + "logits/rejected": -1.0914936065673828, + "logps/chosen": -39.321346282958984, + "logps/rejected": -64.97684478759766, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.732681155204773, + "rewards/margins": 5.395863056182861, + "rewards/rejected": -7.128544807434082, + "step": 386 + }, + { + "epoch": 4.586666666666667, + "grad_norm": 3.5502888586749517, + "learning_rate": 2.2792321672218967e-07, + "logits/chosen": -0.7373791337013245, + "logits/rejected": -0.7411605715751648, + "logps/chosen": -28.97515106201172, + "logps/rejected": -61.38172912597656, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8913424015045166, + "rewards/margins": 6.593505859375, + "rewards/rejected": -7.4848480224609375, + "step": 387 + }, + { + "epoch": 4.598518518518518, + "grad_norm": 3.4771354125546625, + "learning_rate": 2.2662827310499995e-07, + "logits/chosen": -1.158825397491455, + "logits/rejected": -0.9315577745437622, + "logps/chosen": -39.89890670776367, + "logps/rejected": -61.11587142944336, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5864561796188354, + "rewards/margins": 5.710073471069336, + "rewards/rejected": -7.296529769897461, + "step": 388 + }, + { + "epoch": 4.6103703703703705, + "grad_norm": 3.8828355066051503, + "learning_rate": 2.2533396177690562e-07, + "logits/chosen": -0.9048175811767578, + "logits/rejected": -0.7590952515602112, + "logps/chosen": -30.942026138305664, + "logps/rejected": -51.84413528442383, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0910900831222534, + "rewards/margins": 4.568852424621582, + "rewards/rejected": -5.659942626953125, + "step": 389 + }, + { + "epoch": 4.622222222222222, + "grad_norm": 3.4046648013267315, + "learning_rate": 2.2404031775367332e-07, + "logits/chosen": -1.1725707054138184, + "logits/rejected": -1.0109634399414062, + "logps/chosen": -26.178606033325195, + "logps/rejected": -56.07292175292969, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29053014516830444, + "rewards/margins": 6.551156520843506, + "rewards/rejected": -6.841686248779297, + "step": 390 + }, + { + "epoch": 4.634074074074074, + "grad_norm": 4.3144590946344445, + "learning_rate": 2.227473760330169e-07, + "logits/chosen": -1.1439464092254639, + "logits/rejected": -1.023298978805542, + "logps/chosen": -26.24483299255371, + "logps/rejected": -43.60441589355469, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9386296272277832, + "rewards/margins": 4.656396865844727, + "rewards/rejected": -5.595026016235352, + "step": 391 + }, + { + "epoch": 4.645925925925926, + "grad_norm": 3.9806588551475217, + "learning_rate": 2.2145517159365043e-07, + "logits/chosen": -1.1556141376495361, + "logits/rejected": -0.9479185342788696, + "logps/chosen": -38.38218688964844, + "logps/rejected": -58.405189514160156, + "loss": 0.0314, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3316694498062134, + "rewards/margins": 6.8166351318359375, + "rewards/rejected": -8.148303985595703, + "step": 392 + }, + { + "epoch": 4.657777777777778, + "grad_norm": 3.6433642546456553, + "learning_rate": 2.2016373939434166e-07, + "logits/chosen": -1.053426742553711, + "logits/rejected": -1.010209560394287, + "logps/chosen": -32.55500793457031, + "logps/rejected": -55.443321228027344, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6891828775405884, + "rewards/margins": 6.006121635437012, + "rewards/rejected": -6.695303916931152, + "step": 393 + }, + { + "epoch": 4.66962962962963, + "grad_norm": 3.7812535316967035, + "learning_rate": 2.1887311437296684e-07, + "logits/chosen": -0.9255619049072266, + "logits/rejected": -0.7529337406158447, + "logps/chosen": -28.533212661743164, + "logps/rejected": -51.62010955810547, + "loss": 0.039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5108724236488342, + "rewards/margins": 5.3941545486450195, + "rewards/rejected": -5.905027389526367, + "step": 394 + }, + { + "epoch": 4.681481481481481, + "grad_norm": 2.702856391414899, + "learning_rate": 2.175833314455647e-07, + "logits/chosen": -1.06898033618927, + "logits/rejected": -1.0634517669677734, + "logps/chosen": -46.982357025146484, + "logps/rejected": -84.75677490234375, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7449123859405518, + "rewards/margins": 7.542050838470459, + "rewards/rejected": -9.286964416503906, + "step": 395 + }, + { + "epoch": 4.693333333333333, + "grad_norm": 3.1373317717455516, + "learning_rate": 2.162944255053928e-07, + "logits/chosen": -1.2334667444229126, + "logits/rejected": -1.0262863636016846, + "logps/chosen": -27.508880615234375, + "logps/rejected": -52.11289978027344, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0616217851638794, + "rewards/margins": 5.611589431762695, + "rewards/rejected": -6.673211574554443, + "step": 396 + }, + { + "epoch": 4.705185185185185, + "grad_norm": 2.5700965268119016, + "learning_rate": 2.1500643142198264e-07, + "logits/chosen": -1.18964684009552, + "logits/rejected": -1.1267262697219849, + "logps/chosen": -28.83340072631836, + "logps/rejected": -50.370338439941406, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.270019292831421, + "rewards/margins": 5.287970066070557, + "rewards/rejected": -6.557989120483398, + "step": 397 + }, + { + "epoch": 4.717037037037037, + "grad_norm": 4.377276641204308, + "learning_rate": 2.137193840401968e-07, + "logits/chosen": -0.9488641619682312, + "logits/rejected": -0.6226259469985962, + "logps/chosen": -42.97636032104492, + "logps/rejected": -64.65530395507812, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.293939232826233, + "rewards/margins": 5.974466800689697, + "rewards/rejected": -7.268405914306641, + "step": 398 + }, + { + "epoch": 4.728888888888889, + "grad_norm": 4.104797851994462, + "learning_rate": 2.1243331817928643e-07, + "logits/chosen": -1.1831551790237427, + "logits/rejected": -0.9473021030426025, + "logps/chosen": -30.772737503051758, + "logps/rejected": -56.16963577270508, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5397346019744873, + "rewards/margins": 6.304899215698242, + "rewards/rejected": -6.84463357925415, + "step": 399 + }, + { + "epoch": 4.7407407407407405, + "grad_norm": 3.548586845462408, + "learning_rate": 2.1114826863194878e-07, + "logits/chosen": -1.1657699346542358, + "logits/rejected": -1.1083228588104248, + "logps/chosen": -25.673093795776367, + "logps/rejected": -48.5228271484375, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7490501999855042, + "rewards/margins": 5.02781343460083, + "rewards/rejected": -5.7768635749816895, + "step": 400 + }, + { + "epoch": 4.752592592592593, + "grad_norm": 4.513251688450835, + "learning_rate": 2.0986427016338623e-07, + "logits/chosen": -0.6789465546607971, + "logits/rejected": -0.6936579346656799, + "logps/chosen": -30.768882751464844, + "logps/rejected": -57.21465301513672, + "loss": 0.0411, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0039122104644775, + "rewards/margins": 6.220263957977295, + "rewards/rejected": -7.224175453186035, + "step": 401 + }, + { + "epoch": 4.764444444444445, + "grad_norm": 3.213793093164246, + "learning_rate": 2.0858135751036568e-07, + "logits/chosen": -1.1852058172225952, + "logits/rejected": -0.9876963496208191, + "logps/chosen": -24.95799446105957, + "logps/rejected": -58.4112548828125, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1165269613265991, + "rewards/margins": 6.706012725830078, + "rewards/rejected": -7.822539329528809, + "step": 402 + }, + { + "epoch": 4.776296296296296, + "grad_norm": 3.3765755696597592, + "learning_rate": 2.0729956538027904e-07, + "logits/chosen": -1.2950583696365356, + "logits/rejected": -1.2782042026519775, + "logps/chosen": -28.596763610839844, + "logps/rejected": -55.36784744262695, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.079484224319458, + "rewards/margins": 6.282561779022217, + "rewards/rejected": -7.362045764923096, + "step": 403 + }, + { + "epoch": 4.7881481481481485, + "grad_norm": 3.7436046482867567, + "learning_rate": 2.060189284502037e-07, + "logits/chosen": -1.0621609687805176, + "logits/rejected": -1.1178123950958252, + "logps/chosen": -27.336435317993164, + "logps/rejected": -46.967952728271484, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33046185970306396, + "rewards/margins": 4.788575172424316, + "rewards/rejected": -5.11903715133667, + "step": 404 + }, + { + "epoch": 4.8, + "grad_norm": 2.2858826860151678, + "learning_rate": 2.0473948136596486e-07, + "logits/chosen": -1.21229887008667, + "logits/rejected": -1.2507023811340332, + "logps/chosen": -31.56397247314453, + "logps/rejected": -67.93631744384766, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2148863077163696, + "rewards/margins": 8.519362449645996, + "rewards/rejected": -9.734248161315918, + "step": 405 + }, + { + "epoch": 4.811851851851852, + "grad_norm": 4.197533331570636, + "learning_rate": 2.0346125874119838e-07, + "logits/chosen": -1.1015260219573975, + "logits/rejected": -1.130777359008789, + "logps/chosen": -28.210693359375, + "logps/rejected": -64.23535919189453, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4305483102798462, + "rewards/margins": 6.870315074920654, + "rewards/rejected": -8.300863265991211, + "step": 406 + }, + { + "epoch": 4.823703703703703, + "grad_norm": 3.7607706364061055, + "learning_rate": 2.0218429515641368e-07, + "logits/chosen": -1.361039400100708, + "logits/rejected": -1.3556692600250244, + "logps/chosen": -21.602588653564453, + "logps/rejected": -55.4856071472168, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6024814248085022, + "rewards/margins": 6.8341474533081055, + "rewards/rejected": -7.436628341674805, + "step": 407 + }, + { + "epoch": 4.835555555555556, + "grad_norm": 2.961152190191642, + "learning_rate": 2.0090862515805895e-07, + "logits/chosen": -1.1544498205184937, + "logits/rejected": -1.037913203239441, + "logps/chosen": -26.241416931152344, + "logps/rejected": -55.529850006103516, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1264173984527588, + "rewards/margins": 6.888669490814209, + "rewards/rejected": -8.015087127685547, + "step": 408 + }, + { + "epoch": 4.847407407407408, + "grad_norm": 3.800097064071459, + "learning_rate": 1.9963428325758613e-07, + "logits/chosen": -1.397064447402954, + "logits/rejected": -0.9890981316566467, + "logps/chosen": -37.16471862792969, + "logps/rejected": -63.38473129272461, + "loss": 0.0314, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5235031843185425, + "rewards/margins": 6.591093063354492, + "rewards/rejected": -8.114595413208008, + "step": 409 + }, + { + "epoch": 4.859259259259259, + "grad_norm": 3.6060737188077017, + "learning_rate": 1.983613039305173e-07, + "logits/chosen": -1.189021348953247, + "logits/rejected": -1.1319453716278076, + "logps/chosen": -33.33507537841797, + "logps/rejected": -58.435791015625, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7115690112113953, + "rewards/margins": 6.030098915100098, + "rewards/rejected": -6.741668224334717, + "step": 410 + }, + { + "epoch": 4.871111111111111, + "grad_norm": 3.8852443618244883, + "learning_rate": 1.9708972161551213e-07, + "logits/chosen": -1.3724188804626465, + "logits/rejected": -1.3005732297897339, + "logps/chosen": -36.054195404052734, + "logps/rejected": -61.728668212890625, + "loss": 0.0335, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7809228897094727, + "rewards/margins": 5.685511112213135, + "rewards/rejected": -7.466434478759766, + "step": 411 + }, + { + "epoch": 4.882962962962963, + "grad_norm": 3.6521043308854098, + "learning_rate": 1.9581957071343588e-07, + "logits/chosen": -1.1163625717163086, + "logits/rejected": -1.1054105758666992, + "logps/chosen": -22.591720581054688, + "logps/rejected": -56.60470962524414, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49366194009780884, + "rewards/margins": 7.763273239135742, + "rewards/rejected": -8.256935119628906, + "step": 412 + }, + { + "epoch": 4.894814814814815, + "grad_norm": 2.844443421722238, + "learning_rate": 1.9455088558642932e-07, + "logits/chosen": -0.8721749186515808, + "logits/rejected": -1.0191127061843872, + "logps/chosen": -22.482418060302734, + "logps/rejected": -54.2562370300293, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4827694892883301, + "rewards/margins": 6.972842693328857, + "rewards/rejected": -7.455611705780029, + "step": 413 + }, + { + "epoch": 4.906666666666666, + "grad_norm": 3.1150557643187424, + "learning_rate": 1.9328370055697832e-07, + "logits/chosen": -1.18574857711792, + "logits/rejected": -0.9993014931678772, + "logps/chosen": -36.91192626953125, + "logps/rejected": -46.982643127441406, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5664029121398926, + "rewards/margins": 4.784036636352539, + "rewards/rejected": -6.35044002532959, + "step": 414 + }, + { + "epoch": 4.9185185185185185, + "grad_norm": 3.1584595235265462, + "learning_rate": 1.9201804990698616e-07, + "logits/chosen": -1.2075750827789307, + "logits/rejected": -1.027672290802002, + "logps/chosen": -30.833179473876953, + "logps/rejected": -64.46182250976562, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.332330584526062, + "rewards/margins": 7.531144142150879, + "rewards/rejected": -8.86347484588623, + "step": 415 + }, + { + "epoch": 4.930370370370371, + "grad_norm": 3.1465761585985264, + "learning_rate": 1.907539678768453e-07, + "logits/chosen": -1.372673749923706, + "logits/rejected": -1.3895915746688843, + "logps/chosen": -27.67334747314453, + "logps/rejected": -49.52977752685547, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9234645962715149, + "rewards/margins": 4.2349162101745605, + "rewards/rejected": -5.15838098526001, + "step": 416 + }, + { + "epoch": 4.942222222222222, + "grad_norm": 2.42226431528538, + "learning_rate": 1.8949148866451152e-07, + "logits/chosen": -1.155286431312561, + "logits/rejected": -1.319460391998291, + "logps/chosen": -25.511240005493164, + "logps/rejected": -67.6077651977539, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.60393226146698, + "rewards/margins": 7.415879726409912, + "rewards/rejected": -8.01981258392334, + "step": 417 + }, + { + "epoch": 4.954074074074074, + "grad_norm": 3.75114428997573, + "learning_rate": 1.8823064642457876e-07, + "logits/chosen": -1.5565588474273682, + "logits/rejected": -1.2644767761230469, + "logps/chosen": -29.305103302001953, + "logps/rejected": -61.24723815917969, + "loss": 0.0334, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.068399429321289, + "rewards/margins": 6.726673603057861, + "rewards/rejected": -7.79507303237915, + "step": 418 + }, + { + "epoch": 4.965925925925926, + "grad_norm": 4.195691110710784, + "learning_rate": 1.8697147526735466e-07, + "logits/chosen": -0.9303781390190125, + "logits/rejected": -1.2051749229431152, + "logps/chosen": -29.477602005004883, + "logps/rejected": -69.74530792236328, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8630962371826172, + "rewards/margins": 7.451139450073242, + "rewards/rejected": -9.314236640930176, + "step": 419 + }, + { + "epoch": 4.977777777777778, + "grad_norm": 3.162433032789351, + "learning_rate": 1.8571400925793852e-07, + "logits/chosen": -0.7229827642440796, + "logits/rejected": -0.685709536075592, + "logps/chosen": -24.86301040649414, + "logps/rejected": -57.18008041381836, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.692309558391571, + "rewards/margins": 6.926962375640869, + "rewards/rejected": -7.619271755218506, + "step": 420 + }, + { + "epoch": 4.989629629629629, + "grad_norm": 4.385567117559209, + "learning_rate": 1.844582824152988e-07, + "logits/chosen": -1.165217638015747, + "logits/rejected": -0.9067272543907166, + "logps/chosen": -42.31658172607422, + "logps/rejected": -66.11439514160156, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9213820695877075, + "rewards/margins": 6.796330451965332, + "rewards/rejected": -8.71771240234375, + "step": 421 + }, + { + "epoch": 5.001481481481481, + "grad_norm": 3.4081975873995063, + "learning_rate": 1.8320432871135376e-07, + "logits/chosen": -1.0193674564361572, + "logits/rejected": -0.9758960604667664, + "logps/chosen": -28.47673797607422, + "logps/rejected": -48.93853759765625, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7848235964775085, + "rewards/margins": 4.504067897796631, + "rewards/rejected": -5.288891792297363, + "step": 422 + }, + { + "epoch": 5.013333333333334, + "grad_norm": 2.9484160415611984, + "learning_rate": 1.8195218207005136e-07, + "logits/chosen": -0.9763575792312622, + "logits/rejected": -1.1632423400878906, + "logps/chosen": -37.08030319213867, + "logps/rejected": -64.4261703491211, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5703556537628174, + "rewards/margins": 5.08331298828125, + "rewards/rejected": -6.653668403625488, + "step": 423 + }, + { + "epoch": 5.025185185185185, + "grad_norm": 3.2800699811427507, + "learning_rate": 1.8070187636645237e-07, + "logits/chosen": -1.2202644348144531, + "logits/rejected": -0.9552056193351746, + "logps/chosen": -36.58311080932617, + "logps/rejected": -59.91649627685547, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.681425929069519, + "rewards/margins": 6.159786224365234, + "rewards/rejected": -7.841211318969727, + "step": 424 + }, + { + "epoch": 5.037037037037037, + "grad_norm": 2.3754727914899125, + "learning_rate": 1.7945344542581353e-07, + "logits/chosen": -1.0853219032287598, + "logits/rejected": -0.8817991018295288, + "logps/chosen": -33.09999465942383, + "logps/rejected": -67.8524169921875, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1994686126708984, + "rewards/margins": 7.697717666625977, + "rewards/rejected": -8.897185325622559, + "step": 425 + }, + { + "epoch": 5.0488888888888885, + "grad_norm": 3.033283182538126, + "learning_rate": 1.782069230226725e-07, + "logits/chosen": -1.003400206565857, + "logits/rejected": -1.0244035720825195, + "logps/chosen": -24.51974868774414, + "logps/rejected": -56.68452835083008, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36259013414382935, + "rewards/margins": 7.2209672927856445, + "rewards/rejected": -7.583556652069092, + "step": 426 + }, + { + "epoch": 5.060740740740741, + "grad_norm": 2.82767361060403, + "learning_rate": 1.7696234287993413e-07, + "logits/chosen": -0.9122418761253357, + "logits/rejected": -0.8946365118026733, + "logps/chosen": -37.71500015258789, + "logps/rejected": -70.05258178710938, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4320430755615234, + "rewards/margins": 7.710330486297607, + "rewards/rejected": -9.142374038696289, + "step": 427 + }, + { + "epoch": 5.072592592592593, + "grad_norm": 2.5624017267747266, + "learning_rate": 1.7571973866795813e-07, + "logits/chosen": -1.1369318962097168, + "logits/rejected": -0.8684489727020264, + "logps/chosen": -43.49992370605469, + "logps/rejected": -59.39040756225586, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4235126972198486, + "rewards/margins": 6.148205757141113, + "rewards/rejected": -7.571718215942383, + "step": 428 + }, + { + "epoch": 5.084444444444444, + "grad_norm": 3.295544800188778, + "learning_rate": 1.7447914400364833e-07, + "logits/chosen": -1.2425228357315063, + "logits/rejected": -1.266234040260315, + "logps/chosen": -26.035186767578125, + "logps/rejected": -49.9383544921875, + "loss": 0.0314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4317536950111389, + "rewards/margins": 5.771049976348877, + "rewards/rejected": -6.202803611755371, + "step": 429 + }, + { + "epoch": 5.0962962962962965, + "grad_norm": 2.9432919280317735, + "learning_rate": 1.7324059244954292e-07, + "logits/chosen": -1.421688437461853, + "logits/rejected": -1.51046621799469, + "logps/chosen": -29.30355453491211, + "logps/rejected": -51.56528091430664, + "loss": 0.0266, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2735320329666138, + "rewards/margins": 5.418951988220215, + "rewards/rejected": -6.692483425140381, + "step": 430 + }, + { + "epoch": 5.108148148148148, + "grad_norm": 2.951889299596433, + "learning_rate": 1.720041175129066e-07, + "logits/chosen": -0.8546741008758545, + "logits/rejected": -0.8645275831222534, + "logps/chosen": -28.239816665649414, + "logps/rejected": -57.45808792114258, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.293044924736023, + "rewards/margins": 6.973755359649658, + "rewards/rejected": -8.266800880432129, + "step": 431 + }, + { + "epoch": 5.12, + "grad_norm": 2.987267713140231, + "learning_rate": 1.7076975264482433e-07, + "logits/chosen": -1.0786815881729126, + "logits/rejected": -0.9784872531890869, + "logps/chosen": -34.05535125732422, + "logps/rejected": -56.35095977783203, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0851895809173584, + "rewards/margins": 5.206171989440918, + "rewards/rejected": -6.291361331939697, + "step": 432 + }, + { + "epoch": 5.131851851851851, + "grad_norm": 3.143579423533942, + "learning_rate": 1.6953753123929595e-07, + "logits/chosen": -1.2658149003982544, + "logits/rejected": -1.2656917572021484, + "logps/chosen": -21.0126895904541, + "logps/rejected": -56.76115798950195, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6274415254592896, + "rewards/margins": 7.949789524078369, + "rewards/rejected": -8.577230453491211, + "step": 433 + }, + { + "epoch": 5.143703703703704, + "grad_norm": 3.0660601990113645, + "learning_rate": 1.6830748663233303e-07, + "logits/chosen": -1.1669660806655884, + "logits/rejected": -1.2397806644439697, + "logps/chosen": -31.919248580932617, + "logps/rejected": -63.46931838989258, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6147698163986206, + "rewards/margins": 6.417461395263672, + "rewards/rejected": -8.032230377197266, + "step": 434 + }, + { + "epoch": 5.155555555555556, + "grad_norm": 2.860051014922371, + "learning_rate": 1.6707965210105687e-07, + "logits/chosen": -0.5690155029296875, + "logits/rejected": -0.7288935780525208, + "logps/chosen": -29.347030639648438, + "logps/rejected": -70.51150512695312, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8469069600105286, + "rewards/margins": 7.336713790893555, + "rewards/rejected": -8.183621406555176, + "step": 435 + }, + { + "epoch": 5.167407407407407, + "grad_norm": 3.495746895563705, + "learning_rate": 1.6585406086279846e-07, + "logits/chosen": -0.7923306226730347, + "logits/rejected": -0.900981068611145, + "logps/chosen": -34.12835693359375, + "logps/rejected": -65.13916015625, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2421728372573853, + "rewards/margins": 5.556445121765137, + "rewards/rejected": -6.798617839813232, + "step": 436 + }, + { + "epoch": 5.1792592592592595, + "grad_norm": 2.773845959525342, + "learning_rate": 1.6463074607419942e-07, + "logits/chosen": -1.100500464439392, + "logits/rejected": -1.0029168128967285, + "logps/chosen": -37.937015533447266, + "logps/rejected": -46.45364761352539, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0404378175735474, + "rewards/margins": 4.71007776260376, + "rewards/rejected": -5.750515460968018, + "step": 437 + }, + { + "epoch": 5.191111111111111, + "grad_norm": 3.6862802786560187, + "learning_rate": 1.6340974083031523e-07, + "logits/chosen": -1.1192785501480103, + "logits/rejected": -0.712954580783844, + "logps/chosen": -39.31867980957031, + "logps/rejected": -59.20005798339844, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3697164058685303, + "rewards/margins": 7.339575290679932, + "rewards/rejected": -8.7092924118042, + "step": 438 + }, + { + "epoch": 5.202962962962963, + "grad_norm": 2.6145012544108637, + "learning_rate": 1.6219107816372024e-07, + "logits/chosen": -1.0265417098999023, + "logits/rejected": -1.0565710067749023, + "logps/chosen": -24.545156478881836, + "logps/rejected": -57.86946105957031, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8660164475440979, + "rewards/margins": 6.856147289276123, + "rewards/rejected": -7.722163677215576, + "step": 439 + }, + { + "epoch": 5.214814814814815, + "grad_norm": 2.4016912622887774, + "learning_rate": 1.6097479104361326e-07, + "logits/chosen": -0.8214901685714722, + "logits/rejected": -0.4467310905456543, + "logps/chosen": -34.378780364990234, + "logps/rejected": -54.939334869384766, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4449349641799927, + "rewards/margins": 6.770240306854248, + "rewards/rejected": -7.215175628662109, + "step": 440 + }, + { + "epoch": 5.226666666666667, + "grad_norm": 3.1135292579652587, + "learning_rate": 1.5976091237492634e-07, + "logits/chosen": -0.9876181483268738, + "logits/rejected": -0.9957047700881958, + "logps/chosen": -38.20779037475586, + "logps/rejected": -80.09854888916016, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8875401020050049, + "rewards/margins": 6.763437747955322, + "rewards/rejected": -8.650979042053223, + "step": 441 + }, + { + "epoch": 5.238518518518519, + "grad_norm": 3.663082317807131, + "learning_rate": 1.5854947499743413e-07, + "logits/chosen": -1.130734920501709, + "logits/rejected": -1.0467098951339722, + "logps/chosen": -43.32159423828125, + "logps/rejected": -72.49787902832031, + "loss": 0.0266, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9329556226730347, + "rewards/margins": 7.5610575675964355, + "rewards/rejected": -9.494012832641602, + "step": 442 + }, + { + "epoch": 5.25037037037037, + "grad_norm": 3.053911267648753, + "learning_rate": 1.573405116848656e-07, + "logits/chosen": -0.8695877194404602, + "logits/rejected": -1.033347487449646, + "logps/chosen": -26.546667098999023, + "logps/rejected": -55.25644302368164, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.794514536857605, + "rewards/margins": 5.641097545623779, + "rewards/rejected": -6.435612201690674, + "step": 443 + }, + { + "epoch": 5.262222222222222, + "grad_norm": 2.9630105870365546, + "learning_rate": 1.5613405514401757e-07, + "logits/chosen": -1.2900482416152954, + "logits/rejected": -0.8595216274261475, + "logps/chosen": -42.08282470703125, + "logps/rejected": -59.17259216308594, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2677626609802246, + "rewards/margins": 5.434237480163574, + "rewards/rejected": -6.701999664306641, + "step": 444 + }, + { + "epoch": 5.274074074074074, + "grad_norm": 3.028060174893764, + "learning_rate": 1.5493013801386923e-07, + "logits/chosen": -0.6489288210868835, + "logits/rejected": -0.8385549783706665, + "logps/chosen": -35.989559173583984, + "logps/rejected": -69.44566345214844, + "loss": 0.026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2387605905532837, + "rewards/margins": 7.3805999755859375, + "rewards/rejected": -8.61936092376709, + "step": 445 + }, + { + "epoch": 5.285925925925926, + "grad_norm": 2.9666582681649962, + "learning_rate": 1.537287928647002e-07, + "logits/chosen": -0.9402166604995728, + "logits/rejected": -0.888253927230835, + "logps/chosen": -42.73618698120117, + "logps/rejected": -75.50381469726562, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3216760158538818, + "rewards/margins": 7.429603099822998, + "rewards/rejected": -8.7512788772583, + "step": 446 + }, + { + "epoch": 5.297777777777778, + "grad_norm": 3.0746084836275913, + "learning_rate": 1.525300521972082e-07, + "logits/chosen": -1.1609197854995728, + "logits/rejected": -1.138796091079712, + "logps/chosen": -33.730140686035156, + "logps/rejected": -58.21106719970703, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.451420783996582, + "rewards/margins": 6.178999423980713, + "rewards/rejected": -7.630419731140137, + "step": 447 + }, + { + "epoch": 5.3096296296296295, + "grad_norm": 2.065490256316474, + "learning_rate": 1.513339484416309e-07, + "logits/chosen": -1.325588583946228, + "logits/rejected": -1.179955244064331, + "logps/chosen": -39.745296478271484, + "logps/rejected": -57.22673797607422, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0886282920837402, + "rewards/margins": 5.040305137634277, + "rewards/rejected": -7.128933429718018, + "step": 448 + }, + { + "epoch": 5.321481481481482, + "grad_norm": 2.565247253136151, + "learning_rate": 1.5014051395686766e-07, + "logits/chosen": -1.3147672414779663, + "logits/rejected": -1.3600736856460571, + "logps/chosen": -28.266178131103516, + "logps/rejected": -64.80318450927734, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2247536182403564, + "rewards/margins": 8.029702186584473, + "rewards/rejected": -9.25445556640625, + "step": 449 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 1.6866324861080002, + "learning_rate": 1.489497810296046e-07, + "logits/chosen": -1.4508033990859985, + "logits/rejected": -1.4773669242858887, + "logps/chosen": -37.937896728515625, + "logps/rejected": -70.61952209472656, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2039074897766113, + "rewards/margins": 6.989311218261719, + "rewards/rejected": -9.193219184875488, + "step": 450 + }, + { + "epoch": 5.345185185185185, + "grad_norm": 2.9199939457346016, + "learning_rate": 1.4776178187344105e-07, + "logits/chosen": -0.642178475856781, + "logits/rejected": -0.5968121290206909, + "logps/chosen": -33.17547607421875, + "logps/rejected": -73.9720458984375, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.686303973197937, + "rewards/margins": 7.578001499176025, + "rewards/rejected": -8.264305114746094, + "step": 451 + }, + { + "epoch": 5.357037037037037, + "grad_norm": 2.677154410735394, + "learning_rate": 1.4657654862801797e-07, + "logits/chosen": -1.3107982873916626, + "logits/rejected": -1.0267812013626099, + "logps/chosen": -33.44599914550781, + "logps/rejected": -51.6804084777832, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.964032769203186, + "rewards/margins": 5.850228309631348, + "rewards/rejected": -6.814260482788086, + "step": 452 + }, + { + "epoch": 5.368888888888889, + "grad_norm": 3.006190765683554, + "learning_rate": 1.4539411335814866e-07, + "logits/chosen": -0.8469028472900391, + "logits/rejected": -0.8455516695976257, + "logps/chosen": -34.55913543701172, + "logps/rejected": -65.2293701171875, + "loss": 0.0253, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0158193111419678, + "rewards/margins": 6.680525302886963, + "rewards/rejected": -7.696345329284668, + "step": 453 + }, + { + "epoch": 5.380740740740741, + "grad_norm": 2.903509151195431, + "learning_rate": 1.4421450805295082e-07, + "logits/chosen": -1.259004831314087, + "logits/rejected": -1.0897257328033447, + "logps/chosen": -34.40946960449219, + "logps/rejected": -54.516990661621094, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.26055908203125, + "rewards/margins": 4.923694610595703, + "rewards/rejected": -7.184253215789795, + "step": 454 + }, + { + "epoch": 5.392592592592592, + "grad_norm": 2.4956212068455947, + "learning_rate": 1.4303776462498186e-07, + "logits/chosen": -1.470017910003662, + "logits/rejected": -1.487809181213379, + "logps/chosen": -22.259977340698242, + "logps/rejected": -61.082489013671875, + "loss": 0.023, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6950967311859131, + "rewards/margins": 7.098940849304199, + "rewards/rejected": -7.794036865234375, + "step": 455 + }, + { + "epoch": 5.404444444444445, + "grad_norm": 2.1074775036840228, + "learning_rate": 1.418639149093748e-07, + "logits/chosen": -1.1679542064666748, + "logits/rejected": -1.185795783996582, + "logps/chosen": -31.977489471435547, + "logps/rejected": -60.19514465332031, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5915217399597168, + "rewards/margins": 6.129962921142578, + "rewards/rejected": -7.721484184265137, + "step": 456 + }, + { + "epoch": 5.416296296296296, + "grad_norm": 2.540463046994792, + "learning_rate": 1.406929906629774e-07, + "logits/chosen": -1.0157017707824707, + "logits/rejected": -0.8678162693977356, + "logps/chosen": -28.734365463256836, + "logps/rejected": -61.63572311401367, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1730878353118896, + "rewards/margins": 6.156033992767334, + "rewards/rejected": -7.329122066497803, + "step": 457 + }, + { + "epoch": 5.428148148148148, + "grad_norm": 2.5036228156389035, + "learning_rate": 1.3952502356349323e-07, + "logits/chosen": -0.8575838804244995, + "logits/rejected": -0.8398576378822327, + "logps/chosen": -25.19296646118164, + "logps/rejected": -50.973854064941406, + "loss": 0.0223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8319410085678101, + "rewards/margins": 5.684410095214844, + "rewards/rejected": -6.516350746154785, + "step": 458 + }, + { + "epoch": 5.44, + "grad_norm": 2.241821085085757, + "learning_rate": 1.38360045208624e-07, + "logits/chosen": -1.3081655502319336, + "logits/rejected": -1.1967523097991943, + "logps/chosen": -27.412466049194336, + "logps/rejected": -59.24693298339844, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4743008017539978, + "rewards/margins": 6.069201469421387, + "rewards/rejected": -6.543503284454346, + "step": 459 + }, + { + "epoch": 5.451851851851852, + "grad_norm": 2.8367212448316637, + "learning_rate": 1.371980871152157e-07, + "logits/chosen": -1.092638611793518, + "logits/rejected": -1.0296908617019653, + "logps/chosen": -40.45366668701172, + "logps/rejected": -66.97181701660156, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5610663890838623, + "rewards/margins": 6.595659255981445, + "rewards/rejected": -8.156725883483887, + "step": 460 + }, + { + "epoch": 5.463703703703704, + "grad_norm": 2.137948549440119, + "learning_rate": 1.3603918071840486e-07, + "logits/chosen": -1.4155157804489136, + "logits/rejected": -1.4184473752975464, + "logps/chosen": -27.53375244140625, + "logps/rejected": -57.63071060180664, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5443230867385864, + "rewards/margins": 6.214169502258301, + "rewards/rejected": -6.7584919929504395, + "step": 461 + }, + { + "epoch": 5.475555555555555, + "grad_norm": 3.172618852002527, + "learning_rate": 1.3488335737076911e-07, + "logits/chosen": -0.9026474356651306, + "logits/rejected": -1.121160864830017, + "logps/chosen": -27.04289436340332, + "logps/rejected": -65.71441650390625, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3873372077941895, + "rewards/margins": 7.394488334655762, + "rewards/rejected": -8.78182601928711, + "step": 462 + }, + { + "epoch": 5.4874074074074075, + "grad_norm": 2.932474050290164, + "learning_rate": 1.3373064834147817e-07, + "logits/chosen": -1.1298450231552124, + "logits/rejected": -1.2291852235794067, + "logps/chosen": -27.613210678100586, + "logps/rejected": -49.24211502075195, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7571765184402466, + "rewards/margins": 5.5813889503479, + "rewards/rejected": -6.338565826416016, + "step": 463 + }, + { + "epoch": 5.499259259259259, + "grad_norm": 1.9766796560537938, + "learning_rate": 1.3258108481544847e-07, + "logits/chosen": -1.0279194116592407, + "logits/rejected": -1.0094687938690186, + "logps/chosen": -28.237319946289062, + "logps/rejected": -59.598148345947266, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1605604887008667, + "rewards/margins": 7.338843822479248, + "rewards/rejected": -8.499403953552246, + "step": 464 + }, + { + "epoch": 5.511111111111111, + "grad_norm": 2.6137685278696936, + "learning_rate": 1.314346978924994e-07, + "logits/chosen": -1.3097034692764282, + "logits/rejected": -1.148521900177002, + "logps/chosen": -32.19894027709961, + "logps/rejected": -53.88103485107422, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4137828350067139, + "rewards/margins": 6.188781261444092, + "rewards/rejected": -7.602563858032227, + "step": 465 + }, + { + "epoch": 5.522962962962963, + "grad_norm": 3.107119868952029, + "learning_rate": 1.3029151858651143e-07, + "logits/chosen": -1.031536340713501, + "logits/rejected": -0.8185826539993286, + "logps/chosen": -32.56446075439453, + "logps/rejected": -59.30012512207031, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0495141744613647, + "rewards/margins": 6.904279708862305, + "rewards/rejected": -7.953794479370117, + "step": 466 + }, + { + "epoch": 5.534814814814815, + "grad_norm": 2.7144216036026543, + "learning_rate": 1.2915157782458802e-07, + "logits/chosen": -1.1127973794937134, + "logits/rejected": -1.0301717519760132, + "logps/chosen": -37.235652923583984, + "logps/rejected": -61.32948303222656, + "loss": 0.0234, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7692995071411133, + "rewards/margins": 6.307473659515381, + "rewards/rejected": -8.076772689819336, + "step": 467 + }, + { + "epoch": 5.546666666666667, + "grad_norm": 2.2942472784287875, + "learning_rate": 1.2801490644621788e-07, + "logits/chosen": -1.2554562091827393, + "logits/rejected": -1.1846646070480347, + "logps/chosen": -26.260223388671875, + "logps/rejected": -48.008331298828125, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0319814682006836, + "rewards/margins": 5.547323703765869, + "rewards/rejected": -6.5793046951293945, + "step": 468 + }, + { + "epoch": 5.558518518518518, + "grad_norm": 2.853816124944289, + "learning_rate": 1.268815352024416e-07, + "logits/chosen": -1.1085361242294312, + "logits/rejected": -1.1656684875488281, + "logps/chosen": -25.87590789794922, + "logps/rejected": -59.65959930419922, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1430463790893555, + "rewards/margins": 7.823276519775391, + "rewards/rejected": -8.966323852539062, + "step": 469 + }, + { + "epoch": 5.57037037037037, + "grad_norm": 2.5887796053682663, + "learning_rate": 1.257514947550189e-07, + "logits/chosen": -1.5515766143798828, + "logits/rejected": -1.3911482095718384, + "logps/chosen": -34.41626739501953, + "logps/rejected": -56.96218490600586, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6425933837890625, + "rewards/margins": 5.508365154266357, + "rewards/rejected": -7.150958061218262, + "step": 470 + }, + { + "epoch": 5.582222222222223, + "grad_norm": 2.4882096910217877, + "learning_rate": 1.2462481567559966e-07, + "logits/chosen": -0.8641619086265564, + "logits/rejected": -0.9501523971557617, + "logps/chosen": -24.07305145263672, + "logps/rejected": -56.903045654296875, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9779921174049377, + "rewards/margins": 6.467419624328613, + "rewards/rejected": -7.445411682128906, + "step": 471 + }, + { + "epoch": 5.594074074074074, + "grad_norm": 2.7668478038500153, + "learning_rate": 1.2350152844489688e-07, + "logits/chosen": -1.0938386917114258, + "logits/rejected": -1.007843255996704, + "logps/chosen": -36.72554016113281, + "logps/rejected": -72.5058364868164, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4429738521575928, + "rewards/margins": 8.357629776000977, + "rewards/rejected": -9.800602912902832, + "step": 472 + }, + { + "epoch": 5.605925925925926, + "grad_norm": 2.987152924412883, + "learning_rate": 1.2238166345186152e-07, + "logits/chosen": -1.024975299835205, + "logits/rejected": -1.231426477432251, + "logps/chosen": -29.665260314941406, + "logps/rejected": -77.71907043457031, + "loss": 0.0253, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0892112255096436, + "rewards/margins": 8.2877779006958, + "rewards/rejected": -10.376989364624023, + "step": 473 + }, + { + "epoch": 5.6177777777777775, + "grad_norm": 3.79148887390576, + "learning_rate": 1.2126525099286108e-07, + "logits/chosen": -0.9098807573318481, + "logits/rejected": -0.6766495108604431, + "logps/chosen": -34.021461486816406, + "logps/rejected": -60.278900146484375, + "loss": 0.0403, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.586209774017334, + "rewards/margins": 6.309510231018066, + "rewards/rejected": -7.895719528198242, + "step": 474 + }, + { + "epoch": 5.62962962962963, + "grad_norm": 2.790675449028797, + "learning_rate": 1.201523212708593e-07, + "logits/chosen": -1.2367568016052246, + "logits/rejected": -1.0846775770187378, + "logps/chosen": -32.69294738769531, + "logps/rejected": -56.54222869873047, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.708235263824463, + "rewards/margins": 5.9487504959106445, + "rewards/rejected": -7.656986236572266, + "step": 475 + }, + { + "epoch": 5.641481481481481, + "grad_norm": 2.603743326248802, + "learning_rate": 1.1904290439459971e-07, + "logits/chosen": -1.2075260877609253, + "logits/rejected": -1.1212043762207031, + "logps/chosen": -32.18739318847656, + "logps/rejected": -58.44438171386719, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.459539771080017, + "rewards/margins": 7.072033882141113, + "rewards/rejected": -8.531574249267578, + "step": 476 + }, + { + "epoch": 5.653333333333333, + "grad_norm": 2.9504541803133684, + "learning_rate": 1.1793703037779055e-07, + "logits/chosen": -1.1014938354492188, + "logits/rejected": -1.2106781005859375, + "logps/chosen": -26.133529663085938, + "logps/rejected": -69.6479721069336, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8243754506111145, + "rewards/margins": 8.763911247253418, + "rewards/rejected": -9.588286399841309, + "step": 477 + }, + { + "epoch": 5.6651851851851855, + "grad_norm": 2.6441222916008504, + "learning_rate": 1.1683472913829284e-07, + "logits/chosen": -1.0774693489074707, + "logits/rejected": -1.1322718858718872, + "logps/chosen": -31.37884521484375, + "logps/rejected": -64.80766296386719, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6216317415237427, + "rewards/margins": 6.900985240936279, + "rewards/rejected": -8.52261734008789, + "step": 478 + }, + { + "epoch": 5.677037037037037, + "grad_norm": 4.661580579547073, + "learning_rate": 1.1573603049731153e-07, + "logits/chosen": -1.20614755153656, + "logits/rejected": -0.8982871174812317, + "logps/chosen": -48.37188720703125, + "logps/rejected": -61.23235321044922, + "loss": 0.0319, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8891024589538574, + "rewards/margins": 6.148949146270752, + "rewards/rejected": -8.038052558898926, + "step": 479 + }, + { + "epoch": 5.688888888888889, + "grad_norm": 2.2006957188380785, + "learning_rate": 1.146409641785882e-07, + "logits/chosen": -1.3002790212631226, + "logits/rejected": -1.0651378631591797, + "logps/chosen": -25.08184051513672, + "logps/rejected": -49.062400817871094, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5322498679161072, + "rewards/margins": 6.5421624183654785, + "rewards/rejected": -7.074413299560547, + "step": 480 + }, + { + "epoch": 5.70074074074074, + "grad_norm": 2.6580827536506346, + "learning_rate": 1.1354955980759689e-07, + "logits/chosen": -1.0489715337753296, + "logits/rejected": -1.1641223430633545, + "logps/chosen": -34.531166076660156, + "logps/rejected": -67.30549621582031, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0368874073028564, + "rewards/margins": 7.102381229400635, + "rewards/rejected": -8.13926887512207, + "step": 481 + }, + { + "epoch": 5.712592592592593, + "grad_norm": 3.0113174027880363, + "learning_rate": 1.1246184691074314e-07, + "logits/chosen": -1.0164622068405151, + "logits/rejected": -0.9794116616249084, + "logps/chosen": -31.135639190673828, + "logps/rejected": -64.71720886230469, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5428776741027832, + "rewards/margins": 6.710593223571777, + "rewards/rejected": -8.253470420837402, + "step": 482 + }, + { + "epoch": 5.724444444444444, + "grad_norm": 2.615616578526652, + "learning_rate": 1.1137785491456453e-07, + "logits/chosen": -1.1091269254684448, + "logits/rejected": -0.7848995327949524, + "logps/chosen": -31.23629379272461, + "logps/rejected": -50.40385437011719, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2773419618606567, + "rewards/margins": 4.750949382781982, + "rewards/rejected": -6.02829122543335, + "step": 483 + }, + { + "epoch": 5.736296296296296, + "grad_norm": 2.188593697679317, + "learning_rate": 1.1029761314493518e-07, + "logits/chosen": -1.2400413751602173, + "logits/rejected": -1.1589823961257935, + "logps/chosen": -35.95014953613281, + "logps/rejected": -68.69471740722656, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5209312438964844, + "rewards/margins": 7.694516658782959, + "rewards/rejected": -9.215447425842285, + "step": 484 + }, + { + "epoch": 5.7481481481481485, + "grad_norm": 3.5115952525737826, + "learning_rate": 1.0922115082627196e-07, + "logits/chosen": -1.3057835102081299, + "logits/rejected": -1.158517599105835, + "logps/chosen": -35.280059814453125, + "logps/rejected": -68.58432006835938, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7615442276000977, + "rewards/margins": 7.462545871734619, + "rewards/rejected": -9.224090576171875, + "step": 485 + }, + { + "epoch": 5.76, + "grad_norm": 2.1510413328277123, + "learning_rate": 1.0814849708074414e-07, + "logits/chosen": -1.316745400428772, + "logits/rejected": -1.0071841478347778, + "logps/chosen": -30.68706703186035, + "logps/rejected": -59.9561882019043, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7682050466537476, + "rewards/margins": 7.350435256958008, + "rewards/rejected": -8.118640899658203, + "step": 486 + }, + { + "epoch": 5.771851851851852, + "grad_norm": 3.181914233461486, + "learning_rate": 1.070796809274853e-07, + "logits/chosen": -1.1496213674545288, + "logits/rejected": -1.1401625871658325, + "logps/chosen": -28.064807891845703, + "logps/rejected": -66.26860046386719, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7950414419174194, + "rewards/margins": 8.238448143005371, + "rewards/rejected": -10.033490180969238, + "step": 487 + }, + { + "epoch": 5.783703703703703, + "grad_norm": 2.5436639084646266, + "learning_rate": 1.0601473128180854e-07, + "logits/chosen": -1.0235388278961182, + "logits/rejected": -0.7555651068687439, + "logps/chosen": -37.206642150878906, + "logps/rejected": -62.38896942138672, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0393364429473877, + "rewards/margins": 7.206859111785889, + "rewards/rejected": -8.246195793151855, + "step": 488 + }, + { + "epoch": 5.795555555555556, + "grad_norm": 1.9218672278778193, + "learning_rate": 1.0495367695442392e-07, + "logits/chosen": -1.169034719467163, + "logits/rejected": -1.2706691026687622, + "logps/chosen": -27.194473266601562, + "logps/rejected": -62.6011962890625, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8176056146621704, + "rewards/margins": 6.933694362640381, + "rewards/rejected": -8.751298904418945, + "step": 489 + }, + { + "epoch": 5.807407407407408, + "grad_norm": 1.9977551486881404, + "learning_rate": 1.0389654665065908e-07, + "logits/chosen": -1.3202329874038696, + "logits/rejected": -1.480948567390442, + "logps/chosen": -28.038394927978516, + "logps/rejected": -61.36920166015625, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7177014946937561, + "rewards/margins": 7.891351222991943, + "rewards/rejected": -8.609053611755371, + "step": 490 + }, + { + "epoch": 5.819259259259259, + "grad_norm": 2.250794182792882, + "learning_rate": 1.0284336896968304e-07, + "logits/chosen": -1.0238221883773804, + "logits/rejected": -1.111476182937622, + "logps/chosen": -32.318992614746094, + "logps/rejected": -81.92007446289062, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8146699666976929, + "rewards/margins": 8.974444389343262, + "rewards/rejected": -10.78911304473877, + "step": 491 + }, + { + "epoch": 5.831111111111111, + "grad_norm": 3.0522503366900064, + "learning_rate": 1.0179417240373182e-07, + "logits/chosen": -1.0641810894012451, + "logits/rejected": -1.1482521295547485, + "logps/chosen": -27.489585876464844, + "logps/rejected": -64.82382202148438, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3054454326629639, + "rewards/margins": 7.11977481842041, + "rewards/rejected": -8.425220489501953, + "step": 492 + }, + { + "epoch": 5.842962962962963, + "grad_norm": 2.3010920457773105, + "learning_rate": 1.0074898533733833e-07, + "logits/chosen": -1.274016261100769, + "logits/rejected": -1.1447503566741943, + "logps/chosen": -36.9109001159668, + "logps/rejected": -71.91200256347656, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5965338945388794, + "rewards/margins": 7.742778778076172, + "rewards/rejected": -9.339312553405762, + "step": 493 + }, + { + "epoch": 5.854814814814815, + "grad_norm": 2.8527471395619894, + "learning_rate": 9.970783604656383e-08, + "logits/chosen": -1.2319526672363281, + "logits/rejected": -1.268723487854004, + "logps/chosen": -24.263778686523438, + "logps/rejected": -49.15123748779297, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2518236637115479, + "rewards/margins": 5.670311450958252, + "rewards/rejected": -6.922135353088379, + "step": 494 + }, + { + "epoch": 5.866666666666667, + "grad_norm": 3.7257951905167124, + "learning_rate": 9.867075269823353e-08, + "logits/chosen": -0.89796382188797, + "logits/rejected": -0.7868020534515381, + "logps/chosen": -33.333683013916016, + "logps/rejected": -56.5141487121582, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8444391489028931, + "rewards/margins": 6.135402202606201, + "rewards/rejected": -6.9798407554626465, + "step": 495 + }, + { + "epoch": 5.8785185185185185, + "grad_norm": 2.375314146679173, + "learning_rate": 9.763776334917398e-08, + "logits/chosen": -1.2864789962768555, + "logits/rejected": -1.2498905658721924, + "logps/chosen": -28.753864288330078, + "logps/rejected": -63.23728942871094, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3437196016311646, + "rewards/margins": 7.696622371673584, + "rewards/rejected": -9.040342330932617, + "step": 496 + }, + { + "epoch": 5.890370370370371, + "grad_norm": 2.3165745968499927, + "learning_rate": 9.660889594545469e-08, + "logits/chosen": -0.8466818928718567, + "logits/rejected": -0.88641756772995, + "logps/chosen": -31.070556640625, + "logps/rejected": -72.79938507080078, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4867016077041626, + "rewards/margins": 7.354016304016113, + "rewards/rejected": -8.840718269348145, + "step": 497 + }, + { + "epoch": 5.902222222222222, + "grad_norm": 3.081071543156241, + "learning_rate": 9.558417832163162e-08, + "logits/chosen": -1.1182941198349, + "logits/rejected": -1.2115066051483154, + "logps/chosen": -32.31890869140625, + "logps/rejected": -61.95922088623047, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6548502445220947, + "rewards/margins": 6.778643608093262, + "rewards/rejected": -8.433493614196777, + "step": 498 + }, + { + "epoch": 5.914074074074074, + "grad_norm": 3.3156804186603317, + "learning_rate": 9.456363819999419e-08, + "logits/chosen": -1.1249277591705322, + "logits/rejected": -1.197086215019226, + "logps/chosen": -29.88912582397461, + "logps/rejected": -65.54908752441406, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.750339388847351, + "rewards/margins": 6.563896179199219, + "rewards/rejected": -8.31423568725586, + "step": 499 + }, + { + "epoch": 5.925925925925926, + "grad_norm": 3.160346367183824, + "learning_rate": 9.354730318981561e-08, + "logits/chosen": -1.2213014364242554, + "logits/rejected": -0.8933899402618408, + "logps/chosen": -29.600210189819336, + "logps/rejected": -57.81951904296875, + "loss": 0.0247, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3094991445541382, + "rewards/margins": 6.17173433303833, + "rewards/rejected": -7.481233596801758, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 672, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}