diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10110 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.952662721893491, + "eval_steps": 1, + "global_step": 672, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011834319526627219, + "grad_norm": 55.38106439710106, + "learning_rate": 7.352941176470588e-09, + "logits/chosen": -0.587167501449585, + "logits/rejected": -0.6672874093055725, + "logps/chosen": -39.686065673828125, + "logps/rejected": -46.94537353515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.023668639053254437, + "grad_norm": 54.65079993565704, + "learning_rate": 1.4705882352941176e-08, + "logits/chosen": -0.3381628394126892, + "logits/rejected": -0.2981947958469391, + "logps/chosen": -38.55506134033203, + "logps/rejected": -47.09852600097656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.03550295857988166, + "grad_norm": 54.205478847015584, + "learning_rate": 2.2058823529411764e-08, + "logits/chosen": -0.47191303968429565, + "logits/rejected": -0.5924246311187744, + "logps/chosen": -36.32940673828125, + "logps/rejected": -37.75663375854492, + "loss": 0.6975, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004663002677261829, + "rewards/margins": 0.020077597349882126, + "rewards/rejected": -0.02474059723317623, + "step": 3 + }, + { + "epoch": 0.047337278106508875, + "grad_norm": 53.90641855823955, + "learning_rate": 2.941176470588235e-08, + "logits/chosen": -0.715237021446228, + "logits/rejected": -0.8035542964935303, + "logps/chosen": -39.215999603271484, + "logps/rejected": -47.370750427246094, + "loss": 0.6839, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.01616680435836315, + "rewards/margins": 0.024854015558958054, + "rewards/rejected": -0.04102082550525665, + "step": 4 + }, + { + "epoch": 0.05917159763313609, + "grad_norm": 52.25558555929606, + "learning_rate": 3.676470588235294e-08, + "logits/chosen": -0.8661510944366455, + "logits/rejected": -0.7564424276351929, + "logps/chosen": -46.795005798339844, + "logps/rejected": -44.85298538208008, + "loss": 0.6964, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.021712014451622963, + "rewards/margins": -0.05135791748762131, + "rewards/rejected": 0.0296458937227726, + "step": 5 + }, + { + "epoch": 0.07100591715976332, + "grad_norm": 61.52227993189212, + "learning_rate": 4.411764705882353e-08, + "logits/chosen": -0.6120268106460571, + "logits/rejected": -0.5849899053573608, + "logps/chosen": -38.418251037597656, + "logps/rejected": -42.02568054199219, + "loss": 0.7025, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.042597055435180664, + "rewards/margins": -0.009472893550992012, + "rewards/rejected": -0.0331241637468338, + "step": 6 + }, + { + "epoch": 0.08284023668639054, + "grad_norm": 54.74581505857575, + "learning_rate": 5.147058823529411e-08, + "logits/chosen": -0.382029265165329, + "logits/rejected": -0.3890838623046875, + "logps/chosen": -38.0916633605957, + "logps/rejected": -48.64350509643555, + "loss": 0.6941, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0039642686024308205, + "rewards/margins": 0.00704039353877306, + "rewards/rejected": -0.01100466400384903, + "step": 7 + }, + { + "epoch": 0.09467455621301775, + "grad_norm": 55.737912792059, + "learning_rate": 5.88235294117647e-08, + "logits/chosen": -0.4821730852127075, + "logits/rejected": -0.497173935174942, + "logps/chosen": -41.562705993652344, + "logps/rejected": -36.803367614746094, + "loss": 0.6874, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.02364335022866726, + "rewards/margins": -0.013885259628295898, + "rewards/rejected": -0.009758088737726212, + "step": 8 + }, + { + "epoch": 0.10650887573964497, + "grad_norm": 57.095682285700036, + "learning_rate": 6.617647058823529e-08, + "logits/chosen": -0.3777148723602295, + "logits/rejected": -0.5628172755241394, + "logps/chosen": -39.834068298339844, + "logps/rejected": -40.3427734375, + "loss": 0.6969, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.025170041248202324, + "rewards/margins": -0.03208901360630989, + "rewards/rejected": 0.006918976083397865, + "step": 9 + }, + { + "epoch": 0.11834319526627218, + "grad_norm": 54.157479939562684, + "learning_rate": 7.352941176470588e-08, + "logits/chosen": -0.7878235578536987, + "logits/rejected": -0.9192472696304321, + "logps/chosen": -42.91743469238281, + "logps/rejected": -41.28839874267578, + "loss": 0.6933, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03524742275476456, + "rewards/margins": 0.016644442453980446, + "rewards/rejected": -0.051891863346099854, + "step": 10 + }, + { + "epoch": 0.1301775147928994, + "grad_norm": 57.291604252818274, + "learning_rate": 8.088235294117647e-08, + "logits/chosen": -0.8743460774421692, + "logits/rejected": -0.8040152788162231, + "logps/chosen": -46.50861358642578, + "logps/rejected": -48.608097076416016, + "loss": 0.6911, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04767334461212158, + "rewards/margins": 0.07948525249958038, + "rewards/rejected": -0.0318119041621685, + "step": 11 + }, + { + "epoch": 0.14201183431952663, + "grad_norm": 54.33703237230041, + "learning_rate": 8.823529411764706e-08, + "logits/chosen": -0.7734875679016113, + "logits/rejected": -0.7457428574562073, + "logps/chosen": -39.402000427246094, + "logps/rejected": -47.64637756347656, + "loss": 0.6986, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00954131968319416, + "rewards/margins": -0.019432254135608673, + "rewards/rejected": 0.009890936315059662, + "step": 12 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 52.16368650444846, + "learning_rate": 9.558823529411763e-08, + "logits/chosen": -0.6576703786849976, + "logits/rejected": -0.6208328008651733, + "logps/chosen": -38.19804000854492, + "logps/rejected": -49.989341735839844, + "loss": 0.6868, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.007916189730167389, + "rewards/margins": 0.02689201757311821, + "rewards/rejected": -0.01897583156824112, + "step": 13 + }, + { + "epoch": 0.16568047337278108, + "grad_norm": 59.87799896888102, + "learning_rate": 1.0294117647058822e-07, + "logits/chosen": -0.4240450859069824, + "logits/rejected": -0.5086762309074402, + "logps/chosen": -31.308774948120117, + "logps/rejected": -37.28623580932617, + "loss": 0.6977, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005666483659297228, + "rewards/margins": -0.00042060669511556625, + "rewards/rejected": -0.005245877429842949, + "step": 14 + }, + { + "epoch": 0.17751479289940827, + "grad_norm": 51.93559977619766, + "learning_rate": 1.1029411764705881e-07, + "logits/chosen": -0.6712979674339294, + "logits/rejected": -0.40396082401275635, + "logps/chosen": -32.445457458496094, + "logps/rejected": -51.878196716308594, + "loss": 0.6975, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0392225980758667, + "rewards/margins": -0.022353485226631165, + "rewards/rejected": -0.016869116574525833, + "step": 15 + }, + { + "epoch": 0.1893491124260355, + "grad_norm": 54.783459080693845, + "learning_rate": 1.176470588235294e-07, + "logits/chosen": -0.5053819417953491, + "logits/rejected": -0.7511165738105774, + "logps/chosen": -39.814659118652344, + "logps/rejected": -38.76427459716797, + "loss": 0.6901, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.004495692439377308, + "rewards/margins": 0.00818572100251913, + "rewards/rejected": -0.01268141157925129, + "step": 16 + }, + { + "epoch": 0.20118343195266272, + "grad_norm": 50.13716138258182, + "learning_rate": 1.25e-07, + "logits/chosen": -0.38788411021232605, + "logits/rejected": -0.5259600877761841, + "logps/chosen": -39.752655029296875, + "logps/rejected": -35.34233856201172, + "loss": 0.6873, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0529358834028244, + "rewards/margins": 0.05223493278026581, + "rewards/rejected": 0.0007009506225585938, + "step": 17 + }, + { + "epoch": 0.21301775147928995, + "grad_norm": 53.56051736985278, + "learning_rate": 1.3235294117647057e-07, + "logits/chosen": -0.6249361038208008, + "logits/rejected": -0.2918693721294403, + "logps/chosen": -37.1358528137207, + "logps/rejected": -54.117340087890625, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01282811164855957, + "rewards/margins": 0.035147905349731445, + "rewards/rejected": -0.047976016998291016, + "step": 18 + }, + { + "epoch": 0.22485207100591717, + "grad_norm": 55.4929889677955, + "learning_rate": 1.3970588235294117e-07, + "logits/chosen": -0.7626937627792358, + "logits/rejected": -0.6267987489700317, + "logps/chosen": -34.65644073486328, + "logps/rejected": -41.60660934448242, + "loss": 0.687, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.019578397274017334, + "rewards/margins": 0.030142582952976227, + "rewards/rejected": -0.010564185678958893, + "step": 19 + }, + { + "epoch": 0.23668639053254437, + "grad_norm": 53.81366773458671, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": -0.9788577556610107, + "logits/rejected": -0.9218689203262329, + "logps/chosen": -35.290775299072266, + "logps/rejected": -40.17462158203125, + "loss": 0.6949, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02336425706744194, + "rewards/margins": -0.008133504539728165, + "rewards/rejected": -0.015230750665068626, + "step": 20 + }, + { + "epoch": 0.2485207100591716, + "grad_norm": 51.609039023069855, + "learning_rate": 1.5441176470588236e-07, + "logits/chosen": -0.7643724083900452, + "logits/rejected": -0.8248336911201477, + "logps/chosen": -39.32087707519531, + "logps/rejected": -44.490169525146484, + "loss": 0.6859, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.023657750338315964, + "rewards/margins": 0.06776070594787598, + "rewards/rejected": -0.04410295560956001, + "step": 21 + }, + { + "epoch": 0.2603550295857988, + "grad_norm": 52.42721385787476, + "learning_rate": 1.6176470588235293e-07, + "logits/chosen": -0.6979169249534607, + "logits/rejected": -0.5965849161148071, + "logps/chosen": -35.5829963684082, + "logps/rejected": -46.63249206542969, + "loss": 0.686, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.017513036727905273, + "rewards/margins": 0.06794863194227219, + "rewards/rejected": -0.05043559521436691, + "step": 22 + }, + { + "epoch": 0.27218934911242604, + "grad_norm": 52.14376613382141, + "learning_rate": 1.6911764705882354e-07, + "logits/chosen": -0.26172494888305664, + "logits/rejected": -0.34854474663734436, + "logps/chosen": -43.53734588623047, + "logps/rejected": -47.87934494018555, + "loss": 0.684, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08613558113574982, + "rewards/margins": -0.0002110544592142105, + "rewards/rejected": -0.08592452853918076, + "step": 23 + }, + { + "epoch": 0.28402366863905326, + "grad_norm": 50.932647643811045, + "learning_rate": 1.764705882352941e-07, + "logits/chosen": -0.5405491590499878, + "logits/rejected": -0.6527445316314697, + "logps/chosen": -41.2894287109375, + "logps/rejected": -42.569549560546875, + "loss": 0.6868, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.017691707238554955, + "rewards/margins": 0.07997651398181915, + "rewards/rejected": -0.09766822308301926, + "step": 24 + }, + { + "epoch": 0.2958579881656805, + "grad_norm": 52.98859914460327, + "learning_rate": 1.8382352941176472e-07, + "logits/chosen": -0.8704635500907898, + "logits/rejected": -0.8379695415496826, + "logps/chosen": -28.815597534179688, + "logps/rejected": -40.756103515625, + "loss": 0.6874, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.02325289323925972, + "rewards/margins": 0.040268998593091965, + "rewards/rejected": -0.06352189183235168, + "step": 25 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 48.75103011196724, + "learning_rate": 1.9117647058823527e-07, + "logits/chosen": -0.7580403089523315, + "logits/rejected": -0.5783815979957581, + "logps/chosen": -37.27444839477539, + "logps/rejected": -50.22969055175781, + "loss": 0.679, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.04801575839519501, + "rewards/margins": 0.08580491691827774, + "rewards/rejected": -0.13382068276405334, + "step": 26 + }, + { + "epoch": 0.31952662721893493, + "grad_norm": 50.933141633314996, + "learning_rate": 1.9852941176470587e-07, + "logits/chosen": -0.5684085488319397, + "logits/rejected": -0.41421839594841003, + "logps/chosen": -29.699729919433594, + "logps/rejected": -45.36609649658203, + "loss": 0.6783, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1085064709186554, + "rewards/margins": -0.0001874007284641266, + "rewards/rejected": -0.10831907391548157, + "step": 27 + }, + { + "epoch": 0.33136094674556216, + "grad_norm": 51.922779020745416, + "learning_rate": 2.0588235294117645e-07, + "logits/chosen": -0.6725336909294128, + "logits/rejected": -0.6964073777198792, + "logps/chosen": -35.67041015625, + "logps/rejected": -41.71068572998047, + "loss": 0.6749, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.059418633580207825, + "rewards/margins": 0.05292558670043945, + "rewards/rejected": -0.11234420537948608, + "step": 28 + }, + { + "epoch": 0.3431952662721893, + "grad_norm": 50.897993802285555, + "learning_rate": 2.1323529411764705e-07, + "logits/chosen": -0.45363789796829224, + "logits/rejected": -0.5877288579940796, + "logps/chosen": -41.74443817138672, + "logps/rejected": -40.014991760253906, + "loss": 0.6687, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09418225288391113, + "rewards/margins": 0.036634661257267, + "rewards/rejected": -0.13081692159175873, + "step": 29 + }, + { + "epoch": 0.35502958579881655, + "grad_norm": 47.17244651587936, + "learning_rate": 2.2058823529411763e-07, + "logits/chosen": -0.2768702805042267, + "logits/rejected": -0.2787284255027771, + "logps/chosen": -42.12376403808594, + "logps/rejected": -51.230594635009766, + "loss": 0.6608, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11093667894601822, + "rewards/margins": 0.1094408631324768, + "rewards/rejected": -0.22037753462791443, + "step": 30 + }, + { + "epoch": 0.3668639053254438, + "grad_norm": 47.9586113592662, + "learning_rate": 2.2794117647058823e-07, + "logits/chosen": -0.4938819110393524, + "logits/rejected": -0.5225380659103394, + "logps/chosen": -32.536563873291016, + "logps/rejected": -38.28359603881836, + "loss": 0.6474, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12956911325454712, + "rewards/margins": 0.13196122646331787, + "rewards/rejected": -0.261530339717865, + "step": 31 + }, + { + "epoch": 0.378698224852071, + "grad_norm": 52.2429961485054, + "learning_rate": 2.352941176470588e-07, + "logits/chosen": -0.7137393951416016, + "logits/rejected": -0.7953929305076599, + "logps/chosen": -34.03690719604492, + "logps/rejected": -42.32886505126953, + "loss": 0.6439, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13849107921123505, + "rewards/margins": 0.14229899644851685, + "rewards/rejected": -0.2807900607585907, + "step": 32 + }, + { + "epoch": 0.3905325443786982, + "grad_norm": 48.070320304458484, + "learning_rate": 2.426470588235294e-07, + "logits/chosen": -0.709818959236145, + "logits/rejected": -0.4917900562286377, + "logps/chosen": -32.813262939453125, + "logps/rejected": -52.584774017333984, + "loss": 0.6496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10162033885717392, + "rewards/margins": 0.21768181025981903, + "rewards/rejected": -0.31930214166641235, + "step": 33 + }, + { + "epoch": 0.40236686390532544, + "grad_norm": 50.501375226344486, + "learning_rate": 2.5e-07, + "logits/chosen": -1.2210612297058105, + "logits/rejected": -1.069713830947876, + "logps/chosen": -28.096343994140625, + "logps/rejected": -42.77171325683594, + "loss": 0.6609, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06562051922082901, + "rewards/margins": 0.1742829531431198, + "rewards/rejected": -0.23990347981452942, + "step": 34 + }, + { + "epoch": 0.41420118343195267, + "grad_norm": 50.18256699637281, + "learning_rate": 2.5735294117647057e-07, + "logits/chosen": -0.5130794048309326, + "logits/rejected": -0.5719175934791565, + "logps/chosen": -37.741825103759766, + "logps/rejected": -45.18247985839844, + "loss": 0.6448, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.18347424268722534, + "rewards/margins": 0.13004478812217712, + "rewards/rejected": -0.31351903080940247, + "step": 35 + }, + { + "epoch": 0.4260355029585799, + "grad_norm": 49.643545907741306, + "learning_rate": 2.6470588235294114e-07, + "logits/chosen": -0.9145612716674805, + "logits/rejected": -0.8272578120231628, + "logps/chosen": -34.39950180053711, + "logps/rejected": -54.22731018066406, + "loss": 0.6389, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22830158472061157, + "rewards/margins": 0.19368687272071838, + "rewards/rejected": -0.42198845744132996, + "step": 36 + }, + { + "epoch": 0.4378698224852071, + "grad_norm": 46.37858153447604, + "learning_rate": 2.720588235294117e-07, + "logits/chosen": -0.8228033781051636, + "logits/rejected": -0.6848031282424927, + "logps/chosen": -30.318634033203125, + "logps/rejected": -42.5218505859375, + "loss": 0.6339, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1820099651813507, + "rewards/margins": 0.23087123036384583, + "rewards/rejected": -0.41288119554519653, + "step": 37 + }, + { + "epoch": 0.44970414201183434, + "grad_norm": 47.68646874364111, + "learning_rate": 2.7941176470588235e-07, + "logits/chosen": -0.41094350814819336, + "logits/rejected": -0.457292765378952, + "logps/chosen": -40.93132019042969, + "logps/rejected": -44.33328628540039, + "loss": 0.6219, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2081282138824463, + "rewards/margins": 0.17258334159851074, + "rewards/rejected": -0.38071155548095703, + "step": 38 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 46.99139408792661, + "learning_rate": 2.8676470588235293e-07, + "logits/chosen": -0.9174866676330566, + "logits/rejected": -0.8664580583572388, + "logps/chosen": -34.22110366821289, + "logps/rejected": -45.25912857055664, + "loss": 0.6077, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24102823436260223, + "rewards/margins": 0.3032039701938629, + "rewards/rejected": -0.544232189655304, + "step": 39 + }, + { + "epoch": 0.47337278106508873, + "grad_norm": 44.92025317341631, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -0.5698567628860474, + "logits/rejected": -0.6562440395355225, + "logps/chosen": -35.16241455078125, + "logps/rejected": -43.87369155883789, + "loss": 0.6074, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3447490930557251, + "rewards/margins": 0.1720276027917862, + "rewards/rejected": -0.5167766809463501, + "step": 40 + }, + { + "epoch": 0.48520710059171596, + "grad_norm": 43.61954903949639, + "learning_rate": 3.014705882352941e-07, + "logits/chosen": -0.9510785341262817, + "logits/rejected": -1.0125081539154053, + "logps/chosen": -43.066551208496094, + "logps/rejected": -50.81156921386719, + "loss": 0.5926, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42604392766952515, + "rewards/margins": 0.2472018599510193, + "rewards/rejected": -0.6732457876205444, + "step": 41 + }, + { + "epoch": 0.4970414201183432, + "grad_norm": 48.79189553867012, + "learning_rate": 3.088235294117647e-07, + "logits/chosen": -1.0401208400726318, + "logits/rejected": -0.9916763305664062, + "logps/chosen": -36.0028076171875, + "logps/rejected": -47.314449310302734, + "loss": 0.6188, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3239697813987732, + "rewards/margins": 0.3483524024486542, + "rewards/rejected": -0.672322154045105, + "step": 42 + }, + { + "epoch": 0.5088757396449705, + "grad_norm": 45.29293394827699, + "learning_rate": 3.161764705882353e-07, + "logits/chosen": -0.6100953817367554, + "logits/rejected": -0.5119140148162842, + "logps/chosen": -36.110836029052734, + "logps/rejected": -55.10944366455078, + "loss": 0.5727, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.42954936623573303, + "rewards/margins": 0.548741340637207, + "rewards/rejected": -0.9782906770706177, + "step": 43 + }, + { + "epoch": 0.5207100591715976, + "grad_norm": 43.75505094492696, + "learning_rate": 3.2352941176470586e-07, + "logits/chosen": -0.5729663372039795, + "logits/rejected": -0.4085710644721985, + "logps/chosen": -37.87840270996094, + "logps/rejected": -50.89206314086914, + "loss": 0.6021, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3693719506263733, + "rewards/margins": 0.343736857175827, + "rewards/rejected": -0.7131087779998779, + "step": 44 + }, + { + "epoch": 0.5325443786982249, + "grad_norm": 46.651471884498214, + "learning_rate": 3.3088235294117644e-07, + "logits/chosen": -0.5654891133308411, + "logits/rejected": -0.689947783946991, + "logps/chosen": -41.97024154663086, + "logps/rejected": -46.030967712402344, + "loss": 0.573, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35547372698783875, + "rewards/margins": 0.4130653738975525, + "rewards/rejected": -0.7685391902923584, + "step": 45 + }, + { + "epoch": 0.5443786982248521, + "grad_norm": 43.74968458385381, + "learning_rate": 3.3823529411764707e-07, + "logits/chosen": -0.6354714632034302, + "logits/rejected": -0.7248853445053101, + "logps/chosen": -34.041595458984375, + "logps/rejected": -39.55751037597656, + "loss": 0.5944, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3268294930458069, + "rewards/margins": 0.2843489646911621, + "rewards/rejected": -0.611178457736969, + "step": 46 + }, + { + "epoch": 0.5562130177514792, + "grad_norm": 44.663170642243635, + "learning_rate": 3.4558823529411765e-07, + "logits/chosen": -0.43914204835891724, + "logits/rejected": -0.41449761390686035, + "logps/chosen": -32.82707595825195, + "logps/rejected": -37.95698547363281, + "loss": 0.5942, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3479520082473755, + "rewards/margins": 0.3092345595359802, + "rewards/rejected": -0.6571865677833557, + "step": 47 + }, + { + "epoch": 0.5680473372781065, + "grad_norm": 43.36900384363229, + "learning_rate": 3.529411764705882e-07, + "logits/chosen": -0.6816117167472839, + "logits/rejected": -0.6095008254051208, + "logps/chosen": -33.6269416809082, + "logps/rejected": -47.18854522705078, + "loss": 0.5445, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.40539008378982544, + "rewards/margins": 0.5039654970169067, + "rewards/rejected": -0.909355640411377, + "step": 48 + }, + { + "epoch": 0.5798816568047337, + "grad_norm": 44.70080178640567, + "learning_rate": 3.602941176470588e-07, + "logits/chosen": -0.49769917130470276, + "logits/rejected": -0.33341550827026367, + "logps/chosen": -37.26922607421875, + "logps/rejected": -45.382469177246094, + "loss": 0.5435, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.41994184255599976, + "rewards/margins": 0.42326265573501587, + "rewards/rejected": -0.8432044982910156, + "step": 49 + }, + { + "epoch": 0.591715976331361, + "grad_norm": 42.832062592121694, + "learning_rate": 3.6764705882352943e-07, + "logits/chosen": -0.6227316856384277, + "logits/rejected": -0.6291834712028503, + "logps/chosen": -37.443336486816406, + "logps/rejected": -56.363712310791016, + "loss": 0.564, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3350003659725189, + "rewards/margins": 0.5508270263671875, + "rewards/rejected": -0.8858274817466736, + "step": 50 + }, + { + "epoch": 0.6035502958579881, + "grad_norm": 40.537726641128835, + "learning_rate": 3.75e-07, + "logits/chosen": -0.5767950415611267, + "logits/rejected": -0.40233951807022095, + "logps/chosen": -34.15302658081055, + "logps/rejected": -41.203514099121094, + "loss": 0.5447, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4783581495285034, + "rewards/margins": 0.36022478342056274, + "rewards/rejected": -0.8385828733444214, + "step": 51 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 37.3320312919307, + "learning_rate": 3.8235294117647053e-07, + "logits/chosen": -0.33203282952308655, + "logits/rejected": -0.39482319355010986, + "logps/chosen": -35.42144012451172, + "logps/rejected": -38.94422149658203, + "loss": 0.52, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3685012459754944, + "rewards/margins": 0.5317042469978333, + "rewards/rejected": -0.9002054929733276, + "step": 52 + }, + { + "epoch": 0.6272189349112426, + "grad_norm": 44.962671519597414, + "learning_rate": 3.8970588235294116e-07, + "logits/chosen": -0.7768542766571045, + "logits/rejected": -0.7662684917449951, + "logps/chosen": -40.90711975097656, + "logps/rejected": -43.82652282714844, + "loss": 0.5597, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5888789892196655, + "rewards/margins": 0.7121487259864807, + "rewards/rejected": -1.3010276556015015, + "step": 53 + }, + { + "epoch": 0.6390532544378699, + "grad_norm": 40.11509893356046, + "learning_rate": 3.9705882352941174e-07, + "logits/chosen": -0.44021129608154297, + "logits/rejected": -0.33102425932884216, + "logps/chosen": -35.29465103149414, + "logps/rejected": -48.99385070800781, + "loss": 0.4633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4893972873687744, + "rewards/margins": 0.4504549503326416, + "rewards/rejected": -0.939852237701416, + "step": 54 + }, + { + "epoch": 0.650887573964497, + "grad_norm": 43.113444451145575, + "learning_rate": 4.044117647058823e-07, + "logits/chosen": -0.546736478805542, + "logits/rejected": -0.6314712762832642, + "logps/chosen": -43.929542541503906, + "logps/rejected": -49.84693908691406, + "loss": 0.5246, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6373113393783569, + "rewards/margins": 0.9987615346908569, + "rewards/rejected": -1.6360729932785034, + "step": 55 + }, + { + "epoch": 0.6627218934911243, + "grad_norm": 44.49441569027861, + "learning_rate": 4.117647058823529e-07, + "logits/chosen": -0.5550782084465027, + "logits/rejected": -0.5997810959815979, + "logps/chosen": -39.631263732910156, + "logps/rejected": -41.65299987792969, + "loss": 0.5692, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5492139458656311, + "rewards/margins": 0.6075623631477356, + "rewards/rejected": -1.1567763090133667, + "step": 56 + }, + { + "epoch": 0.6745562130177515, + "grad_norm": 49.99500479519036, + "learning_rate": 4.191176470588235e-07, + "logits/chosen": -0.7147572040557861, + "logits/rejected": -0.809965193271637, + "logps/chosen": -39.72154998779297, + "logps/rejected": -44.34455871582031, + "loss": 0.5461, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4619465172290802, + "rewards/margins": 0.43853867053985596, + "rewards/rejected": -0.9004851579666138, + "step": 57 + }, + { + "epoch": 0.6863905325443787, + "grad_norm": 41.686424338259535, + "learning_rate": 4.264705882352941e-07, + "logits/chosen": -0.4276542663574219, + "logits/rejected": -0.6359944939613342, + "logps/chosen": -38.81695556640625, + "logps/rejected": -44.787322998046875, + "loss": 0.4877, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31248077750205994, + "rewards/margins": 0.9359371662139893, + "rewards/rejected": -1.248417854309082, + "step": 58 + }, + { + "epoch": 0.6982248520710059, + "grad_norm": 39.36173522050298, + "learning_rate": 4.338235294117647e-07, + "logits/chosen": -0.6473718285560608, + "logits/rejected": -0.7196489572525024, + "logps/chosen": -39.105743408203125, + "logps/rejected": -43.301734924316406, + "loss": 0.4509, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5941877365112305, + "rewards/margins": 0.8893916010856628, + "rewards/rejected": -1.483579397201538, + "step": 59 + }, + { + "epoch": 0.7100591715976331, + "grad_norm": 37.06656262082701, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": -0.4076049029827118, + "logits/rejected": -0.20640525221824646, + "logps/chosen": -32.933773040771484, + "logps/rejected": -50.366722106933594, + "loss": 0.4341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39851048588752747, + "rewards/margins": 0.7430359125137329, + "rewards/rejected": -1.141546368598938, + "step": 60 + }, + { + "epoch": 0.7218934911242604, + "grad_norm": 36.19815173586054, + "learning_rate": 4.485294117647059e-07, + "logits/chosen": -0.7895621061325073, + "logits/rejected": -0.8529470562934875, + "logps/chosen": -41.30886459350586, + "logps/rejected": -52.98210906982422, + "loss": 0.4463, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4878369867801666, + "rewards/margins": 1.4172559976577759, + "rewards/rejected": -1.9050930738449097, + "step": 61 + }, + { + "epoch": 0.7337278106508875, + "grad_norm": 37.646252584686685, + "learning_rate": 4.5588235294117646e-07, + "logits/chosen": -0.46224623918533325, + "logits/rejected": -0.5132017731666565, + "logps/chosen": -35.61504364013672, + "logps/rejected": -45.02849578857422, + "loss": 0.446, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31568989157676697, + "rewards/margins": 1.0938689708709717, + "rewards/rejected": -1.409558892250061, + "step": 62 + }, + { + "epoch": 0.7455621301775148, + "grad_norm": 40.99920932267914, + "learning_rate": 4.6323529411764704e-07, + "logits/chosen": -0.24282173812389374, + "logits/rejected": -0.41654685139656067, + "logps/chosen": -40.87584686279297, + "logps/rejected": -45.074466705322266, + "loss": 0.4864, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5393080711364746, + "rewards/margins": 0.729945182800293, + "rewards/rejected": -1.2692532539367676, + "step": 63 + }, + { + "epoch": 0.757396449704142, + "grad_norm": 39.08132618780929, + "learning_rate": 4.705882352941176e-07, + "logits/chosen": -0.7257353067398071, + "logits/rejected": -0.6613651514053345, + "logps/chosen": -31.330947875976562, + "logps/rejected": -46.061580657958984, + "loss": 0.431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2516774535179138, + "rewards/margins": 1.220921277999878, + "rewards/rejected": -1.472598910331726, + "step": 64 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 41.475907121673465, + "learning_rate": 4.779411764705882e-07, + "logits/chosen": -0.609380841255188, + "logits/rejected": -0.6923696994781494, + "logps/chosen": -42.626522064208984, + "logps/rejected": -47.41869354248047, + "loss": 0.4345, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4935339689254761, + "rewards/margins": 0.8159966468811035, + "rewards/rejected": -1.30953049659729, + "step": 65 + }, + { + "epoch": 0.7810650887573964, + "grad_norm": 40.932537298379174, + "learning_rate": 4.852941176470588e-07, + "logits/chosen": -1.1143670082092285, + "logits/rejected": -1.1686656475067139, + "logps/chosen": -49.556392669677734, + "logps/rejected": -38.85927200317383, + "loss": 0.4568, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46525275707244873, + "rewards/margins": 0.4739494323730469, + "rewards/rejected": -0.9392022490501404, + "step": 66 + }, + { + "epoch": 0.7928994082840237, + "grad_norm": 36.93572313017648, + "learning_rate": 4.926470588235295e-07, + "logits/chosen": -0.659454882144928, + "logits/rejected": -0.7761635184288025, + "logps/chosen": -36.797096252441406, + "logps/rejected": -46.951560974121094, + "loss": 0.4149, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47968119382858276, + "rewards/margins": 1.5291297435760498, + "rewards/rejected": -2.0088109970092773, + "step": 67 + }, + { + "epoch": 0.8047337278106509, + "grad_norm": 44.10212062947347, + "learning_rate": 5e-07, + "logits/chosen": -0.22745545208454132, + "logits/rejected": -0.3337535858154297, + "logps/chosen": -43.72065734863281, + "logps/rejected": -46.58799743652344, + "loss": 0.451, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.11050184071063995, + "rewards/margins": 1.0298746824264526, + "rewards/rejected": -1.1403765678405762, + "step": 68 + }, + { + "epoch": 0.8165680473372781, + "grad_norm": 41.21338166081281, + "learning_rate": 4.999966183013662e-07, + "logits/chosen": -0.5445861220359802, + "logits/rejected": -0.8019598126411438, + "logps/chosen": -41.4810791015625, + "logps/rejected": -36.41514587402344, + "loss": 0.4593, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.505042314529419, + "rewards/margins": 0.4535430669784546, + "rewards/rejected": -0.9585853815078735, + "step": 69 + }, + { + "epoch": 0.8284023668639053, + "grad_norm": 35.095204565269476, + "learning_rate": 4.999864732969518e-07, + "logits/chosen": -0.7983517050743103, + "logits/rejected": -0.7316067814826965, + "logps/chosen": -34.78556442260742, + "logps/rejected": -48.34429931640625, + "loss": 0.409, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.520796537399292, + "rewards/margins": 1.445544958114624, + "rewards/rejected": -1.966341495513916, + "step": 70 + }, + { + "epoch": 0.8402366863905325, + "grad_norm": 40.30265829692534, + "learning_rate": 4.999695652612155e-07, + "logits/chosen": -0.9361473917961121, + "logits/rejected": -1.0400917530059814, + "logps/chosen": -34.00445556640625, + "logps/rejected": -45.20707702636719, + "loss": 0.4569, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12054769694805145, + "rewards/margins": 1.341543197631836, + "rewards/rejected": -1.4620908498764038, + "step": 71 + }, + { + "epoch": 0.8520710059171598, + "grad_norm": 40.415075736728824, + "learning_rate": 4.999458946515807e-07, + "logits/chosen": -0.5303232669830322, + "logits/rejected": -0.6230794191360474, + "logps/chosen": -39.0274658203125, + "logps/rejected": -45.97068786621094, + "loss": 0.4644, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.48386603593826294, + "rewards/margins": 0.9363454580307007, + "rewards/rejected": -1.4202115535736084, + "step": 72 + }, + { + "epoch": 0.863905325443787, + "grad_norm": 38.77891075432772, + "learning_rate": 4.999154621084221e-07, + "logits/chosen": -0.6193030476570129, + "logits/rejected": -0.6146333813667297, + "logps/chosen": -43.54295349121094, + "logps/rejected": -48.63895034790039, + "loss": 0.4151, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.49407535791397095, + "rewards/margins": 1.051137924194336, + "rewards/rejected": -1.5452133417129517, + "step": 73 + }, + { + "epoch": 0.8757396449704142, + "grad_norm": 37.77568612282111, + "learning_rate": 4.998782684550491e-07, + "logits/chosen": -0.36244261264801025, + "logits/rejected": -0.44385403394699097, + "logps/chosen": -42.133628845214844, + "logps/rejected": -43.378028869628906, + "loss": 0.4466, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2831823229789734, + "rewards/margins": 0.7465813159942627, + "rewards/rejected": -1.0297636985778809, + "step": 74 + }, + { + "epoch": 0.8875739644970414, + "grad_norm": 41.66281833701898, + "learning_rate": 4.998343146976837e-07, + "logits/chosen": -0.9405574202537537, + "logits/rejected": -0.9447466135025024, + "logps/chosen": -37.16999053955078, + "logps/rejected": -50.3101921081543, + "loss": 0.4843, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34810084104537964, + "rewards/margins": 1.4797228574752808, + "rewards/rejected": -1.8278236389160156, + "step": 75 + }, + { + "epoch": 0.8994082840236687, + "grad_norm": 41.38364147290228, + "learning_rate": 4.997836020254328e-07, + "logits/chosen": -0.760882556438446, + "logits/rejected": -0.8380026817321777, + "logps/chosen": -32.650291442871094, + "logps/rejected": -44.52061462402344, + "loss": 0.4716, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0004928633570671082, + "rewards/margins": 1.002596378326416, + "rewards/rejected": -1.003089189529419, + "step": 76 + }, + { + "epoch": 0.9112426035502958, + "grad_norm": 37.663962046996495, + "learning_rate": 4.99726131810256e-07, + "logits/chosen": -0.6014434099197388, + "logits/rejected": -0.8074021935462952, + "logps/chosen": -40.66423797607422, + "logps/rejected": -30.966102600097656, + "loss": 0.414, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06758510321378708, + "rewards/margins": 0.7224689722061157, + "rewards/rejected": -0.7900540232658386, + "step": 77 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 35.62116536902, + "learning_rate": 4.996619056069291e-07, + "logits/chosen": -0.8005455732345581, + "logits/rejected": -1.0737569332122803, + "logps/chosen": -47.74522399902344, + "logps/rejected": -40.22245788574219, + "loss": 0.3874, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11318114399909973, + "rewards/margins": 0.9375712275505066, + "rewards/rejected": -0.8243900537490845, + "step": 78 + }, + { + "epoch": 0.9349112426035503, + "grad_norm": 41.14929411833454, + "learning_rate": 4.995909251530013e-07, + "logits/chosen": -0.764075517654419, + "logits/rejected": -0.7951244115829468, + "logps/chosen": -37.42706298828125, + "logps/rejected": -49.54686737060547, + "loss": 0.4372, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2584840655326843, + "rewards/margins": 1.3500514030456543, + "rewards/rejected": -1.6085355281829834, + "step": 79 + }, + { + "epoch": 0.9467455621301775, + "grad_norm": 37.43972651882339, + "learning_rate": 4.995131923687487e-07, + "logits/chosen": -0.6362431049346924, + "logits/rejected": -0.4747117757797241, + "logps/chosen": -37.49502944946289, + "logps/rejected": -55.17448806762695, + "loss": 0.3999, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2551206946372986, + "rewards/margins": 0.9605333209037781, + "rewards/rejected": -1.2156541347503662, + "step": 80 + }, + { + "epoch": 0.9585798816568047, + "grad_norm": 43.94031843678492, + "learning_rate": 4.994287093571221e-07, + "logits/chosen": -0.7600383758544922, + "logits/rejected": -0.537135660648346, + "logps/chosen": -39.631553649902344, + "logps/rejected": -59.48577880859375, + "loss": 0.4709, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.30705738067626953, + "rewards/margins": 1.0295747518539429, + "rewards/rejected": -1.3366321325302124, + "step": 81 + }, + { + "epoch": 0.9704142011834319, + "grad_norm": 43.25344667732382, + "learning_rate": 4.993374784036901e-07, + "logits/chosen": -0.8365252017974854, + "logits/rejected": -0.7885805368423462, + "logps/chosen": -38.859161376953125, + "logps/rejected": -54.36302947998047, + "loss": 0.3922, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3023694157600403, + "rewards/margins": 2.4610185623168945, + "rewards/rejected": -2.763388156890869, + "step": 82 + }, + { + "epoch": 0.9822485207100592, + "grad_norm": 37.685612874220524, + "learning_rate": 4.992395019765775e-07, + "logits/chosen": -0.6076855659484863, + "logits/rejected": -0.6970850229263306, + "logps/chosen": -37.532691955566406, + "logps/rejected": -53.90351486206055, + "loss": 0.3857, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5352523326873779, + "rewards/margins": 1.9930164813995361, + "rewards/rejected": -2.528268814086914, + "step": 83 + }, + { + "epoch": 0.9940828402366864, + "grad_norm": 36.826262132209905, + "learning_rate": 4.991347827263982e-07, + "logits/chosen": -0.8733373880386353, + "logits/rejected": -0.7132407426834106, + "logps/chosen": -33.2780647277832, + "logps/rejected": -47.66956329345703, + "loss": 0.4261, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13416826725006104, + "rewards/margins": 1.7117326259613037, + "rewards/rejected": -1.5775643587112427, + "step": 84 + }, + { + "epoch": 1.0059171597633136, + "grad_norm": 40.77876491907824, + "learning_rate": 4.990233234861839e-07, + "logits/chosen": -0.6671120524406433, + "logits/rejected": -0.961428701877594, + "logps/chosen": -37.66517639160156, + "logps/rejected": -42.085426330566406, + "loss": 0.375, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03282582014799118, + "rewards/margins": 2.404395580291748, + "rewards/rejected": -2.4372215270996094, + "step": 85 + }, + { + "epoch": 1.017751479289941, + "grad_norm": 35.343447731733626, + "learning_rate": 4.989051272713069e-07, + "logits/chosen": -0.9536569118499756, + "logits/rejected": -1.1731913089752197, + "logps/chosen": -35.637535095214844, + "logps/rejected": -42.364036560058594, + "loss": 0.3999, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12619183957576752, + "rewards/margins": 2.5188794136047363, + "rewards/rejected": -2.645071268081665, + "step": 86 + }, + { + "epoch": 1.029585798816568, + "grad_norm": 33.223106342915955, + "learning_rate": 4.987801972793993e-07, + "logits/chosen": -0.7712712287902832, + "logits/rejected": -0.9531198740005493, + "logps/chosen": -45.71641540527344, + "logps/rejected": -58.85322952270508, + "loss": 0.3053, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2449014037847519, + "rewards/margins": 3.286306142807007, + "rewards/rejected": -3.0414042472839355, + "step": 87 + }, + { + "epoch": 1.0414201183431953, + "grad_norm": 35.64530002995125, + "learning_rate": 4.986485368902656e-07, + "logits/chosen": -0.6423381567001343, + "logits/rejected": -0.6168457865715027, + "logps/chosen": -31.16773223876953, + "logps/rejected": -45.484893798828125, + "loss": 0.3375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22324074804782867, + "rewards/margins": 1.2495863437652588, + "rewards/rejected": -1.4728271961212158, + "step": 88 + }, + { + "epoch": 1.0532544378698225, + "grad_norm": 37.20259605794752, + "learning_rate": 4.985101496657918e-07, + "logits/chosen": -1.1634238958358765, + "logits/rejected": -1.103615164756775, + "logps/chosen": -41.237342834472656, + "logps/rejected": -62.546531677246094, + "loss": 0.344, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3175942599773407, + "rewards/margins": 2.2205255031585693, + "rewards/rejected": -2.5381197929382324, + "step": 89 + }, + { + "epoch": 1.0650887573964498, + "grad_norm": 32.598385557495284, + "learning_rate": 4.983650393498489e-07, + "logits/chosen": -0.922864556312561, + "logits/rejected": -0.8289706707000732, + "logps/chosen": -29.635332107543945, + "logps/rejected": -45.355323791503906, + "loss": 0.3229, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7630589008331299, + "rewards/margins": 1.704155683517456, + "rewards/rejected": -2.467214584350586, + "step": 90 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 38.26125209425027, + "learning_rate": 4.982132098681923e-07, + "logits/chosen": -1.0017441511154175, + "logits/rejected": -0.9490557909011841, + "logps/chosen": -40.40355682373047, + "logps/rejected": -53.06212615966797, + "loss": 0.3536, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7460950016975403, + "rewards/margins": 2.146270513534546, + "rewards/rejected": -2.8923654556274414, + "step": 91 + }, + { + "epoch": 1.0887573964497042, + "grad_norm": 35.48709030046864, + "learning_rate": 4.980546653283537e-07, + "logits/chosen": -0.8610115051269531, + "logits/rejected": -0.8497661352157593, + "logps/chosen": -33.734352111816406, + "logps/rejected": -51.21432113647461, + "loss": 0.3493, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08930137753486633, + "rewards/margins": 2.130716562271118, + "rewards/rejected": -2.041415214538574, + "step": 92 + }, + { + "epoch": 1.1005917159763314, + "grad_norm": 33.05555278643355, + "learning_rate": 4.978894100195324e-07, + "logits/chosen": -0.7339059114456177, + "logits/rejected": -0.7007895112037659, + "logps/chosen": -43.019500732421875, + "logps/rejected": -54.64815902709961, + "loss": 0.2908, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5389220118522644, + "rewards/margins": 2.059274673461914, + "rewards/rejected": -2.5981967449188232, + "step": 93 + }, + { + "epoch": 1.1124260355029585, + "grad_norm": 32.64806139806569, + "learning_rate": 4.977174484124775e-07, + "logits/chosen": -0.6110660433769226, + "logits/rejected": -0.6840114593505859, + "logps/chosen": -39.90541458129883, + "logps/rejected": -41.75571060180664, + "loss": 0.322, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3695860803127289, + "rewards/margins": 1.257794737815857, + "rewards/rejected": -1.6273807287216187, + "step": 94 + }, + { + "epoch": 1.1242603550295858, + "grad_norm": 28.73456224275126, + "learning_rate": 4.975387851593676e-07, + "logits/chosen": -1.0169286727905273, + "logits/rejected": -0.9107025861740112, + "logps/chosen": -38.64891052246094, + "logps/rejected": -54.81282043457031, + "loss": 0.3026, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09714408218860626, + "rewards/margins": 1.6703901290893555, + "rewards/rejected": -1.7675341367721558, + "step": 95 + }, + { + "epoch": 1.136094674556213, + "grad_norm": 40.46066004537338, + "learning_rate": 4.97353425093685e-07, + "logits/chosen": -0.5108687877655029, + "logits/rejected": -0.5603746175765991, + "logps/chosen": -38.765052795410156, + "logps/rejected": -44.74797058105469, + "loss": 0.3836, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35121989250183105, + "rewards/margins": 1.2221122980117798, + "rewards/rejected": -1.5733323097229004, + "step": 96 + }, + { + "epoch": 1.1479289940828403, + "grad_norm": 33.115171654161905, + "learning_rate": 4.971613732300848e-07, + "logits/chosen": -0.5614181756973267, + "logits/rejected": -0.44806018471717834, + "logps/chosen": -34.740657806396484, + "logps/rejected": -51.19026184082031, + "loss": 0.3033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37993931770324707, + "rewards/margins": 2.2012622356414795, + "rewards/rejected": -2.5812015533447266, + "step": 97 + }, + { + "epoch": 1.1597633136094674, + "grad_norm": 30.314211745480826, + "learning_rate": 4.96962634764259e-07, + "logits/chosen": -0.6665958762168884, + "logits/rejected": -0.9045260548591614, + "logps/chosen": -50.40956115722656, + "logps/rejected": -52.3349494934082, + "loss": 0.2727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28259751200675964, + "rewards/margins": 2.2699124813079834, + "rewards/rejected": -2.5525100231170654, + "step": 98 + }, + { + "epoch": 1.1715976331360947, + "grad_norm": 35.25579514857924, + "learning_rate": 4.967572150727964e-07, + "logits/chosen": -0.6161059737205505, + "logits/rejected": -0.619467556476593, + "logps/chosen": -41.40134048461914, + "logps/rejected": -47.676292419433594, + "loss": 0.3182, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.002119541168213, + "rewards/margins": 1.5968635082244873, + "rewards/rejected": -2.598982810974121, + "step": 99 + }, + { + "epoch": 1.183431952662722, + "grad_norm": 31.87031408729149, + "learning_rate": 4.965451197130372e-07, + "logits/chosen": -0.6040835380554199, + "logits/rejected": -0.6801787614822388, + "logps/chosen": -35.400917053222656, + "logps/rejected": -46.94811248779297, + "loss": 0.3003, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07005947083234787, + "rewards/margins": 2.207547903060913, + "rewards/rejected": -2.2776074409484863, + "step": 100 + }, + { + "epoch": 1.195266272189349, + "grad_norm": 34.95313563934469, + "learning_rate": 4.963263544229219e-07, + "logits/chosen": -0.7886058688163757, + "logits/rejected": -0.8541610240936279, + "logps/chosen": -37.76707077026367, + "logps/rejected": -52.185367584228516, + "loss": 0.2959, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6315910816192627, + "rewards/margins": 2.7951231002807617, + "rewards/rejected": -3.4267144203186035, + "step": 101 + }, + { + "epoch": 1.2071005917159763, + "grad_norm": 32.38499457275457, + "learning_rate": 4.961009251208367e-07, + "logits/chosen": -0.9285825490951538, + "logits/rejected": -0.9554150700569153, + "logps/chosen": -34.80419921875, + "logps/rejected": -44.33222961425781, + "loss": 0.3286, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12241765856742859, + "rewards/margins": 1.6503454446792603, + "rewards/rejected": -1.7727631330490112, + "step": 102 + }, + { + "epoch": 1.2189349112426036, + "grad_norm": 35.88300586588521, + "learning_rate": 4.958688379054535e-07, + "logits/chosen": -0.717126727104187, + "logits/rejected": -0.5765900611877441, + "logps/chosen": -32.79739761352539, + "logps/rejected": -56.2498893737793, + "loss": 0.338, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.11334749311208725, + "rewards/margins": 2.23618745803833, + "rewards/rejected": -2.3495349884033203, + "step": 103 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 32.32700164175202, + "learning_rate": 4.956300990555643e-07, + "logits/chosen": -0.8154267072677612, + "logits/rejected": -0.9468405246734619, + "logps/chosen": -38.78935623168945, + "logps/rejected": -56.32616424560547, + "loss": 0.2922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04591973125934601, + "rewards/margins": 3.170891761779785, + "rewards/rejected": -3.2168116569519043, + "step": 104 + }, + { + "epoch": 1.242603550295858, + "grad_norm": 31.39200334822418, + "learning_rate": 4.953847150299118e-07, + "logits/chosen": -0.6333639621734619, + "logits/rejected": -0.5471930503845215, + "logps/chosen": -36.49355697631836, + "logps/rejected": -41.39311218261719, + "loss": 0.3301, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8302067518234253, + "rewards/margins": 0.9772886037826538, + "rewards/rejected": -1.8074952363967896, + "step": 105 + }, + { + "epoch": 1.2544378698224852, + "grad_norm": 28.680251994430197, + "learning_rate": 4.951326924670147e-07, + "logits/chosen": -0.7039244771003723, + "logits/rejected": -0.6648294925689697, + "logps/chosen": -34.96596145629883, + "logps/rejected": -54.543174743652344, + "loss": 0.2705, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4160834550857544, + "rewards/margins": 4.053628444671631, + "rewards/rejected": -4.469712257385254, + "step": 106 + }, + { + "epoch": 1.2662721893491125, + "grad_norm": 34.660972787041885, + "learning_rate": 4.948740381849879e-07, + "logits/chosen": -0.436050683259964, + "logits/rejected": -0.49883347749710083, + "logps/chosen": -42.65693664550781, + "logps/rejected": -48.37257766723633, + "loss": 0.2924, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3416898846626282, + "rewards/margins": 2.6665897369384766, + "rewards/rejected": -3.00827956199646, + "step": 107 + }, + { + "epoch": 1.2781065088757395, + "grad_norm": 35.73754376882297, + "learning_rate": 4.94608759181358e-07, + "logits/chosen": -1.3829025030136108, + "logits/rejected": -1.270089864730835, + "logps/chosen": -42.553672790527344, + "logps/rejected": -68.03410339355469, + "loss": 0.3177, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4059150815010071, + "rewards/margins": 2.707810878753662, + "rewards/rejected": -3.1137256622314453, + "step": 108 + }, + { + "epoch": 1.2899408284023668, + "grad_norm": 29.030913535413347, + "learning_rate": 4.943368626328741e-07, + "logits/chosen": -0.7719374895095825, + "logits/rejected": -0.8931103348731995, + "logps/chosen": -37.83287048339844, + "logps/rejected": -55.789669036865234, + "loss": 0.2498, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5285115838050842, + "rewards/margins": 3.6779208183288574, + "rewards/rejected": -4.206432819366455, + "step": 109 + }, + { + "epoch": 1.301775147928994, + "grad_norm": 32.18624732441535, + "learning_rate": 4.940583558953137e-07, + "logits/chosen": -0.8106911778450012, + "logits/rejected": -0.6266003847122192, + "logps/chosen": -27.738155364990234, + "logps/rejected": -53.347442626953125, + "loss": 0.2545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49206268787384033, + "rewards/margins": 3.2540321350097656, + "rewards/rejected": -3.7460951805114746, + "step": 110 + }, + { + "epoch": 1.3136094674556213, + "grad_norm": 29.81498338818304, + "learning_rate": 4.937732465032838e-07, + "logits/chosen": -0.9487054347991943, + "logits/rejected": -0.7686504125595093, + "logps/chosen": -37.48987579345703, + "logps/rejected": -61.35884475708008, + "loss": 0.2985, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.44103536009788513, + "rewards/margins": 2.5024337768554688, + "rewards/rejected": -2.9434690475463867, + "step": 111 + }, + { + "epoch": 1.3254437869822486, + "grad_norm": 31.489716700318738, + "learning_rate": 4.934815421700164e-07, + "logits/chosen": -0.6628305315971375, + "logits/rejected": -0.5205198526382446, + "logps/chosen": -38.25563430786133, + "logps/rejected": -62.12502670288086, + "loss": 0.2841, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37647631764411926, + "rewards/margins": 2.712799549102783, + "rewards/rejected": -3.08927583694458, + "step": 112 + }, + { + "epoch": 1.3372781065088757, + "grad_norm": 26.849776060745437, + "learning_rate": 4.93183250787161e-07, + "logits/chosen": -0.8486281633377075, + "logits/rejected": -0.6799747347831726, + "logps/chosen": -36.62342834472656, + "logps/rejected": -51.13971710205078, + "loss": 0.2563, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19857852160930634, + "rewards/margins": 2.1364779472351074, + "rewards/rejected": -2.3350563049316406, + "step": 113 + }, + { + "epoch": 1.349112426035503, + "grad_norm": 30.82600982658151, + "learning_rate": 4.928783804245699e-07, + "logits/chosen": -0.9438729286193848, + "logits/rejected": -1.0174190998077393, + "logps/chosen": -40.94575119018555, + "logps/rejected": -47.26662826538086, + "loss": 0.2841, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4203495681285858, + "rewards/margins": 2.5113162994384766, + "rewards/rejected": -2.9316658973693848, + "step": 114 + }, + { + "epoch": 1.3609467455621302, + "grad_norm": 37.63814438854822, + "learning_rate": 4.925669393300807e-07, + "logits/chosen": -0.5685192942619324, + "logits/rejected": -0.6837583184242249, + "logps/chosen": -35.69676208496094, + "logps/rejected": -42.57113265991211, + "loss": 0.3434, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2215765118598938, + "rewards/margins": 2.7080235481262207, + "rewards/rejected": -2.9296000003814697, + "step": 115 + }, + { + "epoch": 1.3727810650887573, + "grad_norm": 27.187951013233324, + "learning_rate": 4.922489359292927e-07, + "logits/chosen": -0.8806582689285278, + "logits/rejected": -0.8417138457298279, + "logps/chosen": -48.617950439453125, + "logps/rejected": -63.693763732910156, + "loss": 0.258, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.31812140345573425, + "rewards/margins": 3.5926265716552734, + "rewards/rejected": -3.910747766494751, + "step": 116 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 33.37578095279378, + "learning_rate": 4.919243788253393e-07, + "logits/chosen": -0.6644065380096436, + "logits/rejected": -0.8404392004013062, + "logps/chosen": -34.850852966308594, + "logps/rejected": -44.51880645751953, + "loss": 0.3009, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36010777950286865, + "rewards/margins": 2.36444091796875, + "rewards/rejected": -2.724548816680908, + "step": 117 + }, + { + "epoch": 1.3964497041420119, + "grad_norm": 31.874390557999053, + "learning_rate": 4.915932767986551e-07, + "logits/chosen": -0.3398258686065674, + "logits/rejected": -0.35200411081314087, + "logps/chosen": -29.706180572509766, + "logps/rejected": -36.340633392333984, + "loss": 0.2804, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.17402225732803345, + "rewards/margins": 1.5369799137115479, + "rewards/rejected": -1.3629577159881592, + "step": 118 + }, + { + "epoch": 1.4082840236686391, + "grad_norm": 31.69981967422697, + "learning_rate": 4.912556388067381e-07, + "logits/chosen": -0.8166269063949585, + "logits/rejected": -0.867180347442627, + "logps/chosen": -31.888137817382812, + "logps/rejected": -43.59930419921875, + "loss": 0.2591, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0748639851808548, + "rewards/margins": 1.9631797075271606, + "rewards/rejected": -2.038043737411499, + "step": 119 + }, + { + "epoch": 1.4201183431952662, + "grad_norm": 31.59528234355252, + "learning_rate": 4.909114739839079e-07, + "logits/chosen": -0.6452093124389648, + "logits/rejected": -0.5552669763565063, + "logps/chosen": -29.506380081176758, + "logps/rejected": -50.04014587402344, + "loss": 0.2972, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0014930292963981628, + "rewards/margins": 2.9174089431762695, + "rewards/rejected": -2.9159157276153564, + "step": 120 + }, + { + "epoch": 1.4319526627218935, + "grad_norm": 29.665894281084412, + "learning_rate": 4.90560791641058e-07, + "logits/chosen": -0.9231401085853577, + "logits/rejected": -0.9149960279464722, + "logps/chosen": -39.093299865722656, + "logps/rejected": -58.45146942138672, + "loss": 0.2505, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0397074818611145, + "rewards/margins": 3.4828078746795654, + "rewards/rejected": -3.4431004524230957, + "step": 121 + }, + { + "epoch": 1.4437869822485208, + "grad_norm": 26.842826013555836, + "learning_rate": 4.902036012654048e-07, + "logits/chosen": -0.743898332118988, + "logits/rejected": -0.85213303565979, + "logps/chosen": -37.486961364746094, + "logps/rejected": -45.735897064208984, + "loss": 0.2211, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2370671182870865, + "rewards/margins": 2.456345558166504, + "rewards/rejected": -2.219278335571289, + "step": 122 + }, + { + "epoch": 1.4556213017751478, + "grad_norm": 33.17228544783056, + "learning_rate": 4.898399125202295e-07, + "logits/chosen": -0.6975520849227905, + "logits/rejected": -0.5711613893508911, + "logps/chosen": -34.24563217163086, + "logps/rejected": -54.74011993408203, + "loss": 0.2873, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04336928948760033, + "rewards/margins": 2.063955307006836, + "rewards/rejected": -2.0205860137939453, + "step": 123 + }, + { + "epoch": 1.467455621301775, + "grad_norm": 23.661270990308882, + "learning_rate": 4.894697352446182e-07, + "logits/chosen": -1.0904819965362549, + "logits/rejected": -0.9323499798774719, + "logps/chosen": -32.2510986328125, + "logps/rejected": -49.083892822265625, + "loss": 0.2113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38007357716560364, + "rewards/margins": 2.8477623462677, + "rewards/rejected": -3.2278361320495605, + "step": 124 + }, + { + "epoch": 1.4792899408284024, + "grad_norm": 34.604687836314845, + "learning_rate": 4.890930794531947e-07, + "logits/chosen": -0.8184336423873901, + "logits/rejected": -0.7664436101913452, + "logps/chosen": -33.786197662353516, + "logps/rejected": -51.256500244140625, + "loss": 0.3319, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.057498496025800705, + "rewards/margins": 2.2278025150299072, + "rewards/rejected": -2.2853007316589355, + "step": 125 + }, + { + "epoch": 1.4911242603550297, + "grad_norm": 32.152626338805746, + "learning_rate": 4.887099553358501e-07, + "logits/chosen": -0.557933509349823, + "logits/rejected": -0.46904832124710083, + "logps/chosen": -32.54315948486328, + "logps/rejected": -43.39186477661133, + "loss": 0.2547, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3194792866706848, + "rewards/margins": 2.087386131286621, + "rewards/rejected": -1.767906665802002, + "step": 126 + }, + { + "epoch": 1.502958579881657, + "grad_norm": 27.486858140339642, + "learning_rate": 4.883203732574667e-07, + "logits/chosen": -0.6726119518280029, + "logits/rejected": -0.9127836227416992, + "logps/chosen": -37.70806884765625, + "logps/rejected": -49.78596878051758, + "loss": 0.2222, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04751332104206085, + "rewards/margins": 4.030301094055176, + "rewards/rejected": -4.07781457901001, + "step": 127 + }, + { + "epoch": 1.514792899408284, + "grad_norm": 31.048864646685523, + "learning_rate": 4.879243437576383e-07, + "logits/chosen": -0.7582688927650452, + "logits/rejected": -0.5859317183494568, + "logps/chosen": -36.134395599365234, + "logps/rejected": -54.693885803222656, + "loss": 0.2645, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12503023445606232, + "rewards/margins": 2.837843894958496, + "rewards/rejected": -2.712813377380371, + "step": 128 + }, + { + "epoch": 1.5266272189349113, + "grad_norm": 24.406363199351727, + "learning_rate": 4.875218775503837e-07, + "logits/chosen": -0.8559905290603638, + "logits/rejected": -0.5925788879394531, + "logps/chosen": -30.623798370361328, + "logps/rejected": -54.57696533203125, + "loss": 0.1985, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09095414727926254, + "rewards/margins": 2.6515233516693115, + "rewards/rejected": -2.7424774169921875, + "step": 129 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 29.935002372070645, + "learning_rate": 4.871129855238588e-07, + "logits/chosen": -0.6436349749565125, + "logits/rejected": -0.6043447852134705, + "logps/chosen": -39.27113723754883, + "logps/rejected": -53.498077392578125, + "loss": 0.2637, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1115611121058464, + "rewards/margins": 2.6680796146392822, + "rewards/rejected": -2.7796406745910645, + "step": 130 + }, + { + "epoch": 1.5502958579881656, + "grad_norm": 34.141742145577254, + "learning_rate": 4.866976787400601e-07, + "logits/chosen": -0.6176570653915405, + "logits/rejected": -0.6334538459777832, + "logps/chosen": -32.522979736328125, + "logps/rejected": -41.17702865600586, + "loss": 0.3054, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.09118051826953888, + "rewards/margins": 1.9793744087219238, + "rewards/rejected": -1.8881936073303223, + "step": 131 + }, + { + "epoch": 1.5621301775147929, + "grad_norm": 32.31778466187239, + "learning_rate": 4.862759684345269e-07, + "logits/chosen": -0.8636021614074707, + "logits/rejected": -0.8571298122406006, + "logps/chosen": -27.641830444335938, + "logps/rejected": -47.43977737426758, + "loss": 0.2662, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.15791228413581848, + "rewards/margins": 3.5890026092529297, + "rewards/rejected": -3.746914863586426, + "step": 132 + }, + { + "epoch": 1.5739644970414202, + "grad_norm": 32.025278441483124, + "learning_rate": 4.858478660160363e-07, + "logits/chosen": -0.7859846353530884, + "logits/rejected": -0.855846643447876, + "logps/chosen": -42.2872314453125, + "logps/rejected": -59.02558135986328, + "loss": 0.2718, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20162326097488403, + "rewards/margins": 4.159268856048584, + "rewards/rejected": -4.360892295837402, + "step": 133 + }, + { + "epoch": 1.5857988165680474, + "grad_norm": 31.88463706660638, + "learning_rate": 4.854133830662955e-07, + "logits/chosen": -1.0088858604431152, + "logits/rejected": -1.0345858335494995, + "logps/chosen": -36.01057052612305, + "logps/rejected": -40.89894104003906, + "loss": 0.2697, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.32271209359169006, + "rewards/margins": 1.9244492053985596, + "rewards/rejected": -2.247161388397217, + "step": 134 + }, + { + "epoch": 1.5976331360946747, + "grad_norm": 32.09028994814745, + "learning_rate": 4.849725313396274e-07, + "logits/chosen": -0.8029739856719971, + "logits/rejected": -0.668880820274353, + "logps/chosen": -31.962514877319336, + "logps/rejected": -56.866539001464844, + "loss": 0.2921, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.030226286500692368, + "rewards/margins": 3.690258026123047, + "rewards/rejected": -3.660031795501709, + "step": 135 + }, + { + "epoch": 1.6094674556213018, + "grad_norm": 25.510937214506086, + "learning_rate": 4.845253227626536e-07, + "logits/chosen": -1.067764401435852, + "logits/rejected": -1.126332402229309, + "logps/chosen": -29.248348236083984, + "logps/rejected": -38.563045501708984, + "loss": 0.221, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5202600359916687, + "rewards/margins": 2.1755950450897217, + "rewards/rejected": -2.695855140686035, + "step": 136 + }, + { + "epoch": 1.6213017751479288, + "grad_norm": 35.45937914643292, + "learning_rate": 4.84071769433971e-07, + "logits/chosen": -0.9243749380111694, + "logits/rejected": -1.0414302349090576, + "logps/chosen": -43.77132797241211, + "logps/rejected": -46.58074951171875, + "loss": 0.2904, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.23414787650108337, + "rewards/margins": 2.354924201965332, + "rewards/rejected": -2.5890719890594482, + "step": 137 + }, + { + "epoch": 1.6331360946745561, + "grad_norm": 27.49463867525898, + "learning_rate": 4.836118836238252e-07, + "logits/chosen": -0.8824262619018555, + "logits/rejected": -0.8479146361351013, + "logps/chosen": -38.267608642578125, + "logps/rejected": -54.601646423339844, + "loss": 0.2142, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2675169110298157, + "rewards/margins": 3.23266339302063, + "rewards/rejected": -3.500180244445801, + "step": 138 + }, + { + "epoch": 1.6449704142011834, + "grad_norm": 25.03352720952285, + "learning_rate": 4.831456777737779e-07, + "logits/chosen": -0.87726891040802, + "logits/rejected": -0.6373116374015808, + "logps/chosen": -37.56227111816406, + "logps/rejected": -62.3519287109375, + "loss": 0.2154, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22444042563438416, + "rewards/margins": 2.3676767349243164, + "rewards/rejected": -2.5921175479888916, + "step": 139 + }, + { + "epoch": 1.6568047337278107, + "grad_norm": 30.77350000619414, + "learning_rate": 4.826731644963704e-07, + "logits/chosen": -0.7138141989707947, + "logits/rejected": -0.746364951133728, + "logps/chosen": -56.473121643066406, + "logps/rejected": -67.74131774902344, + "loss": 0.2472, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1767007112503052, + "rewards/margins": 3.261204719543457, + "rewards/rejected": -4.437905311584473, + "step": 140 + }, + { + "epoch": 1.668639053254438, + "grad_norm": 28.0953853311355, + "learning_rate": 4.82194356574783e-07, + "logits/chosen": -1.0583032369613647, + "logits/rejected": -1.1049402952194214, + "logps/chosen": -34.80393981933594, + "logps/rejected": -44.68275451660156, + "loss": 0.2266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5045955181121826, + "rewards/margins": 1.9453740119934082, + "rewards/rejected": -2.4499692916870117, + "step": 141 + }, + { + "epoch": 1.6804733727810652, + "grad_norm": 29.806685142730963, + "learning_rate": 4.817092669624882e-07, + "logits/chosen": -0.9740761518478394, + "logits/rejected": -0.8185986876487732, + "logps/chosen": -39.042179107666016, + "logps/rejected": -60.931121826171875, + "loss": 0.2278, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.528903067111969, + "rewards/margins": 3.3279364109039307, + "rewards/rejected": -3.856839656829834, + "step": 142 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 29.649829246510613, + "learning_rate": 4.812179087829012e-07, + "logits/chosen": -0.6173474788665771, + "logits/rejected": -0.5431764125823975, + "logps/chosen": -27.147438049316406, + "logps/rejected": -43.8935546875, + "loss": 0.2404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24039342999458313, + "rewards/margins": 2.5034737586975098, + "rewards/rejected": -2.7438669204711914, + "step": 143 + }, + { + "epoch": 1.7041420118343196, + "grad_norm": 30.07302253382677, + "learning_rate": 4.807202953290243e-07, + "logits/chosen": -0.523086428642273, + "logits/rejected": -0.5003029108047485, + "logps/chosen": -37.44425582885742, + "logps/rejected": -49.62798309326172, + "loss": 0.2756, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09302498400211334, + "rewards/margins": 1.9907047748565674, + "rewards/rejected": -1.8976799249649048, + "step": 144 + }, + { + "epoch": 1.7159763313609466, + "grad_norm": 30.457439377224762, + "learning_rate": 4.802164400630872e-07, + "logits/chosen": -1.0345890522003174, + "logits/rejected": -1.0042998790740967, + "logps/chosen": -41.94634246826172, + "logps/rejected": -54.282379150390625, + "loss": 0.2416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.555735170841217, + "rewards/margins": 3.464998960494995, + "rewards/rejected": -4.020733833312988, + "step": 145 + }, + { + "epoch": 1.727810650887574, + "grad_norm": 29.365588226716095, + "learning_rate": 4.797063566161834e-07, + "logits/chosen": -0.5482521653175354, + "logits/rejected": -0.8150917291641235, + "logps/chosen": -45.255279541015625, + "logps/rejected": -44.48121643066406, + "loss": 0.2289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004989638924598694, + "rewards/margins": 2.8408713340759277, + "rewards/rejected": -2.845860719680786, + "step": 146 + }, + { + "epoch": 1.7396449704142012, + "grad_norm": 23.051187654817465, + "learning_rate": 4.791900587879009e-07, + "logits/chosen": -1.079697608947754, + "logits/rejected": -1.1642544269561768, + "logps/chosen": -35.86004638671875, + "logps/rejected": -55.203956604003906, + "loss": 0.2049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20117734372615814, + "rewards/margins": 4.757822513580322, + "rewards/rejected": -4.959000110626221, + "step": 147 + }, + { + "epoch": 1.7514792899408285, + "grad_norm": 23.873165296629157, + "learning_rate": 4.786675605459487e-07, + "logits/chosen": -0.6171930432319641, + "logits/rejected": -0.7254853844642639, + "logps/chosen": -37.1898193359375, + "logps/rejected": -58.64811706542969, + "loss": 0.1909, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25659316778182983, + "rewards/margins": 4.113015174865723, + "rewards/rejected": -3.856421947479248, + "step": 148 + }, + { + "epoch": 1.7633136094674557, + "grad_norm": 28.365749172580184, + "learning_rate": 4.781388760257799e-07, + "logits/chosen": -0.6407098174095154, + "logits/rejected": -0.5406571626663208, + "logps/chosen": -32.85944366455078, + "logps/rejected": -45.028846740722656, + "loss": 0.2134, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13824699819087982, + "rewards/margins": 1.9902299642562866, + "rewards/rejected": -1.8519829511642456, + "step": 149 + }, + { + "epoch": 1.7751479289940828, + "grad_norm": 26.001580942778393, + "learning_rate": 4.776040195302079e-07, + "logits/chosen": -0.7967870235443115, + "logits/rejected": -0.925701916217804, + "logps/chosen": -34.21118927001953, + "logps/rejected": -47.52817916870117, + "loss": 0.1779, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07275627553462982, + "rewards/margins": 3.335747003555298, + "rewards/rejected": -3.262990951538086, + "step": 150 + }, + { + "epoch": 1.78698224852071, + "grad_norm": 31.836026789786995, + "learning_rate": 4.770630055290208e-07, + "logits/chosen": -1.0263866186141968, + "logits/rejected": -0.8518810868263245, + "logps/chosen": -40.71949768066406, + "logps/rejected": -57.82533264160156, + "loss": 0.2399, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18406735360622406, + "rewards/margins": 2.9664227962493896, + "rewards/rejected": -3.1504902839660645, + "step": 151 + }, + { + "epoch": 1.7988165680473371, + "grad_norm": 28.63311269352365, + "learning_rate": 4.76515848658589e-07, + "logits/chosen": -0.5594848394393921, + "logits/rejected": -0.8863150477409363, + "logps/chosen": -45.70458221435547, + "logps/rejected": -38.8409538269043, + "loss": 0.2137, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2182983160018921, + "rewards/margins": 2.2750673294067383, + "rewards/rejected": -2.4933652877807617, + "step": 152 + }, + { + "epoch": 1.8106508875739644, + "grad_norm": 34.34513187647427, + "learning_rate": 4.759625637214696e-07, + "logits/chosen": -0.7560346126556396, + "logits/rejected": -0.8882582187652588, + "logps/chosen": -30.11992073059082, + "logps/rejected": -39.5860595703125, + "loss": 0.2751, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3838905096054077, + "rewards/margins": 1.65618097782135, + "rewards/rejected": -2.040071487426758, + "step": 153 + }, + { + "epoch": 1.8224852071005917, + "grad_norm": 32.10962096929911, + "learning_rate": 4.754031656860059e-07, + "logits/chosen": -0.8416492342948914, + "logits/rejected": -0.6819140911102295, + "logps/chosen": -33.661521911621094, + "logps/rejected": -44.78226852416992, + "loss": 0.2833, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.014047026634216309, + "rewards/margins": 2.2703661918640137, + "rewards/rejected": -2.256319046020508, + "step": 154 + }, + { + "epoch": 1.834319526627219, + "grad_norm": 30.000217414301275, + "learning_rate": 4.748376696859226e-07, + "logits/chosen": -0.9286520481109619, + "logits/rejected": -0.9778493642807007, + "logps/chosen": -41.20292663574219, + "logps/rejected": -48.67192077636719, + "loss": 0.2596, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4870182275772095, + "rewards/margins": 2.4510464668273926, + "rewards/rejected": -1.9640284776687622, + "step": 155 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 32.4118395869003, + "learning_rate": 4.74266091019916e-07, + "logits/chosen": -0.8907891511917114, + "logits/rejected": -0.9750989079475403, + "logps/chosen": -45.65682601928711, + "logps/rejected": -50.134220123291016, + "loss": 0.2678, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07820607721805573, + "rewards/margins": 3.1200551986694336, + "rewards/rejected": -3.198261260986328, + "step": 156 + }, + { + "epoch": 1.8579881656804735, + "grad_norm": 28.928479525804008, + "learning_rate": 4.7368844515124046e-07, + "logits/chosen": -0.6818917393684387, + "logits/rejected": -0.8876403570175171, + "logps/chosen": -38.4929084777832, + "logps/rejected": -44.042869567871094, + "loss": 0.2156, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11902601271867752, + "rewards/margins": 3.1182360649108887, + "rewards/rejected": -2.9992103576660156, + "step": 157 + }, + { + "epoch": 1.8698224852071006, + "grad_norm": 29.65958384914858, + "learning_rate": 4.7310474770728996e-07, + "logits/chosen": -0.6357210278511047, + "logits/rejected": -0.5424870252609253, + "logps/chosen": -37.36800765991211, + "logps/rejected": -58.64324951171875, + "loss": 0.2376, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6739649176597595, + "rewards/margins": 3.992888927459717, + "rewards/rejected": -4.666853904724121, + "step": 158 + }, + { + "epoch": 1.8816568047337277, + "grad_norm": 34.26684990746155, + "learning_rate": 4.725150144791753e-07, + "logits/chosen": -0.46081531047821045, + "logits/rejected": -0.5821120142936707, + "logps/chosen": -37.67076110839844, + "logps/rejected": -41.890037536621094, + "loss": 0.3107, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4266676902770996, + "rewards/margins": 1.9253289699554443, + "rewards/rejected": -2.351996660232544, + "step": 159 + }, + { + "epoch": 1.893491124260355, + "grad_norm": 35.05902021201059, + "learning_rate": 4.719192614212969e-07, + "logits/chosen": -0.6496328711509705, + "logits/rejected": -0.5829223990440369, + "logps/chosen": -36.085716247558594, + "logps/rejected": -51.334327697753906, + "loss": 0.2594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33468520641326904, + "rewards/margins": 2.861034631729126, + "rewards/rejected": -3.1957197189331055, + "step": 160 + }, + { + "epoch": 1.9053254437869822, + "grad_norm": 30.04841620365748, + "learning_rate": 4.713175046509131e-07, + "logits/chosen": -0.9227581024169922, + "logits/rejected": -0.8245525360107422, + "logps/chosen": -39.57933807373047, + "logps/rejected": -66.19513702392578, + "loss": 0.23, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03423337638378143, + "rewards/margins": 5.112496376037598, + "rewards/rejected": -5.146729469299316, + "step": 161 + }, + { + "epoch": 1.9171597633136095, + "grad_norm": 31.04728022665797, + "learning_rate": 4.707097604477045e-07, + "logits/chosen": -1.0367375612258911, + "logits/rejected": -1.0377088785171509, + "logps/chosen": -46.939781188964844, + "logps/rejected": -56.965232849121094, + "loss": 0.2494, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.24659229815006256, + "rewards/margins": 2.464108467102051, + "rewards/rejected": -2.710700750350952, + "step": 162 + }, + { + "epoch": 1.9289940828402368, + "grad_norm": 28.85191727468816, + "learning_rate": 4.700960452533328e-07, + "logits/chosen": -0.5171566605567932, + "logits/rejected": -0.5847671627998352, + "logps/chosen": -41.256561279296875, + "logps/rejected": -48.218727111816406, + "loss": 0.2072, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.48802250623703003, + "rewards/margins": 2.6758077144622803, + "rewards/rejected": -3.163830280303955, + "step": 163 + }, + { + "epoch": 1.940828402366864, + "grad_norm": 34.99511678470972, + "learning_rate": 4.694763756709967e-07, + "logits/chosen": -0.9630662202835083, + "logits/rejected": -1.0143767595291138, + "logps/chosen": -33.528255462646484, + "logps/rejected": -41.041446685791016, + "loss": 0.2857, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.00032395869493484497, + "rewards/margins": 2.951721429824829, + "rewards/rejected": -2.952045440673828, + "step": 164 + }, + { + "epoch": 1.952662721893491, + "grad_norm": 32.6916182628487, + "learning_rate": 4.688507684649825e-07, + "logits/chosen": -0.5867289304733276, + "logits/rejected": -0.6317815184593201, + "logps/chosen": -34.935386657714844, + "logps/rejected": -50.50403594970703, + "loss": 0.299, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3650425374507904, + "rewards/margins": 3.305384635925293, + "rewards/rejected": -2.9403419494628906, + "step": 165 + }, + { + "epoch": 1.9644970414201184, + "grad_norm": 25.572043392572215, + "learning_rate": 4.6821924056021053e-07, + "logits/chosen": -0.8966530561447144, + "logits/rejected": -0.9081934094429016, + "logps/chosen": -39.484596252441406, + "logps/rejected": -59.22516632080078, + "loss": 0.2167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013990327715873718, + "rewards/margins": 4.235882759094238, + "rewards/rejected": -4.237281799316406, + "step": 166 + }, + { + "epoch": 1.9763313609467454, + "grad_norm": 33.32850848147448, + "learning_rate": 4.6758180904177715e-07, + "logits/chosen": -0.9043580293655396, + "logits/rejected": -0.9197327494621277, + "logps/chosen": -30.749338150024414, + "logps/rejected": -46.336402893066406, + "loss": 0.2978, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.061946846544742584, + "rewards/margins": 3.1583476066589355, + "rewards/rejected": -3.0964009761810303, + "step": 167 + }, + { + "epoch": 1.9881656804733727, + "grad_norm": 30.12339676116848, + "learning_rate": 4.669384911544926e-07, + "logits/chosen": -0.7334628701210022, + "logits/rejected": -0.6194922924041748, + "logps/chosen": -33.2913818359375, + "logps/rejected": -58.153072357177734, + "loss": 0.2359, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35717520117759705, + "rewards/margins": 3.8730297088623047, + "rewards/rejected": -4.230205059051514, + "step": 168 + }, + { + "epoch": 2.0, + "grad_norm": 21.4554135691754, + "learning_rate": 4.6628930430241495e-07, + "logits/chosen": -0.5781211853027344, + "logits/rejected": -0.6272490620613098, + "logps/chosen": -33.10559844970703, + "logps/rejected": -47.765830993652344, + "loss": 0.1717, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07726180553436279, + "rewards/margins": 3.996021270751953, + "rewards/rejected": -4.0732831954956055, + "step": 169 + }, + { + "epoch": 2.0118343195266273, + "grad_norm": 17.804389758687186, + "learning_rate": 4.6563426604837817e-07, + "logits/chosen": -0.7812290787696838, + "logits/rejected": -0.6464096307754517, + "logps/chosen": -37.48862838745117, + "logps/rejected": -57.36015319824219, + "loss": 0.1516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08392468094825745, + "rewards/margins": 3.2438626289367676, + "rewards/rejected": -3.327787160873413, + "step": 170 + }, + { + "epoch": 2.0236686390532546, + "grad_norm": 16.40347419407978, + "learning_rate": 4.649733941135183e-07, + "logits/chosen": -0.9235811233520508, + "logits/rejected": -0.966956615447998, + "logps/chosen": -46.7747802734375, + "logps/rejected": -61.03520584106445, + "loss": 0.1229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24425256252288818, + "rewards/margins": 3.003178834915161, + "rewards/rejected": -3.2474312782287598, + "step": 171 + }, + { + "epoch": 2.035502958579882, + "grad_norm": 17.315280346491143, + "learning_rate": 4.6430670637679294e-07, + "logits/chosen": -0.8347846865653992, + "logits/rejected": -0.8632474541664124, + "logps/chosen": -40.783226013183594, + "logps/rejected": -54.76463317871094, + "loss": 0.1394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18245816230773926, + "rewards/margins": 3.924067497253418, + "rewards/rejected": -4.106525421142578, + "step": 172 + }, + { + "epoch": 2.0473372781065087, + "grad_norm": 17.457800428549817, + "learning_rate": 4.636342208744981e-07, + "logits/chosen": -0.6723726987838745, + "logits/rejected": -0.7173234820365906, + "logps/chosen": -28.166179656982422, + "logps/rejected": -44.342166900634766, + "loss": 0.1436, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3225460350513458, + "rewards/margins": 3.5667474269866943, + "rewards/rejected": -3.244201421737671, + "step": 173 + }, + { + "epoch": 2.059171597633136, + "grad_norm": 15.061594900995148, + "learning_rate": 4.629559557997804e-07, + "logits/chosen": -0.6233261823654175, + "logits/rejected": -0.7046049237251282, + "logps/chosen": -51.49015426635742, + "logps/rejected": -72.83427429199219, + "loss": 0.1249, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3748975396156311, + "rewards/margins": 4.756686210632324, + "rewards/rejected": -5.1315836906433105, + "step": 174 + }, + { + "epoch": 2.0710059171597632, + "grad_norm": 15.61277777512069, + "learning_rate": 4.6227192950214435e-07, + "logits/chosen": -0.8263496160507202, + "logits/rejected": -0.7221701145172119, + "logps/chosen": -37.889495849609375, + "logps/rejected": -54.355918884277344, + "loss": 0.1236, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1404503583908081, + "rewards/margins": 3.4195923805236816, + "rewards/rejected": -3.279142379760742, + "step": 175 + }, + { + "epoch": 2.0828402366863905, + "grad_norm": 15.2773228213431, + "learning_rate": 4.615821604869563e-07, + "logits/chosen": -0.9737514853477478, + "logits/rejected": -0.8237433433532715, + "logps/chosen": -39.199546813964844, + "logps/rejected": -62.34404754638672, + "loss": 0.111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.041255027055740356, + "rewards/margins": 3.107170581817627, + "rewards/rejected": -3.148425579071045, + "step": 176 + }, + { + "epoch": 2.094674556213018, + "grad_norm": 15.829900533787184, + "learning_rate": 4.6088666741494384e-07, + "logits/chosen": -1.1444345712661743, + "logits/rejected": -1.1528784036636353, + "logps/chosen": -38.0962028503418, + "logps/rejected": -62.006046295166016, + "loss": 0.1109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7900293469429016, + "rewards/margins": 4.543603420257568, + "rewards/rejected": -5.3336334228515625, + "step": 177 + }, + { + "epoch": 2.106508875739645, + "grad_norm": 16.116982532695665, + "learning_rate": 4.6018546910169067e-07, + "logits/chosen": -1.0515778064727783, + "logits/rejected": -0.9833648800849915, + "logps/chosen": -40.967002868652344, + "logps/rejected": -65.30772399902344, + "loss": 0.1273, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.15716314315795898, + "rewards/margins": 3.4091856479644775, + "rewards/rejected": -3.5663485527038574, + "step": 178 + }, + { + "epoch": 2.1183431952662723, + "grad_norm": 15.361490349201445, + "learning_rate": 4.5947858451712773e-07, + "logits/chosen": -0.819975733757019, + "logits/rejected": -0.8770939111709595, + "logps/chosen": -33.2042236328125, + "logps/rejected": -50.4263916015625, + "loss": 0.1302, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4746937155723572, + "rewards/margins": 4.132750511169434, + "rewards/rejected": -3.6580564975738525, + "step": 179 + }, + { + "epoch": 2.1301775147928996, + "grad_norm": 14.5720865184576, + "learning_rate": 4.5876603278502027e-07, + "logits/chosen": -0.7717497944831848, + "logits/rejected": -0.7804837822914124, + "logps/chosen": -43.37432861328125, + "logps/rejected": -59.71336364746094, + "loss": 0.1184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3754761517047882, + "rewards/margins": 3.8695340156555176, + "rewards/rejected": -4.2450103759765625, + "step": 180 + }, + { + "epoch": 2.1420118343195265, + "grad_norm": 17.24169816378187, + "learning_rate": 4.580478331824498e-07, + "logits/chosen": -0.7291906476020813, + "logits/rejected": -0.7426820993423462, + "logps/chosen": -35.4974365234375, + "logps/rejected": -49.904014587402344, + "loss": 0.1635, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2558782994747162, + "rewards/margins": 2.3701655864715576, + "rewards/rejected": -2.1142873764038086, + "step": 181 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 17.93219012330719, + "learning_rate": 4.573240051392935e-07, + "logits/chosen": -0.6299488544464111, + "logits/rejected": -0.5285680294036865, + "logps/chosen": -27.371360778808594, + "logps/rejected": -41.3109245300293, + "loss": 0.1323, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1316337138414383, + "rewards/margins": 2.6835739612579346, + "rewards/rejected": -2.8152074813842773, + "step": 182 + }, + { + "epoch": 2.165680473372781, + "grad_norm": 19.89980345891806, + "learning_rate": 4.565945682376977e-07, + "logits/chosen": -0.9120879173278809, + "logits/rejected": -0.6737322211265564, + "logps/chosen": -43.5380859375, + "logps/rejected": -75.94633483886719, + "loss": 0.1551, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.55577552318573, + "rewards/margins": 4.484833717346191, + "rewards/rejected": -5.040609359741211, + "step": 183 + }, + { + "epoch": 2.1775147928994083, + "grad_norm": 16.09665942372561, + "learning_rate": 4.5585954221154853e-07, + "logits/chosen": -0.8184518814086914, + "logits/rejected": -0.7758468985557556, + "logps/chosen": -30.036758422851562, + "logps/rejected": -45.972900390625, + "loss": 0.1244, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20884810388088226, + "rewards/margins": 3.8934361934661865, + "rewards/rejected": -3.6845881938934326, + "step": 184 + }, + { + "epoch": 2.1893491124260356, + "grad_norm": 16.783451535199216, + "learning_rate": 4.551189469459382e-07, + "logits/chosen": -0.9438289403915405, + "logits/rejected": -0.7987072467803955, + "logps/chosen": -31.17508888244629, + "logps/rejected": -50.338836669921875, + "loss": 0.1361, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023317746818065643, + "rewards/margins": 2.8164429664611816, + "rewards/rejected": -2.8397610187530518, + "step": 185 + }, + { + "epoch": 2.201183431952663, + "grad_norm": 20.143036888697704, + "learning_rate": 4.5437280247662646e-07, + "logits/chosen": -0.7536525130271912, + "logits/rejected": -0.855215311050415, + "logps/chosen": -54.403778076171875, + "logps/rejected": -66.35308074951172, + "loss": 0.1466, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0882877111434937, + "rewards/margins": 4.7153778076171875, + "rewards/rejected": -5.803666114807129, + "step": 186 + }, + { + "epoch": 2.21301775147929, + "grad_norm": 14.808692042518373, + "learning_rate": 4.5362112898949947e-07, + "logits/chosen": -0.48705536127090454, + "logits/rejected": -0.572258710861206, + "logps/chosen": -37.71576690673828, + "logps/rejected": -45.96680450439453, + "loss": 0.1157, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4106942117214203, + "rewards/margins": 4.168295860290527, + "rewards/rejected": -4.5789899826049805, + "step": 187 + }, + { + "epoch": 2.224852071005917, + "grad_norm": 18.376017289584023, + "learning_rate": 4.528639468200226e-07, + "logits/chosen": -0.9109969735145569, + "logits/rejected": -0.9377925992012024, + "logps/chosen": -35.95753479003906, + "logps/rejected": -53.1741943359375, + "loss": 0.1315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26368990540504456, + "rewards/margins": 4.901925563812256, + "rewards/rejected": -5.165615558624268, + "step": 188 + }, + { + "epoch": 2.2366863905325443, + "grad_norm": 17.965404516840803, + "learning_rate": 4.5210127645269125e-07, + "logits/chosen": -0.7325922846794128, + "logits/rejected": -0.7497273683547974, + "logps/chosen": -32.5103759765625, + "logps/rejected": -51.267616271972656, + "loss": 0.1185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3522591292858124, + "rewards/margins": 4.243406295776367, + "rewards/rejected": -4.595664978027344, + "step": 189 + }, + { + "epoch": 2.2485207100591715, + "grad_norm": 16.570545177365638, + "learning_rate": 4.5133313852047613e-07, + "logits/chosen": -0.351344496011734, + "logits/rejected": -0.3544999659061432, + "logps/chosen": -30.0961971282959, + "logps/rejected": -47.018310546875, + "loss": 0.1119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17489728331565857, + "rewards/margins": 3.530579090118408, + "rewards/rejected": -3.7054765224456787, + "step": 190 + }, + { + "epoch": 2.260355029585799, + "grad_norm": 15.819997196077969, + "learning_rate": 4.5055955380426514e-07, + "logits/chosen": -0.6617011427879333, + "logits/rejected": -0.6919267773628235, + "logps/chosen": -31.14508819580078, + "logps/rejected": -49.937660217285156, + "loss": 0.1371, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.010032139718532562, + "rewards/margins": 4.074734687805176, + "rewards/rejected": -4.06470251083374, + "step": 191 + }, + { + "epoch": 2.272189349112426, + "grad_norm": 14.990393051749372, + "learning_rate": 4.4978054323230144e-07, + "logits/chosen": -0.8150711059570312, + "logits/rejected": -0.8408107757568359, + "logps/chosen": -27.763072967529297, + "logps/rejected": -43.43778991699219, + "loss": 0.1182, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2238546460866928, + "rewards/margins": 3.774090051651001, + "rewards/rejected": -3.5502355098724365, + "step": 192 + }, + { + "epoch": 2.2840236686390534, + "grad_norm": 16.48968206240205, + "learning_rate": 4.489961278796167e-07, + "logits/chosen": -1.1060757637023926, + "logits/rejected": -0.9984903335571289, + "logps/chosen": -39.745208740234375, + "logps/rejected": -61.31498718261719, + "loss": 0.1238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17969991266727448, + "rewards/margins": 4.915201663970947, + "rewards/rejected": -5.0949015617370605, + "step": 193 + }, + { + "epoch": 2.2958579881656807, + "grad_norm": 19.190560048976156, + "learning_rate": 4.482063289674618e-07, + "logits/chosen": -0.8471282720565796, + "logits/rejected": -0.7247289419174194, + "logps/chosen": -32.47812271118164, + "logps/rejected": -54.26726150512695, + "loss": 0.1459, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25966179370880127, + "rewards/margins": 3.339050531387329, + "rewards/rejected": -3.0793888568878174, + "step": 194 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 14.39242536769752, + "learning_rate": 4.4741116786273176e-07, + "logits/chosen": -0.8266146779060364, + "logits/rejected": -0.815448522567749, + "logps/chosen": -34.32430648803711, + "logps/rejected": -54.35994338989258, + "loss": 0.1023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08938950300216675, + "rewards/margins": 4.4216461181640625, + "rewards/rejected": -4.511035919189453, + "step": 195 + }, + { + "epoch": 2.3195266272189348, + "grad_norm": 15.55346828770515, + "learning_rate": 4.466106660773884e-07, + "logits/chosen": -0.7416298389434814, + "logits/rejected": -0.7652521133422852, + "logps/chosen": -38.31879425048828, + "logps/rejected": -59.721343994140625, + "loss": 0.103, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3820110857486725, + "rewards/margins": 4.027256488800049, + "rewards/rejected": -3.645245313644409, + "step": 196 + }, + { + "epoch": 2.331360946745562, + "grad_norm": 20.33506239450445, + "learning_rate": 4.4580484526787807e-07, + "logits/chosen": -0.9200114011764526, + "logits/rejected": -0.8991610407829285, + "logps/chosen": -34.80711364746094, + "logps/rejected": -54.14820861816406, + "loss": 0.1412, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4271819293498993, + "rewards/margins": 4.234493732452393, + "rewards/rejected": -4.661675930023193, + "step": 197 + }, + { + "epoch": 2.3431952662721893, + "grad_norm": 16.966597309776112, + "learning_rate": 4.44993727234546e-07, + "logits/chosen": -0.9545549154281616, + "logits/rejected": -0.9994832277297974, + "logps/chosen": -41.37195587158203, + "logps/rejected": -53.36943817138672, + "loss": 0.1497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6568608283996582, + "rewards/margins": 4.235930442810059, + "rewards/rejected": -4.892791748046875, + "step": 198 + }, + { + "epoch": 2.3550295857988166, + "grad_norm": 16.008353794022174, + "learning_rate": 4.4417733392104585e-07, + "logits/chosen": -0.9546022415161133, + "logits/rejected": -1.0305328369140625, + "logps/chosen": -37.549339294433594, + "logps/rejected": -52.69127655029297, + "loss": 0.1028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14469043910503387, + "rewards/margins": 4.475038528442383, + "rewards/rejected": -4.619729042053223, + "step": 199 + }, + { + "epoch": 2.366863905325444, + "grad_norm": 12.501500717461248, + "learning_rate": 4.4335568741374695e-07, + "logits/chosen": -0.9863907098770142, + "logits/rejected": -0.9025635123252869, + "logps/chosen": -31.974454879760742, + "logps/rejected": -47.34270477294922, + "loss": 0.0902, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17588046193122864, + "rewards/margins": 4.036709308624268, + "rewards/rejected": -4.212589740753174, + "step": 200 + }, + { + "epoch": 2.378698224852071, + "grad_norm": 12.198066199001053, + "learning_rate": 4.425288099411364e-07, + "logits/chosen": -1.135861873626709, + "logits/rejected": -1.1184909343719482, + "logps/chosen": -42.55821228027344, + "logps/rejected": -58.36648178100586, + "loss": 0.077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1261935532093048, + "rewards/margins": 6.448820114135742, + "rewards/rejected": -6.575014591217041, + "step": 201 + }, + { + "epoch": 2.390532544378698, + "grad_norm": 18.343824652909003, + "learning_rate": 4.4169672387321735e-07, + "logits/chosen": -0.7927027344703674, + "logits/rejected": -0.8605716228485107, + "logps/chosen": -44.98955535888672, + "logps/rejected": -52.32347106933594, + "loss": 0.1207, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08800630271434784, + "rewards/margins": 4.026900768280029, + "rewards/rejected": -4.1149067878723145, + "step": 202 + }, + { + "epoch": 2.4023668639053253, + "grad_norm": 14.754318186294096, + "learning_rate": 4.408594517209045e-07, + "logits/chosen": -0.9587286710739136, + "logits/rejected": -0.9846871495246887, + "logps/chosen": -34.01704025268555, + "logps/rejected": -53.6927375793457, + "loss": 0.108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4293210208415985, + "rewards/margins": 5.7534356117248535, + "rewards/rejected": -6.182756423950195, + "step": 203 + }, + { + "epoch": 2.4142011834319526, + "grad_norm": 16.259714501612795, + "learning_rate": 4.4001701613541454e-07, + "logits/chosen": -0.8172876834869385, + "logits/rejected": -1.0052968263626099, + "logps/chosen": -50.09556198120117, + "logps/rejected": -48.03728103637695, + "loss": 0.1187, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.36984017491340637, + "rewards/margins": 3.9901065826416016, + "rewards/rejected": -4.3599467277526855, + "step": 204 + }, + { + "epoch": 2.42603550295858, + "grad_norm": 19.017744132865513, + "learning_rate": 4.391694399076536e-07, + "logits/chosen": -0.9216486215591431, + "logits/rejected": -0.8954623937606812, + "logps/chosen": -29.45151710510254, + "logps/rejected": -53.07656478881836, + "loss": 0.131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30826300382614136, + "rewards/margins": 5.172004699707031, + "rewards/rejected": -5.4802680015563965, + "step": 205 + }, + { + "epoch": 2.437869822485207, + "grad_norm": 16.95344970048546, + "learning_rate": 4.383167459676008e-07, + "logits/chosen": -0.8062804341316223, + "logits/rejected": -0.5907716155052185, + "logps/chosen": -37.306583404541016, + "logps/rejected": -65.82327270507812, + "loss": 0.1077, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.03246502950787544, + "rewards/margins": 3.6565213203430176, + "rewards/rejected": -3.624056100845337, + "step": 206 + }, + { + "epoch": 2.4497041420118344, + "grad_norm": 14.56061263170088, + "learning_rate": 4.374589573836874e-07, + "logits/chosen": -0.9072690606117249, + "logits/rejected": -1.0106024742126465, + "logps/chosen": -39.9183349609375, + "logps/rejected": -51.78164291381836, + "loss": 0.0865, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8933297395706177, + "rewards/margins": 4.566455841064453, + "rewards/rejected": -5.459786415100098, + "step": 207 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 14.148619010746572, + "learning_rate": 4.365960973621734e-07, + "logits/chosen": -0.9763280749320984, + "logits/rejected": -1.2256582975387573, + "logps/chosen": -43.78369903564453, + "logps/rejected": -54.03715896606445, + "loss": 0.1012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2019457072019577, + "rewards/margins": 4.386034965515137, + "rewards/rejected": -4.587980270385742, + "step": 208 + }, + { + "epoch": 2.4733727810650885, + "grad_norm": 11.952569068353256, + "learning_rate": 4.357281892465191e-07, + "logits/chosen": -0.850165069103241, + "logits/rejected": -0.9085839986801147, + "logps/chosen": -30.494304656982422, + "logps/rejected": -45.906429290771484, + "loss": 0.0831, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06118796020746231, + "rewards/margins": 4.031259536743164, + "rewards/rejected": -3.970071792602539, + "step": 209 + }, + { + "epoch": 2.485207100591716, + "grad_norm": 14.94650375178369, + "learning_rate": 4.348552565167542e-07, + "logits/chosen": -0.9337579011917114, + "logits/rejected": -0.8570343255996704, + "logps/chosen": -37.36121368408203, + "logps/rejected": -63.51266098022461, + "loss": 0.1052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1832229197025299, + "rewards/margins": 5.160886287689209, + "rewards/rejected": -5.344109058380127, + "step": 210 + }, + { + "epoch": 2.497041420118343, + "grad_norm": 17.82179694890819, + "learning_rate": 4.3397732278884194e-07, + "logits/chosen": -0.7793615460395813, + "logits/rejected": -0.7347290515899658, + "logps/chosen": -38.538475036621094, + "logps/rejected": -51.74705123901367, + "loss": 0.1133, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.012450069189071655, + "rewards/margins": 2.9992494583129883, + "rewards/rejected": -2.9867992401123047, + "step": 211 + }, + { + "epoch": 2.5088757396449703, + "grad_norm": 17.00316151116976, + "learning_rate": 4.330944118140406e-07, + "logits/chosen": -0.671549916267395, + "logits/rejected": -0.530448317527771, + "logps/chosen": -36.966880798339844, + "logps/rejected": -57.631778717041016, + "loss": 0.0971, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0925275981426239, + "rewards/margins": 5.078682899475098, + "rewards/rejected": -5.171210289001465, + "step": 212 + }, + { + "epoch": 2.5207100591715976, + "grad_norm": 12.753485442227316, + "learning_rate": 4.322065474782609e-07, + "logits/chosen": -0.9630086421966553, + "logits/rejected": -0.7986509799957275, + "logps/chosen": -34.51360321044922, + "logps/rejected": -55.286319732666016, + "loss": 0.0986, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17616373300552368, + "rewards/margins": 4.3915815353393555, + "rewards/rejected": -4.567745208740234, + "step": 213 + }, + { + "epoch": 2.532544378698225, + "grad_norm": 19.996942513731593, + "learning_rate": 4.313137538014198e-07, + "logits/chosen": -0.6898477077484131, + "logits/rejected": -0.636679470539093, + "logps/chosen": -35.616844177246094, + "logps/rejected": -56.1121826171875, + "loss": 0.119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24041543900966644, + "rewards/margins": 4.271088600158691, + "rewards/rejected": -4.51150369644165, + "step": 214 + }, + { + "epoch": 2.544378698224852, + "grad_norm": 16.48295901806472, + "learning_rate": 4.304160549367906e-07, + "logits/chosen": -1.0932834148406982, + "logits/rejected": -1.1098268032073975, + "logps/chosen": -29.987680435180664, + "logps/rejected": -46.118587493896484, + "loss": 0.0842, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.31450319290161133, + "rewards/margins": 3.8299648761749268, + "rewards/rejected": -3.5154621601104736, + "step": 215 + }, + { + "epoch": 2.556213017751479, + "grad_norm": 17.61093634952303, + "learning_rate": 4.295134751703492e-07, + "logits/chosen": -0.9821122884750366, + "logits/rejected": -1.0032066106796265, + "logps/chosen": -29.145097732543945, + "logps/rejected": -53.80962371826172, + "loss": 0.1108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3220353424549103, + "rewards/margins": 5.978228569030762, + "rewards/rejected": -6.300264358520508, + "step": 216 + }, + { + "epoch": 2.5680473372781067, + "grad_norm": 12.734227646855976, + "learning_rate": 4.28606038920118e-07, + "logits/chosen": -0.8004301190376282, + "logits/rejected": -0.6481240391731262, + "logps/chosen": -33.56897735595703, + "logps/rejected": -59.778018951416016, + "loss": 0.0721, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31138908863067627, + "rewards/margins": 5.115996837615967, + "rewards/rejected": -5.427386283874512, + "step": 217 + }, + { + "epoch": 2.5798816568047336, + "grad_norm": 19.316829502268995, + "learning_rate": 4.276937707355044e-07, + "logits/chosen": -0.8966995477676392, + "logits/rejected": -0.970150887966156, + "logps/chosen": -37.73095703125, + "logps/rejected": -48.6308708190918, + "loss": 0.1289, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6531825065612793, + "rewards/margins": 4.240229606628418, + "rewards/rejected": -4.893411636352539, + "step": 218 + }, + { + "epoch": 2.591715976331361, + "grad_norm": 15.35867876097276, + "learning_rate": 4.2677669529663686e-07, + "logits/chosen": -0.7338119745254517, + "logits/rejected": -0.7964376211166382, + "logps/chosen": -35.940223693847656, + "logps/rejected": -49.16367721557617, + "loss": 0.0979, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5470548868179321, + "rewards/margins": 4.513436317443848, + "rewards/rejected": -5.060491561889648, + "step": 219 + }, + { + "epoch": 2.603550295857988, + "grad_norm": 15.463745366871002, + "learning_rate": 4.2585483741369755e-07, + "logits/chosen": -0.7903125882148743, + "logits/rejected": -0.6668514609336853, + "logps/chosen": -39.22986602783203, + "logps/rejected": -56.949462890625, + "loss": 0.109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4734441637992859, + "rewards/margins": 3.438096523284912, + "rewards/rejected": -3.9115407466888428, + "step": 220 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 11.656065312741738, + "learning_rate": 4.2492822202625065e-07, + "logits/chosen": -0.746453046798706, + "logits/rejected": -0.5671579837799072, + "logps/chosen": -29.767318725585938, + "logps/rejected": -55.21434020996094, + "loss": 0.0675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5708121657371521, + "rewards/margins": 4.064294338226318, + "rewards/rejected": -4.635106563568115, + "step": 221 + }, + { + "epoch": 2.6272189349112427, + "grad_norm": 21.80300920307137, + "learning_rate": 4.239968742025684e-07, + "logits/chosen": -1.0824371576309204, + "logits/rejected": -1.0959596633911133, + "logps/chosen": -43.30390548706055, + "logps/rejected": -61.727516174316406, + "loss": 0.1193, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6028754711151123, + "rewards/margins": 4.46554708480835, + "rewards/rejected": -5.068422317504883, + "step": 222 + }, + { + "epoch": 2.63905325443787, + "grad_norm": 14.049260804964101, + "learning_rate": 4.2306081913895177e-07, + "logits/chosen": -0.9191200137138367, + "logits/rejected": -1.0407882928848267, + "logps/chosen": -32.684852600097656, + "logps/rejected": -46.769744873046875, + "loss": 0.0912, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8108103275299072, + "rewards/margins": 4.522951126098633, + "rewards/rejected": -5.333761215209961, + "step": 223 + }, + { + "epoch": 2.6508875739644973, + "grad_norm": 15.994418099284374, + "learning_rate": 4.2212008215905e-07, + "logits/chosen": -0.8396817445755005, + "logits/rejected": -0.7051962018013, + "logps/chosen": -41.968048095703125, + "logps/rejected": -68.28170776367188, + "loss": 0.0873, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5884815454483032, + "rewards/margins": 5.965177536010742, + "rewards/rejected": -6.553658962249756, + "step": 224 + }, + { + "epoch": 2.662721893491124, + "grad_norm": 17.35636622879192, + "learning_rate": 4.2117468871317465e-07, + "logits/chosen": -1.0077329874038696, + "logits/rejected": -1.021416425704956, + "logps/chosen": -33.865623474121094, + "logps/rejected": -55.850704193115234, + "loss": 0.1052, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3826393485069275, + "rewards/margins": 5.580774784088135, + "rewards/rejected": -5.963414192199707, + "step": 225 + }, + { + "epoch": 2.6745562130177514, + "grad_norm": 12.60270784440611, + "learning_rate": 4.2022466437761154e-07, + "logits/chosen": -1.1540577411651611, + "logits/rejected": -1.0161852836608887, + "logps/chosen": -32.255516052246094, + "logps/rejected": -64.34480285644531, + "loss": 0.0715, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15079544484615326, + "rewards/margins": 4.2565717697143555, + "rewards/rejected": -4.105776786804199, + "step": 226 + }, + { + "epoch": 2.6863905325443787, + "grad_norm": 11.837128157661406, + "learning_rate": 4.1927003485392873e-07, + "logits/chosen": -1.1032685041427612, + "logits/rejected": -1.2223795652389526, + "logps/chosen": -30.195100784301758, + "logps/rejected": -43.28514099121094, + "loss": 0.0599, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1709037721157074, + "rewards/margins": 3.4937398433685303, + "rewards/rejected": -3.6646437644958496, + "step": 227 + }, + { + "epoch": 2.698224852071006, + "grad_norm": 16.871047034781974, + "learning_rate": 4.18310825968281e-07, + "logits/chosen": -1.022660732269287, + "logits/rejected": -1.1288559436798096, + "logps/chosen": -40.125972747802734, + "logps/rejected": -48.385841369628906, + "loss": 0.1092, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03397001326084137, + "rewards/margins": 3.533397674560547, + "rewards/rejected": -3.4994277954101562, + "step": 228 + }, + { + "epoch": 2.710059171597633, + "grad_norm": 14.91535959862739, + "learning_rate": 4.173470636707115e-07, + "logits/chosen": -1.0440349578857422, + "logits/rejected": -1.0170382261276245, + "logps/chosen": -31.154685974121094, + "logps/rejected": -54.95928192138672, + "loss": 0.0941, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.514030933380127, + "rewards/margins": 5.214803695678711, + "rewards/rejected": -5.72883415222168, + "step": 229 + }, + { + "epoch": 2.7218934911242605, + "grad_norm": 16.4841578502459, + "learning_rate": 4.1637877403444923e-07, + "logits/chosen": -0.771393358707428, + "logits/rejected": -0.7641423940658569, + "logps/chosen": -42.56052017211914, + "logps/rejected": -58.539730072021484, + "loss": 0.0918, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2780624628067017, + "rewards/margins": 5.585483551025391, + "rewards/rejected": -6.863546371459961, + "step": 230 + }, + { + "epoch": 2.7337278106508878, + "grad_norm": 12.082861048786787, + "learning_rate": 4.1540598325520406e-07, + "logits/chosen": -0.8163421154022217, + "logits/rejected": -0.8682371377944946, + "logps/chosen": -26.423309326171875, + "logps/rejected": -41.69921875, + "loss": 0.0766, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16670477390289307, + "rewards/margins": 4.770580291748047, + "rewards/rejected": -4.93728494644165, + "step": 231 + }, + { + "epoch": 2.7455621301775146, + "grad_norm": 13.206244471660067, + "learning_rate": 4.144287176504582e-07, + "logits/chosen": -1.0770142078399658, + "logits/rejected": -1.100901484489441, + "logps/chosen": -43.73130416870117, + "logps/rejected": -58.910987854003906, + "loss": 0.0782, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.775425136089325, + "rewards/margins": 5.230464935302734, + "rewards/rejected": -6.005890369415283, + "step": 232 + }, + { + "epoch": 2.757396449704142, + "grad_norm": 18.831608964137178, + "learning_rate": 4.1344700365875353e-07, + "logits/chosen": -1.2907536029815674, + "logits/rejected": -1.1133619546890259, + "logps/chosen": -42.22511672973633, + "logps/rejected": -80.1695327758789, + "loss": 0.1184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7888930439949036, + "rewards/margins": 4.728762149810791, + "rewards/rejected": -5.517655372619629, + "step": 233 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 18.13801394049149, + "learning_rate": 4.1246086783897713e-07, + "logits/chosen": -0.7159754037857056, + "logits/rejected": -0.9155410528182983, + "logps/chosen": -38.486732482910156, + "logps/rejected": -46.4632568359375, + "loss": 0.1165, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.28693825006484985, + "rewards/margins": 4.148380756378174, + "rewards/rejected": -4.435318946838379, + "step": 234 + }, + { + "epoch": 2.7810650887573964, + "grad_norm": 14.561324524299124, + "learning_rate": 4.1147033686964213e-07, + "logits/chosen": -1.0087186098098755, + "logits/rejected": -1.0201172828674316, + "logps/chosen": -38.35453796386719, + "logps/rejected": -61.89058303833008, + "loss": 0.082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6061041951179504, + "rewards/margins": 6.066676139831543, + "rewards/rejected": -6.6727800369262695, + "step": 235 + }, + { + "epoch": 2.7928994082840237, + "grad_norm": 18.20449643988736, + "learning_rate": 4.104754375481664e-07, + "logits/chosen": -1.0226837396621704, + "logits/rejected": -1.1159076690673828, + "logps/chosen": -33.471160888671875, + "logps/rejected": -49.130104064941406, + "loss": 0.1018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9132391214370728, + "rewards/margins": 4.530313491821289, + "rewards/rejected": -5.443552494049072, + "step": 236 + }, + { + "epoch": 2.804733727810651, + "grad_norm": 18.714482911950206, + "learning_rate": 4.0947619679014733e-07, + "logits/chosen": -1.3095102310180664, + "logits/rejected": -1.3122498989105225, + "logps/chosen": -37.17940902709961, + "logps/rejected": -56.99421691894531, + "loss": 0.0986, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6820850372314453, + "rewards/margins": 5.377782821655273, + "rewards/rejected": -6.059867858886719, + "step": 237 + }, + { + "epoch": 2.8165680473372783, + "grad_norm": 11.370290693432937, + "learning_rate": 4.084726416286337e-07, + "logits/chosen": -1.0340602397918701, + "logits/rejected": -0.8559252023696899, + "logps/chosen": -36.95317840576172, + "logps/rejected": -69.14118957519531, + "loss": 0.0699, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7623500227928162, + "rewards/margins": 6.243594169616699, + "rewards/rejected": -7.00594425201416, + "step": 238 + }, + { + "epoch": 2.828402366863905, + "grad_norm": 18.470159692479328, + "learning_rate": 4.0746479921339456e-07, + "logits/chosen": -1.0048712491989136, + "logits/rejected": -1.0136165618896484, + "logps/chosen": -51.31761169433594, + "logps/rejected": -61.482666015625, + "loss": 0.102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9726583361625671, + "rewards/margins": 3.5378739833831787, + "rewards/rejected": -4.510532379150391, + "step": 239 + }, + { + "epoch": 2.8402366863905324, + "grad_norm": 11.915657733560364, + "learning_rate": 4.0645269681018434e-07, + "logits/chosen": -1.22614324092865, + "logits/rejected": -1.3402845859527588, + "logps/chosen": -37.68153381347656, + "logps/rejected": -55.563926696777344, + "loss": 0.0739, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6234626173973083, + "rewards/margins": 4.600487232208252, + "rewards/rejected": -5.223949909210205, + "step": 240 + }, + { + "epoch": 2.8520710059171597, + "grad_norm": 15.928696873524284, + "learning_rate": 4.054363618000057e-07, + "logits/chosen": -1.0241750478744507, + "logits/rejected": -0.9920932650566101, + "logps/chosen": -41.79422378540039, + "logps/rejected": -73.42777252197266, + "loss": 0.1032, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0987391397356987, + "rewards/margins": 6.280758857727051, + "rewards/rejected": -6.18202018737793, + "step": 241 + }, + { + "epoch": 2.863905325443787, + "grad_norm": 14.965124803326795, + "learning_rate": 4.044158216783684e-07, + "logits/chosen": -1.2023519277572632, + "logits/rejected": -1.1554661989212036, + "logps/chosen": -50.22661590576172, + "logps/rejected": -60.445709228515625, + "loss": 0.0811, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7524704933166504, + "rewards/margins": 4.429455280303955, + "rewards/rejected": -5.1819257736206055, + "step": 242 + }, + { + "epoch": 2.8757396449704142, + "grad_norm": 15.090679655603102, + "learning_rate": 4.033911040545453e-07, + "logits/chosen": -1.0586270093917847, + "logits/rejected": -1.1135869026184082, + "logps/chosen": -39.405941009521484, + "logps/rejected": -55.257102966308594, + "loss": 0.0836, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5358131527900696, + "rewards/margins": 4.630053520202637, + "rewards/rejected": -5.165866374969482, + "step": 243 + }, + { + "epoch": 2.8875739644970415, + "grad_norm": 15.379447226520636, + "learning_rate": 4.0236223665082605e-07, + "logits/chosen": -0.9784256219863892, + "logits/rejected": -1.0398386716842651, + "logps/chosen": -40.73570251464844, + "logps/rejected": -48.330848693847656, + "loss": 0.0896, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9628621935844421, + "rewards/margins": 3.23721981048584, + "rewards/rejected": -4.200081825256348, + "step": 244 + }, + { + "epoch": 2.899408284023669, + "grad_norm": 16.74378459980107, + "learning_rate": 4.0132924730176653e-07, + "logits/chosen": -0.723244309425354, + "logits/rejected": -0.7906000018119812, + "logps/chosen": -33.3350944519043, + "logps/rejected": -44.06977844238281, + "loss": 0.1087, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1651308238506317, + "rewards/margins": 4.324864864349365, + "rewards/rejected": -4.489995956420898, + "step": 245 + }, + { + "epoch": 2.9112426035502956, + "grad_norm": 16.171542896949546, + "learning_rate": 4.0029216395343617e-07, + "logits/chosen": -0.9244989156723022, + "logits/rejected": -0.7894719839096069, + "logps/chosen": -38.35075759887695, + "logps/rejected": -63.83277893066406, + "loss": 0.104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6405995488166809, + "rewards/margins": 5.280316352844238, + "rewards/rejected": -5.9209160804748535, + "step": 246 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 16.103173612778058, + "learning_rate": 3.992510146626617e-07, + "logits/chosen": -1.072364091873169, + "logits/rejected": -1.219006061553955, + "logps/chosen": -52.24778366088867, + "logps/rejected": -54.28422546386719, + "loss": 0.088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5743333101272583, + "rewards/margins": 4.490229606628418, + "rewards/rejected": -5.064562797546387, + "step": 247 + }, + { + "epoch": 2.93491124260355, + "grad_norm": 21.934919226614063, + "learning_rate": 3.982058275962682e-07, + "logits/chosen": -1.0643049478530884, + "logits/rejected": -1.0713043212890625, + "logps/chosen": -38.724098205566406, + "logps/rejected": -49.14268112182617, + "loss": 0.1283, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.003525674343109131, + "rewards/margins": 3.779031991958618, + "rewards/rejected": -3.775506019592285, + "step": 248 + }, + { + "epoch": 2.9467455621301775, + "grad_norm": 13.845420751689739, + "learning_rate": 3.9715663103031706e-07, + "logits/chosen": -0.8697773218154907, + "logits/rejected": -1.0021997690200806, + "logps/chosen": -48.516197204589844, + "logps/rejected": -63.481204986572266, + "loss": 0.0863, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2427257299423218, + "rewards/margins": 5.817460060119629, + "rewards/rejected": -7.06018590927124, + "step": 249 + }, + { + "epoch": 2.9585798816568047, + "grad_norm": 15.20831572963414, + "learning_rate": 3.9610345334934094e-07, + "logits/chosen": -0.9560275673866272, + "logits/rejected": -0.7608417272567749, + "logps/chosen": -27.386804580688477, + "logps/rejected": -56.76541519165039, + "loss": 0.0957, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.055940259248018265, + "rewards/margins": 4.628420829772949, + "rewards/rejected": -4.57248067855835, + "step": 250 + }, + { + "epoch": 2.970414201183432, + "grad_norm": 13.509957666437309, + "learning_rate": 3.950463230455761e-07, + "logits/chosen": -0.9403877258300781, + "logits/rejected": -1.0766938924789429, + "logps/chosen": -46.44561004638672, + "logps/rejected": -55.376487731933594, + "loss": 0.0769, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6139943599700928, + "rewards/margins": 5.4038591384887695, + "rewards/rejected": -7.017853260040283, + "step": 251 + }, + { + "epoch": 2.9822485207100593, + "grad_norm": 13.281356988917038, + "learning_rate": 3.939852687181915e-07, + "logits/chosen": -1.5572373867034912, + "logits/rejected": -1.509653091430664, + "logps/chosen": -38.27780532836914, + "logps/rejected": -59.58174514770508, + "loss": 0.0765, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13283273577690125, + "rewards/margins": 6.1197829246521, + "rewards/rejected": -6.252615451812744, + "step": 252 + }, + { + "epoch": 2.994082840236686, + "grad_norm": 13.363993336432463, + "learning_rate": 3.9292031907251464e-07, + "logits/chosen": -0.9404221773147583, + "logits/rejected": -0.8659825921058655, + "logps/chosen": -47.44232177734375, + "logps/rejected": -72.00565338134766, + "loss": 0.0827, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.288888692855835, + "rewards/margins": 6.671911239624023, + "rewards/rejected": -8.960800170898438, + "step": 253 + }, + { + "epoch": 3.0059171597633134, + "grad_norm": 13.191144507547312, + "learning_rate": 3.9185150291925585e-07, + "logits/chosen": -0.9490239024162292, + "logits/rejected": -0.7950053215026855, + "logps/chosen": -27.338720321655273, + "logps/rejected": -54.10641860961914, + "loss": 0.085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2650642395019531, + "rewards/margins": 4.751713275909424, + "rewards/rejected": -5.016777992248535, + "step": 254 + }, + { + "epoch": 3.0177514792899407, + "grad_norm": 10.096716674793884, + "learning_rate": 3.9077884917372806e-07, + "logits/chosen": -0.9493421912193298, + "logits/rejected": -0.9330881834030151, + "logps/chosen": -33.75062561035156, + "logps/rejected": -58.77503967285156, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49208763241767883, + "rewards/margins": 5.006468296051025, + "rewards/rejected": -5.498556137084961, + "step": 255 + }, + { + "epoch": 3.029585798816568, + "grad_norm": 8.187374294888702, + "learning_rate": 3.8970238685506486e-07, + "logits/chosen": -1.0067384243011475, + "logits/rejected": -1.0715603828430176, + "logps/chosen": -29.130413055419922, + "logps/rejected": -55.432456970214844, + "loss": 0.0465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37083467841148376, + "rewards/margins": 6.350147247314453, + "rewards/rejected": -6.720982551574707, + "step": 256 + }, + { + "epoch": 3.0414201183431953, + "grad_norm": 10.400956778483433, + "learning_rate": 3.8862214508543544e-07, + "logits/chosen": -1.1537256240844727, + "logits/rejected": -1.2295022010803223, + "logps/chosen": -44.73936080932617, + "logps/rejected": -56.7740478515625, + "loss": 0.0558, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8702179789543152, + "rewards/margins": 4.224546432495117, + "rewards/rejected": -5.09476375579834, + "step": 257 + }, + { + "epoch": 3.0532544378698225, + "grad_norm": 9.746601784268003, + "learning_rate": 3.8753815308925685e-07, + "logits/chosen": -0.9428281784057617, + "logits/rejected": -0.7745850682258606, + "logps/chosen": -42.13050842285156, + "logps/rejected": -81.40808868408203, + "loss": 0.0576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15858441591262817, + "rewards/margins": 5.615185260772705, + "rewards/rejected": -5.773769378662109, + "step": 258 + }, + { + "epoch": 3.06508875739645, + "grad_norm": 8.307502810968026, + "learning_rate": 3.864504401924031e-07, + "logits/chosen": -0.750975489616394, + "logits/rejected": -0.8657329082489014, + "logps/chosen": -42.026031494140625, + "logps/rejected": -60.68046951293945, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10235822200775146, + "rewards/margins": 5.683484077453613, + "rewards/rejected": -5.7858428955078125, + "step": 259 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 10.229530562143605, + "learning_rate": 3.8535903582141184e-07, + "logits/chosen": -0.9478952288627625, + "logits/rejected": -1.0307767391204834, + "logps/chosen": -55.219512939453125, + "logps/rejected": -65.23870849609375, + "loss": 0.0592, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5834192633628845, + "rewards/margins": 5.146862983703613, + "rewards/rejected": -5.730282306671143, + "step": 260 + }, + { + "epoch": 3.088757396449704, + "grad_norm": 11.360767151960536, + "learning_rate": 3.8426396950268846e-07, + "logits/chosen": -0.9507812261581421, + "logits/rejected": -1.0435519218444824, + "logps/chosen": -37.444190979003906, + "logps/rejected": -53.09665298461914, + "loss": 0.0683, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7893316745758057, + "rewards/margins": 4.056205749511719, + "rewards/rejected": -4.845537185668945, + "step": 261 + }, + { + "epoch": 3.100591715976331, + "grad_norm": 10.428147481093854, + "learning_rate": 3.8316527086170727e-07, + "logits/chosen": -1.1136678457260132, + "logits/rejected": -1.1166181564331055, + "logps/chosen": -46.36027526855469, + "logps/rejected": -61.403350830078125, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6686680316925049, + "rewards/margins": 5.635904312133789, + "rewards/rejected": -7.304572582244873, + "step": 262 + }, + { + "epoch": 3.1124260355029585, + "grad_norm": 9.067198800478028, + "learning_rate": 3.820629696222096e-07, + "logits/chosen": -0.6134490370750427, + "logits/rejected": -0.6816850900650024, + "logps/chosen": -37.14327621459961, + "logps/rejected": -45.768043518066406, + "loss": 0.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10417760908603668, + "rewards/margins": 4.2560577392578125, + "rewards/rejected": -4.36023473739624, + "step": 263 + }, + { + "epoch": 3.1242603550295858, + "grad_norm": 9.142972342246532, + "learning_rate": 3.809570956054003e-07, + "logits/chosen": -1.110413908958435, + "logits/rejected": -1.2050219774246216, + "logps/chosen": -46.59968185424805, + "logps/rejected": -67.91047668457031, + "loss": 0.0521, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3769968748092651, + "rewards/margins": 4.8837504386901855, + "rewards/rejected": -6.260746955871582, + "step": 264 + }, + { + "epoch": 3.136094674556213, + "grad_norm": 9.722445350426577, + "learning_rate": 3.798476787291407e-07, + "logits/chosen": -1.204038143157959, + "logits/rejected": -1.2214388847351074, + "logps/chosen": -55.631988525390625, + "logps/rejected": -70.27022552490234, + "loss": 0.0545, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.43508243560791, + "rewards/margins": 5.983229637145996, + "rewards/rejected": -8.418312072753906, + "step": 265 + }, + { + "epoch": 3.1479289940828403, + "grad_norm": 12.165954862754088, + "learning_rate": 3.787347490071389e-07, + "logits/chosen": -1.0962128639221191, + "logits/rejected": -1.0446038246154785, + "logps/chosen": -30.58979034423828, + "logps/rejected": -57.98908233642578, + "loss": 0.0751, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3746563792228699, + "rewards/margins": 6.635012626647949, + "rewards/rejected": -7.009669303894043, + "step": 266 + }, + { + "epoch": 3.1597633136094676, + "grad_norm": 9.45931606123372, + "learning_rate": 3.776183365481385e-07, + "logits/chosen": -1.1556551456451416, + "logits/rejected": -1.1371455192565918, + "logps/chosen": -29.80742835998535, + "logps/rejected": -65.14445495605469, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.65096515417099, + "rewards/margins": 8.631277084350586, + "rewards/rejected": -9.282241821289062, + "step": 267 + }, + { + "epoch": 3.171597633136095, + "grad_norm": 8.216009521089312, + "learning_rate": 3.764984715551031e-07, + "logits/chosen": -0.7823802828788757, + "logits/rejected": -0.8771790266036987, + "logps/chosen": -33.291709899902344, + "logps/rejected": -53.34422302246094, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4557185471057892, + "rewards/margins": 5.85953426361084, + "rewards/rejected": -6.315252780914307, + "step": 268 + }, + { + "epoch": 3.1834319526627217, + "grad_norm": 8.44405243827575, + "learning_rate": 3.753751843244003e-07, + "logits/chosen": -0.6730700731277466, + "logits/rejected": -0.7033834457397461, + "logps/chosen": -38.24939727783203, + "logps/rejected": -50.691505432128906, + "loss": 0.0506, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48223841190338135, + "rewards/margins": 5.0005269050598145, + "rewards/rejected": -5.482766151428223, + "step": 269 + }, + { + "epoch": 3.195266272189349, + "grad_norm": 12.678568405782729, + "learning_rate": 3.7424850524498113e-07, + "logits/chosen": -1.1234819889068604, + "logits/rejected": -1.0680205821990967, + "logps/chosen": -49.61648178100586, + "logps/rejected": -62.6021728515625, + "loss": 0.0669, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4030035734176636, + "rewards/margins": 6.600266456604004, + "rewards/rejected": -8.003270149230957, + "step": 270 + }, + { + "epoch": 3.2071005917159763, + "grad_norm": 8.769925116184863, + "learning_rate": 3.731184647975584e-07, + "logits/chosen": -1.0641188621520996, + "logits/rejected": -1.0786867141723633, + "logps/chosen": -39.86817169189453, + "logps/rejected": -59.44279479980469, + "loss": 0.0539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2523530125617981, + "rewards/margins": 5.5049519538879395, + "rewards/rejected": -5.757305145263672, + "step": 271 + }, + { + "epoch": 3.2189349112426036, + "grad_norm": 9.128279564046098, + "learning_rate": 3.7198509355378207e-07, + "logits/chosen": -1.1525990962982178, + "logits/rejected": -1.1050852537155151, + "logps/chosen": -31.691631317138672, + "logps/rejected": -58.040374755859375, + "loss": 0.0547, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3830980956554413, + "rewards/margins": 5.858831882476807, + "rewards/rejected": -6.24193000793457, + "step": 272 + }, + { + "epoch": 3.230769230769231, + "grad_norm": 5.945965238283508, + "learning_rate": 3.7084842217541196e-07, + "logits/chosen": -1.1103689670562744, + "logits/rejected": -1.2655141353607178, + "logps/chosen": -43.58381652832031, + "logps/rejected": -63.32284927368164, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6699415445327759, + "rewards/margins": 7.9212541580200195, + "rewards/rejected": -8.591195106506348, + "step": 273 + }, + { + "epoch": 3.242603550295858, + "grad_norm": 9.302647208034704, + "learning_rate": 3.6970848141348855e-07, + "logits/chosen": -1.2158900499343872, + "logits/rejected": -1.2491604089736938, + "logps/chosen": -43.763771057128906, + "logps/rejected": -65.52889251708984, + "loss": 0.0642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5131170153617859, + "rewards/margins": 5.668525218963623, + "rewards/rejected": -6.181642532348633, + "step": 274 + }, + { + "epoch": 3.2544378698224854, + "grad_norm": 8.636078483244923, + "learning_rate": 3.685653021075006e-07, + "logits/chosen": -1.3144724369049072, + "logits/rejected": -1.1518325805664062, + "logps/chosen": -32.579593658447266, + "logps/rejected": -59.97221374511719, + "loss": 0.0518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.165370911359787, + "rewards/margins": 6.193992614746094, + "rewards/rejected": -6.359362602233887, + "step": 275 + }, + { + "epoch": 3.2662721893491122, + "grad_norm": 8.631432982295077, + "learning_rate": 3.6741891518455146e-07, + "logits/chosen": -1.0432945489883423, + "logits/rejected": -0.9308419227600098, + "logps/chosen": -38.913291931152344, + "logps/rejected": -64.8463363647461, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2648208141326904, + "rewards/margins": 7.041728496551514, + "rewards/rejected": -8.306549072265625, + "step": 276 + }, + { + "epoch": 3.2781065088757395, + "grad_norm": 8.627648180187752, + "learning_rate": 3.6626935165852183e-07, + "logits/chosen": -1.1467763185501099, + "logits/rejected": -0.95106440782547, + "logps/chosen": -45.197444915771484, + "logps/rejected": -77.6880874633789, + "loss": 0.0575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7282447218894958, + "rewards/margins": 6.463216304779053, + "rewards/rejected": -7.191461086273193, + "step": 277 + }, + { + "epoch": 3.289940828402367, + "grad_norm": 10.925286523452975, + "learning_rate": 3.6511664262923094e-07, + "logits/chosen": -1.0780082941055298, + "logits/rejected": -1.060866355895996, + "logps/chosen": -41.23103332519531, + "logps/rejected": -64.39918518066406, + "loss": 0.0655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5627799034118652, + "rewards/margins": 4.834949970245361, + "rewards/rejected": -5.397729873657227, + "step": 278 + }, + { + "epoch": 3.301775147928994, + "grad_norm": 7.9347499078081745, + "learning_rate": 3.639608192815951e-07, + "logits/chosen": -1.1390736103057861, + "logits/rejected": -1.1505769491195679, + "logps/chosen": -31.97028923034668, + "logps/rejected": -52.920997619628906, + "loss": 0.0435, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.059164464473724365, + "rewards/margins": 5.323459148406982, + "rewards/rejected": -5.264294624328613, + "step": 279 + }, + { + "epoch": 3.3136094674556213, + "grad_norm": 9.658317222064715, + "learning_rate": 3.6280191288478435e-07, + "logits/chosen": -0.3561999797821045, + "logits/rejected": -0.4131832420825958, + "logps/chosen": -49.74396514892578, + "logps/rejected": -51.83927917480469, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05161542445421219, + "rewards/margins": 3.9036967754364014, + "rewards/rejected": -3.852081298828125, + "step": 280 + }, + { + "epoch": 3.3254437869822486, + "grad_norm": 7.0333602841005565, + "learning_rate": 3.61639954791376e-07, + "logits/chosen": -0.7871007919311523, + "logits/rejected": -0.7978352308273315, + "logps/chosen": -37.140010833740234, + "logps/rejected": -54.86417007446289, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12373484671115875, + "rewards/margins": 5.170371055603027, + "rewards/rejected": -5.2941060066223145, + "step": 281 + }, + { + "epoch": 3.337278106508876, + "grad_norm": 7.845433796623778, + "learning_rate": 3.604749764365069e-07, + "logits/chosen": -0.9799227714538574, + "logits/rejected": -0.933765709400177, + "logps/chosen": -41.03892517089844, + "logps/rejected": -58.41741943359375, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3890104293823242, + "rewards/margins": 4.3942551612854, + "rewards/rejected": -4.783265113830566, + "step": 282 + }, + { + "epoch": 3.3491124260355027, + "grad_norm": 8.104393341578428, + "learning_rate": 3.593070093370226e-07, + "logits/chosen": -1.1032558679580688, + "logits/rejected": -1.1244325637817383, + "logps/chosen": -37.84381866455078, + "logps/rejected": -54.38085174560547, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03373198211193085, + "rewards/margins": 5.32047176361084, + "rewards/rejected": -5.286739349365234, + "step": 283 + }, + { + "epoch": 3.36094674556213, + "grad_norm": 7.636891980111439, + "learning_rate": 3.5813608509062526e-07, + "logits/chosen": -1.0172635316848755, + "logits/rejected": -1.1440664529800415, + "logps/chosen": -46.45831298828125, + "logps/rejected": -58.084495544433594, + "loss": 0.0364, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6164510250091553, + "rewards/margins": 5.46311092376709, + "rewards/rejected": -7.079561233520508, + "step": 284 + }, + { + "epoch": 3.3727810650887573, + "grad_norm": 8.575228042301212, + "learning_rate": 3.569622353750181e-07, + "logits/chosen": -1.1266000270843506, + "logits/rejected": -1.0841931104660034, + "logps/chosen": -30.485980987548828, + "logps/rejected": -66.34876251220703, + "loss": 0.0429, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12574508786201477, + "rewards/margins": 6.036714553833008, + "rewards/rejected": -5.910969257354736, + "step": 285 + }, + { + "epoch": 3.3846153846153846, + "grad_norm": 14.644268169698243, + "learning_rate": 3.557854919470491e-07, + "logits/chosen": -0.8823134899139404, + "logits/rejected": -0.9364801645278931, + "logps/chosen": -42.28858947753906, + "logps/rejected": -47.119564056396484, + "loss": 0.0808, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7562182545661926, + "rewards/margins": 3.6561923027038574, + "rewards/rejected": -4.412410736083984, + "step": 286 + }, + { + "epoch": 3.396449704142012, + "grad_norm": 8.345961744927349, + "learning_rate": 3.546058866418513e-07, + "logits/chosen": -1.3047679662704468, + "logits/rejected": -1.2904943227767944, + "logps/chosen": -32.27897644042969, + "logps/rejected": -55.80604553222656, + "loss": 0.0401, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10520702600479126, + "rewards/margins": 6.855145454406738, + "rewards/rejected": -6.960352420806885, + "step": 287 + }, + { + "epoch": 3.408284023668639, + "grad_norm": 6.8991491567748895, + "learning_rate": 3.5342345137198206e-07, + "logits/chosen": -1.0065886974334717, + "logits/rejected": -0.9561058282852173, + "logps/chosen": -36.960044860839844, + "logps/rejected": -61.72477340698242, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1544219255447388, + "rewards/margins": 6.217504978179932, + "rewards/rejected": -7.371927261352539, + "step": 288 + }, + { + "epoch": 3.4201183431952664, + "grad_norm": 10.230292373857246, + "learning_rate": 3.5223821812655903e-07, + "logits/chosen": -0.9817609190940857, + "logits/rejected": -1.056479811668396, + "logps/chosen": -42.391998291015625, + "logps/rejected": -52.13397979736328, + "loss": 0.0515, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1262445449829102, + "rewards/margins": 5.622671604156494, + "rewards/rejected": -6.748915672302246, + "step": 289 + }, + { + "epoch": 3.4319526627218933, + "grad_norm": 11.408367741978596, + "learning_rate": 3.510502189703954e-07, + "logits/chosen": -0.8357728719711304, + "logits/rejected": -0.7616229057312012, + "logps/chosen": -42.24382781982422, + "logps/rejected": -66.47443389892578, + "loss": 0.0657, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8911622762680054, + "rewards/margins": 6.7061848640441895, + "rewards/rejected": -7.597347259521484, + "step": 290 + }, + { + "epoch": 3.4437869822485205, + "grad_norm": 12.561936917149565, + "learning_rate": 3.4985948604313237e-07, + "logits/chosen": -0.8755354285240173, + "logits/rejected": -0.8271730542182922, + "logps/chosen": -31.485605239868164, + "logps/rejected": -58.347190856933594, + "loss": 0.0752, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19215503334999084, + "rewards/margins": 5.973523139953613, + "rewards/rejected": -5.781367301940918, + "step": 291 + }, + { + "epoch": 3.455621301775148, + "grad_norm": 11.955176243577917, + "learning_rate": 3.486660515583691e-07, + "logits/chosen": -1.228103518486023, + "logits/rejected": -1.2567815780639648, + "logps/chosen": -41.65220260620117, + "logps/rejected": -58.665496826171875, + "loss": 0.0598, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3302292823791504, + "rewards/margins": 5.375921726226807, + "rewards/rejected": -6.706151008605957, + "step": 292 + }, + { + "epoch": 3.467455621301775, + "grad_norm": 10.442695814707806, + "learning_rate": 3.474699478027918e-07, + "logits/chosen": -1.121075987815857, + "logits/rejected": -1.162339687347412, + "logps/chosen": -40.622459411621094, + "logps/rejected": -55.069026947021484, + "loss": 0.0514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2291250228881836, + "rewards/margins": 5.634562015533447, + "rewards/rejected": -5.863687515258789, + "step": 293 + }, + { + "epoch": 3.4792899408284024, + "grad_norm": 9.212432774516737, + "learning_rate": 3.4627120713529983e-07, + "logits/chosen": -1.1274569034576416, + "logits/rejected": -1.0957834720611572, + "logps/chosen": -29.82750129699707, + "logps/rejected": -60.58903503417969, + "loss": 0.0459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8590860366821289, + "rewards/margins": 6.390963554382324, + "rewards/rejected": -7.250050067901611, + "step": 294 + }, + { + "epoch": 3.4911242603550297, + "grad_norm": 8.987438061891513, + "learning_rate": 3.4506986198613077e-07, + "logits/chosen": -1.041571855545044, + "logits/rejected": -1.0537890195846558, + "logps/chosen": -41.13783264160156, + "logps/rejected": -72.97576904296875, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5465377569198608, + "rewards/margins": 7.734911918640137, + "rewards/rejected": -8.281450271606445, + "step": 295 + }, + { + "epoch": 3.502958579881657, + "grad_norm": 7.117159631612635, + "learning_rate": 3.438659448559825e-07, + "logits/chosen": -1.280914306640625, + "logits/rejected": -1.3170225620269775, + "logps/chosen": -36.09680938720703, + "logps/rejected": -57.332427978515625, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9314979910850525, + "rewards/margins": 7.998897075653076, + "rewards/rejected": -8.930395126342773, + "step": 296 + }, + { + "epoch": 3.5147928994082838, + "grad_norm": 9.213072401647429, + "learning_rate": 3.4265948831513434e-07, + "logits/chosen": -1.0224130153656006, + "logits/rejected": -1.0106931924819946, + "logps/chosen": -51.425453186035156, + "logps/rejected": -59.91952896118164, + "loss": 0.0492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6506232023239136, + "rewards/margins": 6.213413715362549, + "rewards/rejected": -6.864037036895752, + "step": 297 + }, + { + "epoch": 3.5266272189349115, + "grad_norm": 8.34764646763322, + "learning_rate": 3.414505250025659e-07, + "logits/chosen": -0.5949307680130005, + "logits/rejected": -0.6818762421607971, + "logps/chosen": -31.321874618530273, + "logps/rejected": -45.574928283691406, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6341886520385742, + "rewards/margins": 4.197469234466553, + "rewards/rejected": -4.831657886505127, + "step": 298 + }, + { + "epoch": 3.5384615384615383, + "grad_norm": 9.531733325613454, + "learning_rate": 3.402390876250737e-07, + "logits/chosen": -0.9537699222564697, + "logits/rejected": -0.8830502033233643, + "logps/chosen": -43.555381774902344, + "logps/rejected": -60.58173751831055, + "loss": 0.0451, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3789546489715576, + "rewards/margins": 5.775010585784912, + "rewards/rejected": -7.153965473175049, + "step": 299 + }, + { + "epoch": 3.5502958579881656, + "grad_norm": 9.379884768180517, + "learning_rate": 3.390252089563867e-07, + "logits/chosen": -1.3647562265396118, + "logits/rejected": -1.2569482326507568, + "logps/chosen": -32.759376525878906, + "logps/rejected": -46.86608123779297, + "loss": 0.0402, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.663446307182312, + "rewards/margins": 4.781562805175781, + "rewards/rejected": -5.445009231567383, + "step": 300 + }, + { + "epoch": 3.562130177514793, + "grad_norm": 6.852898201829805, + "learning_rate": 3.3780892183627974e-07, + "logits/chosen": -1.2863893508911133, + "logits/rejected": -1.256026029586792, + "logps/chosen": -45.72820281982422, + "logps/rejected": -76.36709594726562, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.662516713142395, + "rewards/margins": 7.760738372802734, + "rewards/rejected": -8.423254013061523, + "step": 301 + }, + { + "epoch": 3.57396449704142, + "grad_norm": 9.656007482244595, + "learning_rate": 3.3659025916968475e-07, + "logits/chosen": -1.1637496948242188, + "logits/rejected": -1.0999524593353271, + "logps/chosen": -40.303245544433594, + "logps/rejected": -65.28178405761719, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.500770092010498, + "rewards/margins": 6.479090690612793, + "rewards/rejected": -7.979861259460449, + "step": 302 + }, + { + "epoch": 3.5857988165680474, + "grad_norm": 6.56810361487223, + "learning_rate": 3.353692539258006e-07, + "logits/chosen": -1.1888104677200317, + "logits/rejected": -1.1970776319503784, + "logps/chosen": -55.63449478149414, + "logps/rejected": -79.99554443359375, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9003673791885376, + "rewards/margins": 7.262213706970215, + "rewards/rejected": -9.162581443786621, + "step": 303 + }, + { + "epoch": 3.5976331360946747, + "grad_norm": 10.490981950638373, + "learning_rate": 3.3414593913720155e-07, + "logits/chosen": -0.9099432229995728, + "logits/rejected": -0.8601805567741394, + "logps/chosen": -41.21411895751953, + "logps/rejected": -65.25252532958984, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1893513202667236, + "rewards/margins": 5.567013740539551, + "rewards/rejected": -6.756364822387695, + "step": 304 + }, + { + "epoch": 3.609467455621302, + "grad_norm": 6.610277331758854, + "learning_rate": 3.329203478989431e-07, + "logits/chosen": -1.0593793392181396, + "logits/rejected": -1.0052438974380493, + "logps/chosen": -36.38404083251953, + "logps/rejected": -59.90925598144531, + "loss": 0.0423, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4144278764724731, + "rewards/margins": 5.430604457855225, + "rewards/rejected": -6.845032215118408, + "step": 305 + }, + { + "epoch": 3.621301775147929, + "grad_norm": 9.012951266000211, + "learning_rate": 3.3169251336766697e-07, + "logits/chosen": -1.0458168983459473, + "logits/rejected": -1.012871265411377, + "logps/chosen": -35.66429138183594, + "logps/rejected": -57.909767150878906, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3210654258728027, + "rewards/margins": 6.225252151489258, + "rewards/rejected": -7.546317100524902, + "step": 306 + }, + { + "epoch": 3.633136094674556, + "grad_norm": 10.588611564681699, + "learning_rate": 3.3046246876070405e-07, + "logits/chosen": -1.1131824254989624, + "logits/rejected": -1.117433786392212, + "logps/chosen": -41.028900146484375, + "logps/rejected": -58.1406135559082, + "loss": 0.0481, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07064366340637207, + "rewards/margins": 6.420350551605225, + "rewards/rejected": -6.490993976593018, + "step": 307 + }, + { + "epoch": 3.6449704142011834, + "grad_norm": 10.357494650059015, + "learning_rate": 3.2923024735517567e-07, + "logits/chosen": -1.2510991096496582, + "logits/rejected": -1.169487476348877, + "logps/chosen": -49.32808303833008, + "logps/rejected": -77.26386260986328, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1415975093841553, + "rewards/margins": 8.103403091430664, + "rewards/rejected": -9.245000839233398, + "step": 308 + }, + { + "epoch": 3.6568047337278107, + "grad_norm": 10.469013609843655, + "learning_rate": 3.279958824870934e-07, + "logits/chosen": -0.921501100063324, + "logits/rejected": -0.8510603308677673, + "logps/chosen": -37.00920486450195, + "logps/rejected": -50.785682678222656, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2592521011829376, + "rewards/margins": 3.4533708095550537, + "rewards/rejected": -3.1941187381744385, + "step": 309 + }, + { + "epoch": 3.668639053254438, + "grad_norm": 8.143849115083505, + "learning_rate": 3.2675940755045713e-07, + "logits/chosen": -1.1272180080413818, + "logits/rejected": -1.1566120386123657, + "logps/chosen": -41.058563232421875, + "logps/rejected": -59.50581741333008, + "loss": 0.0535, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3426307439804077, + "rewards/margins": 5.675926685333252, + "rewards/rejected": -6.018557548522949, + "step": 310 + }, + { + "epoch": 3.6804733727810652, + "grad_norm": 7.797821676664401, + "learning_rate": 3.2552085599635167e-07, + "logits/chosen": -1.2833009958267212, + "logits/rejected": -1.1694531440734863, + "logps/chosen": -32.42615509033203, + "logps/rejected": -64.32304382324219, + "loss": 0.0376, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2963796854019165, + "rewards/margins": 6.591299533843994, + "rewards/rejected": -7.887678623199463, + "step": 311 + }, + { + "epoch": 3.6923076923076925, + "grad_norm": 6.89356963474552, + "learning_rate": 3.242802613320418e-07, + "logits/chosen": -1.393490195274353, + "logits/rejected": -1.3847185373306274, + "logps/chosen": -41.63416290283203, + "logps/rejected": -70.63141632080078, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9141947031021118, + "rewards/margins": 7.590588569641113, + "rewards/rejected": -8.504783630371094, + "step": 312 + }, + { + "epoch": 3.7041420118343193, + "grad_norm": 11.222653962104593, + "learning_rate": 3.2303765712006585e-07, + "logits/chosen": -1.1832377910614014, + "logits/rejected": -1.1547398567199707, + "logps/chosen": -40.68590545654297, + "logps/rejected": -70.75530242919922, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7116194367408752, + "rewards/margins": 5.869533538818359, + "rewards/rejected": -6.58115291595459, + "step": 313 + }, + { + "epoch": 3.7159763313609466, + "grad_norm": 6.708240807361137, + "learning_rate": 3.217930769773275e-07, + "logits/chosen": -1.2435672283172607, + "logits/rejected": -1.035918951034546, + "logps/chosen": -50.29835891723633, + "logps/rejected": -74.53111267089844, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17033040523529053, + "rewards/margins": 6.697113513946533, + "rewards/rejected": -6.867444038391113, + "step": 314 + }, + { + "epoch": 3.727810650887574, + "grad_norm": 9.42387101375972, + "learning_rate": 3.2054655457418647e-07, + "logits/chosen": -1.0515731573104858, + "logits/rejected": -1.0346689224243164, + "logps/chosen": -35.189369201660156, + "logps/rejected": -55.9654655456543, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5117961168289185, + "rewards/margins": 5.10178804397583, + "rewards/rejected": -5.613584041595459, + "step": 315 + }, + { + "epoch": 3.739644970414201, + "grad_norm": 6.516712675879623, + "learning_rate": 3.1929812363354764e-07, + "logits/chosen": -1.3077701330184937, + "logits/rejected": -1.1945271492004395, + "logps/chosen": -35.00279235839844, + "logps/rejected": -54.25492477416992, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9486129283905029, + "rewards/margins": 5.082883834838867, + "rewards/rejected": -6.031496524810791, + "step": 316 + }, + { + "epoch": 3.7514792899408285, + "grad_norm": 8.087941236249774, + "learning_rate": 3.1804781792994867e-07, + "logits/chosen": -1.1108276844024658, + "logits/rejected": -1.2471861839294434, + "logps/chosen": -41.833309173583984, + "logps/rejected": -61.84939193725586, + "loss": 0.0482, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.742074728012085, + "rewards/margins": 7.079897403717041, + "rewards/rejected": -8.821971893310547, + "step": 317 + }, + { + "epoch": 3.7633136094674557, + "grad_norm": 10.801590352605164, + "learning_rate": 3.167956712886463e-07, + "logits/chosen": -1.373018503189087, + "logits/rejected": -1.1945630311965942, + "logps/chosen": -30.055692672729492, + "logps/rejected": -56.957427978515625, + "loss": 0.053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4738597273826599, + "rewards/margins": 5.641894817352295, + "rewards/rejected": -6.115754127502441, + "step": 318 + }, + { + "epoch": 3.775147928994083, + "grad_norm": 10.24310089782514, + "learning_rate": 3.155417175847011e-07, + "logits/chosen": -1.0630590915679932, + "logits/rejected": -1.0347498655319214, + "logps/chosen": -32.52122497558594, + "logps/rejected": -51.32162857055664, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5487163066864014, + "rewards/margins": 4.911801815032959, + "rewards/rejected": -5.460517883300781, + "step": 319 + }, + { + "epoch": 3.78698224852071, + "grad_norm": 6.870366909127477, + "learning_rate": 3.142859907420615e-07, + "logits/chosen": -1.1960430145263672, + "logits/rejected": -1.1303733587265015, + "logps/chosen": -41.18871307373047, + "logps/rejected": -58.43561553955078, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28152644634246826, + "rewards/margins": 5.013343811035156, + "rewards/rejected": -5.294870376586914, + "step": 320 + }, + { + "epoch": 3.798816568047337, + "grad_norm": 8.050195637302277, + "learning_rate": 3.1302852473264537e-07, + "logits/chosen": -0.8659465312957764, + "logits/rejected": -0.8467557430267334, + "logps/chosen": -33.895164489746094, + "logps/rejected": -50.80845642089844, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08278264105319977, + "rewards/margins": 4.752477169036865, + "rewards/rejected": -4.669694900512695, + "step": 321 + }, + { + "epoch": 3.8106508875739644, + "grad_norm": 8.069576088623236, + "learning_rate": 3.117693535754213e-07, + "logits/chosen": -0.9457200765609741, + "logits/rejected": -1.0279572010040283, + "logps/chosen": -37.87699890136719, + "logps/rejected": -58.80052185058594, + "loss": 0.0452, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5841734409332275, + "rewards/margins": 5.943948268890381, + "rewards/rejected": -7.528121471405029, + "step": 322 + }, + { + "epoch": 3.8224852071005917, + "grad_norm": 9.494312396980359, + "learning_rate": 3.105085113354885e-07, + "logits/chosen": -1.0865830183029175, + "logits/rejected": -1.06438410282135, + "logps/chosen": -34.925296783447266, + "logps/rejected": -50.69199752807617, + "loss": 0.0619, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6583223342895508, + "rewards/margins": 5.800050735473633, + "rewards/rejected": -6.458373069763184, + "step": 323 + }, + { + "epoch": 3.834319526627219, + "grad_norm": 10.61028311721643, + "learning_rate": 3.092460321231547e-07, + "logits/chosen": -1.1025853157043457, + "logits/rejected": -1.3029682636260986, + "logps/chosen": -45.37340545654297, + "logps/rejected": -57.43135452270508, + "loss": 0.0482, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6712946891784668, + "rewards/margins": 6.542860984802246, + "rewards/rejected": -8.214155197143555, + "step": 324 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 9.127446196094581, + "learning_rate": 3.079819500930138e-07, + "logits/chosen": -1.025221824645996, + "logits/rejected": -1.021698236465454, + "logps/chosen": -35.102596282958984, + "logps/rejected": -62.06328582763672, + "loss": 0.0451, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45298677682876587, + "rewards/margins": 5.769662857055664, + "rewards/rejected": -6.222650051116943, + "step": 325 + }, + { + "epoch": 3.8579881656804735, + "grad_norm": 12.29389361531357, + "learning_rate": 3.0671629944302164e-07, + "logits/chosen": -1.0510854721069336, + "logits/rejected": -0.951848566532135, + "logps/chosen": -36.842681884765625, + "logps/rejected": -58.03953552246094, + "loss": 0.0686, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3583628535270691, + "rewards/margins": 6.295431613922119, + "rewards/rejected": -6.653794288635254, + "step": 326 + }, + { + "epoch": 3.8698224852071004, + "grad_norm": 6.750607959887297, + "learning_rate": 3.054491144135707e-07, + "logits/chosen": -1.186820387840271, + "logits/rejected": -1.2949779033660889, + "logps/chosen": -38.38534164428711, + "logps/rejected": -59.20850372314453, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5839760303497314, + "rewards/margins": 7.092982292175293, + "rewards/rejected": -7.6769585609436035, + "step": 327 + }, + { + "epoch": 3.8816568047337277, + "grad_norm": 11.006787084671119, + "learning_rate": 3.0418042928656415e-07, + "logits/chosen": -0.9911553859710693, + "logits/rejected": -1.0456457138061523, + "logps/chosen": -46.48586654663086, + "logps/rejected": -65.22984313964844, + "loss": 0.0537, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38870108127593994, + "rewards/margins": 6.2278571128845215, + "rewards/rejected": -6.616558074951172, + "step": 328 + }, + { + "epoch": 3.893491124260355, + "grad_norm": 10.159592627032264, + "learning_rate": 3.029102783844879e-07, + "logits/chosen": -0.9870089292526245, + "logits/rejected": -1.0301330089569092, + "logps/chosen": -43.52060317993164, + "logps/rejected": -66.05502319335938, + "loss": 0.0515, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.344921350479126, + "rewards/margins": 7.7191667556762695, + "rewards/rejected": -9.064088821411133, + "step": 329 + }, + { + "epoch": 3.905325443786982, + "grad_norm": 9.8965394155921, + "learning_rate": 3.016386960694827e-07, + "logits/chosen": -1.0783404111862183, + "logits/rejected": -1.1657443046569824, + "logps/chosen": -35.86804962158203, + "logps/rejected": -49.75996017456055, + "loss": 0.0476, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5152837038040161, + "rewards/margins": 5.677330017089844, + "rewards/rejected": -6.1926140785217285, + "step": 330 + }, + { + "epoch": 3.9171597633136095, + "grad_norm": 7.647123014651004, + "learning_rate": 3.003657167424139e-07, + "logits/chosen": -1.116632103919983, + "logits/rejected": -1.007010579109192, + "logps/chosen": -27.30445671081543, + "logps/rejected": -46.01488494873047, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.475461483001709, + "rewards/margins": 4.353217601776123, + "rewards/rejected": -4.828679084777832, + "step": 331 + }, + { + "epoch": 3.9289940828402368, + "grad_norm": 7.370007475987176, + "learning_rate": 2.990913748419411e-07, + "logits/chosen": -1.0164567232131958, + "logits/rejected": -1.099671483039856, + "logps/chosen": -42.57475662231445, + "logps/rejected": -57.868221282958984, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8491787910461426, + "rewards/margins": 5.017266750335693, + "rewards/rejected": -5.866445541381836, + "step": 332 + }, + { + "epoch": 3.940828402366864, + "grad_norm": 11.13700718226015, + "learning_rate": 2.978157048435863e-07, + "logits/chosen": -1.2751820087432861, + "logits/rejected": -1.3596327304840088, + "logps/chosen": -44.9207878112793, + "logps/rejected": -69.44329833984375, + "loss": 0.0585, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0238862037658691, + "rewards/margins": 6.633727073669434, + "rewards/rejected": -7.657613754272461, + "step": 333 + }, + { + "epoch": 3.952662721893491, + "grad_norm": 9.53101226669104, + "learning_rate": 2.9653874125880167e-07, + "logits/chosen": -1.184804081916809, + "logits/rejected": -1.2233521938323975, + "logps/chosen": -33.34724807739258, + "logps/rejected": -57.77789306640625, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6291203498840332, + "rewards/margins": 6.075474739074707, + "rewards/rejected": -6.704594612121582, + "step": 334 + }, + { + "epoch": 3.9644970414201186, + "grad_norm": 7.607623067226138, + "learning_rate": 2.9526051863403517e-07, + "logits/chosen": -0.9798343181610107, + "logits/rejected": -1.0512641668319702, + "logps/chosen": -33.80629348754883, + "logps/rejected": -59.37635803222656, + "loss": 0.0379, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18926861882209778, + "rewards/margins": 6.627708435058594, + "rewards/rejected": -6.438440322875977, + "step": 335 + }, + { + "epoch": 3.9763313609467454, + "grad_norm": 8.807254950481648, + "learning_rate": 2.9398107154979634e-07, + "logits/chosen": -1.0378540754318237, + "logits/rejected": -1.236304759979248, + "logps/chosen": -53.999755859375, + "logps/rejected": -71.97964477539062, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.30962872505188, + "rewards/margins": 8.442242622375488, + "rewards/rejected": -10.751871109008789, + "step": 336 + }, + { + "epoch": 3.9881656804733727, + "grad_norm": 7.885596351737529, + "learning_rate": 2.9270043461972097e-07, + "logits/chosen": -1.066207766532898, + "logits/rejected": -1.1475964784622192, + "logps/chosen": -51.98231506347656, + "logps/rejected": -71.3375244140625, + "loss": 0.0408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8781924247741699, + "rewards/margins": 7.75531005859375, + "rewards/rejected": -8.633502006530762, + "step": 337 + }, + { + "epoch": 4.0, + "grad_norm": 9.210155965565953, + "learning_rate": 2.9141864248963427e-07, + "logits/chosen": -1.214263916015625, + "logits/rejected": -1.1584827899932861, + "logps/chosen": -45.34033966064453, + "logps/rejected": -73.4906005859375, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.079200029373169, + "rewards/margins": 5.90239143371582, + "rewards/rejected": -6.98159122467041, + "step": 338 + }, + { + "epoch": 4.011834319526627, + "grad_norm": 6.508446991164881, + "learning_rate": 2.9013572983661375e-07, + "logits/chosen": -1.2331136465072632, + "logits/rejected": -1.3492088317871094, + "logps/chosen": -40.93061828613281, + "logps/rejected": -59.18800354003906, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27319905161857605, + "rewards/margins": 7.39341926574707, + "rewards/rejected": -7.666618347167969, + "step": 339 + }, + { + "epoch": 4.023668639053255, + "grad_norm": 7.338060837553236, + "learning_rate": 2.8885173136805125e-07, + "logits/chosen": -1.226414442062378, + "logits/rejected": -1.2364081144332886, + "logps/chosen": -43.55427551269531, + "logps/rejected": -65.36073303222656, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8232107162475586, + "rewards/margins": 7.595679759979248, + "rewards/rejected": -9.418889999389648, + "step": 340 + }, + { + "epoch": 4.035502958579881, + "grad_norm": 4.628023712795517, + "learning_rate": 2.8756668182071357e-07, + "logits/chosen": -1.180713176727295, + "logits/rejected": -1.1071914434432983, + "logps/chosen": -41.24338150024414, + "logps/rejected": -62.83121109008789, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6221990585327148, + "rewards/margins": 6.7731523513793945, + "rewards/rejected": -8.39535140991211, + "step": 341 + }, + { + "epoch": 4.047337278106509, + "grad_norm": 6.104842751233805, + "learning_rate": 2.862806159598032e-07, + "logits/chosen": -0.9698923826217651, + "logits/rejected": -0.9841170310974121, + "logps/chosen": -39.91763687133789, + "logps/rejected": -61.13340759277344, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4580131769180298, + "rewards/margins": 5.857044219970703, + "rewards/rejected": -7.315057754516602, + "step": 342 + }, + { + "epoch": 4.059171597633136, + "grad_norm": 6.103306838069507, + "learning_rate": 2.8499356857801744e-07, + "logits/chosen": -1.0496821403503418, + "logits/rejected": -1.0485060214996338, + "logps/chosen": -36.92170715332031, + "logps/rejected": -52.14490509033203, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.119423747062683, + "rewards/margins": 5.038466453552246, + "rewards/rejected": -6.1578898429870605, + "step": 343 + }, + { + "epoch": 4.071005917159764, + "grad_norm": 5.23129093382033, + "learning_rate": 2.837055744946072e-07, + "logits/chosen": -1.3063690662384033, + "logits/rejected": -1.1161246299743652, + "logps/chosen": -37.174598693847656, + "logps/rejected": -70.67867279052734, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7968075275421143, + "rewards/margins": 8.80527114868164, + "rewards/rejected": -10.602078437805176, + "step": 344 + }, + { + "epoch": 4.0828402366863905, + "grad_norm": 5.119583740628856, + "learning_rate": 2.8241666855443526e-07, + "logits/chosen": -1.1465834379196167, + "logits/rejected": -0.9921385049819946, + "logps/chosen": -43.1416015625, + "logps/rejected": -62.68724822998047, + "loss": 0.0263, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9080947637557983, + "rewards/margins": 6.91895866394043, + "rewards/rejected": -7.827053070068359, + "step": 345 + }, + { + "epoch": 4.094674556213017, + "grad_norm": 7.489855061260909, + "learning_rate": 2.811268856270332e-07, + "logits/chosen": -1.083118200302124, + "logits/rejected": -1.002679467201233, + "logps/chosen": -38.36323547363281, + "logps/rejected": -67.2934799194336, + "loss": 0.0394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.746131181716919, + "rewards/margins": 6.556519508361816, + "rewards/rejected": -7.302650451660156, + "step": 346 + }, + { + "epoch": 4.106508875739645, + "grad_norm": 5.683245039375868, + "learning_rate": 2.798362606056583e-07, + "logits/chosen": -1.0683708190917969, + "logits/rejected": -1.0461918115615845, + "logps/chosen": -44.84065628051758, + "logps/rejected": -65.26290130615234, + "loss": 0.024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4837861061096191, + "rewards/margins": 6.347103118896484, + "rewards/rejected": -7.8308892250061035, + "step": 347 + }, + { + "epoch": 4.118343195266272, + "grad_norm": 6.990533138399409, + "learning_rate": 2.7854482840634965e-07, + "logits/chosen": -1.2399723529815674, + "logits/rejected": -1.242792010307312, + "logps/chosen": -35.35354995727539, + "logps/rejected": -50.29901123046875, + "loss": 0.0362, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.814429759979248, + "rewards/margins": 4.70147180557251, + "rewards/rejected": -5.5159010887146, + "step": 348 + }, + { + "epoch": 4.1301775147929, + "grad_norm": 6.319829009469765, + "learning_rate": 2.772526239669831e-07, + "logits/chosen": -1.2442165613174438, + "logits/rejected": -1.2576425075531006, + "logps/chosen": -41.020626068115234, + "logps/rejected": -71.2080078125, + "loss": 0.0333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1832388937473297, + "rewards/margins": 6.497980117797852, + "rewards/rejected": -6.681219100952148, + "step": 349 + }, + { + "epoch": 4.1420118343195265, + "grad_norm": 7.137478103836617, + "learning_rate": 2.759596822463267e-07, + "logits/chosen": -1.0815571546554565, + "logits/rejected": -1.0727592706680298, + "logps/chosen": -45.82701110839844, + "logps/rejected": -74.0084228515625, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5913039445877075, + "rewards/margins": 7.610474586486816, + "rewards/rejected": -9.201778411865234, + "step": 350 + }, + { + "epoch": 4.153846153846154, + "grad_norm": 6.127060479740991, + "learning_rate": 2.746660382230944e-07, + "logits/chosen": -1.1690300703048706, + "logits/rejected": -1.1895644664764404, + "logps/chosen": -31.055179595947266, + "logps/rejected": -52.40230178833008, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1996252536773682, + "rewards/margins": 6.068203926086426, + "rewards/rejected": -7.267828941345215, + "step": 351 + }, + { + "epoch": 4.165680473372781, + "grad_norm": 6.941829346693308, + "learning_rate": 2.73371726895e-07, + "logits/chosen": -1.4027228355407715, + "logits/rejected": -1.4094792604446411, + "logps/chosen": -36.16400909423828, + "logps/rejected": -54.41309356689453, + "loss": 0.0423, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0032942295074463, + "rewards/margins": 5.790349960327148, + "rewards/rejected": -6.793643951416016, + "step": 352 + }, + { + "epoch": 4.177514792899408, + "grad_norm": 5.664169120525594, + "learning_rate": 2.7207678327781036e-07, + "logits/chosen": -1.0156735181808472, + "logits/rejected": -1.0846226215362549, + "logps/chosen": -41.97883605957031, + "logps/rejected": -58.83436584472656, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6752943396568298, + "rewards/margins": 6.577206611633301, + "rewards/rejected": -7.252501487731934, + "step": 353 + }, + { + "epoch": 4.189349112426036, + "grad_norm": 4.278910321706378, + "learning_rate": 2.7078124240439793e-07, + "logits/chosen": -1.4016739130020142, + "logits/rejected": -1.3881572484970093, + "logps/chosen": -40.908294677734375, + "logps/rejected": -81.16354370117188, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1897321939468384, + "rewards/margins": 9.733206748962402, + "rewards/rejected": -10.92293930053711, + "step": 354 + }, + { + "epoch": 4.201183431952662, + "grad_norm": 5.662906607186885, + "learning_rate": 2.6948513932379307e-07, + "logits/chosen": -1.127701997756958, + "logits/rejected": -1.2144131660461426, + "logps/chosen": -40.303466796875, + "logps/rejected": -53.539791107177734, + "loss": 0.031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6169775724411011, + "rewards/margins": 5.929780006408691, + "rewards/rejected": -6.54675817489624, + "step": 355 + }, + { + "epoch": 4.21301775147929, + "grad_norm": 5.869401589278453, + "learning_rate": 2.68188509100236e-07, + "logits/chosen": -1.1310701370239258, + "logits/rejected": -0.9885028004646301, + "logps/chosen": -43.84556579589844, + "logps/rejected": -67.79801940917969, + "loss": 0.0368, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5635640621185303, + "rewards/margins": 6.635869979858398, + "rewards/rejected": -7.199433326721191, + "step": 356 + }, + { + "epoch": 4.224852071005917, + "grad_norm": 6.4938093501862335, + "learning_rate": 2.668913868122279e-07, + "logits/chosen": -1.145686388015747, + "logits/rejected": -1.1048319339752197, + "logps/chosen": -38.25225830078125, + "logps/rejected": -84.31645965576172, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9592878222465515, + "rewards/margins": 8.200289726257324, + "rewards/rejected": -9.159577369689941, + "step": 357 + }, + { + "epoch": 4.236686390532545, + "grad_norm": 5.3318941750637805, + "learning_rate": 2.6559380755158206e-07, + "logits/chosen": -1.306983470916748, + "logits/rejected": -1.3083993196487427, + "logps/chosen": -32.1518440246582, + "logps/rejected": -57.46953582763672, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3908557891845703, + "rewards/margins": 6.340449810028076, + "rewards/rejected": -7.731306076049805, + "step": 358 + }, + { + "epoch": 4.2485207100591715, + "grad_norm": 4.583272241534079, + "learning_rate": 2.642958064224747e-07, + "logits/chosen": -1.1370916366577148, + "logits/rejected": -1.223602056503296, + "logps/chosen": -45.049015045166016, + "logps/rejected": -67.99398803710938, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5301861763000488, + "rewards/margins": 7.891375541687012, + "rewards/rejected": -8.421562194824219, + "step": 359 + }, + { + "epoch": 4.260355029585799, + "grad_norm": 5.8869583103527505, + "learning_rate": 2.629974185404951e-07, + "logits/chosen": -1.142185926437378, + "logits/rejected": -1.0663723945617676, + "logps/chosen": -54.66786193847656, + "logps/rejected": -72.61735534667969, + "loss": 0.0292, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0223376750946045, + "rewards/margins": 5.790317535400391, + "rewards/rejected": -7.812655448913574, + "step": 360 + }, + { + "epoch": 4.272189349112426, + "grad_norm": 5.434403797417726, + "learning_rate": 2.616986790316952e-07, + "logits/chosen": -1.084727168083191, + "logits/rejected": -1.000258207321167, + "logps/chosen": -41.456581115722656, + "logps/rejected": -65.51860046386719, + "loss": 0.0308, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9817674160003662, + "rewards/margins": 6.707640647888184, + "rewards/rejected": -7.689408302307129, + "step": 361 + }, + { + "epoch": 4.284023668639053, + "grad_norm": 7.370310021015268, + "learning_rate": 2.603996230316402e-07, + "logits/chosen": -0.9062331318855286, + "logits/rejected": -1.0064326524734497, + "logps/chosen": -37.973838806152344, + "logps/rejected": -57.57244873046875, + "loss": 0.0387, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2495323419570923, + "rewards/margins": 6.539309501647949, + "rewards/rejected": -7.78884220123291, + "step": 362 + }, + { + "epoch": 4.295857988165681, + "grad_norm": 4.712912974501213, + "learning_rate": 2.5910028568445716e-07, + "logits/chosen": -1.0892990827560425, + "logits/rejected": -0.9764453172683716, + "logps/chosen": -36.994937896728516, + "logps/rejected": -60.98323059082031, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5305968523025513, + "rewards/margins": 6.173823356628418, + "rewards/rejected": -6.704420566558838, + "step": 363 + }, + { + "epoch": 4.3076923076923075, + "grad_norm": 8.269082594101617, + "learning_rate": 2.5780070214188474e-07, + "logits/chosen": -1.2240197658538818, + "logits/rejected": -1.2403483390808105, + "logps/chosen": -44.62925720214844, + "logps/rejected": -68.63853454589844, + "loss": 0.0348, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3063997030258179, + "rewards/margins": 9.024595260620117, + "rewards/rejected": -10.330994606018066, + "step": 364 + }, + { + "epoch": 4.319526627218935, + "grad_norm": 7.007032072350081, + "learning_rate": 2.5650090756232226e-07, + "logits/chosen": -0.9315057992935181, + "logits/rejected": -0.9429672360420227, + "logps/chosen": -41.07881546020508, + "logps/rejected": -55.53737258911133, + "loss": 0.0291, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0261346101760864, + "rewards/margins": 5.170999526977539, + "rewards/rejected": -6.197134017944336, + "step": 365 + }, + { + "epoch": 4.331360946745562, + "grad_norm": 6.395251337450572, + "learning_rate": 2.552009371098778e-07, + "logits/chosen": -1.404557466506958, + "logits/rejected": -1.3076186180114746, + "logps/chosen": -31.61805534362793, + "logps/rejected": -48.43040084838867, + "loss": 0.0315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18829268217086792, + "rewards/margins": 5.8097405433654785, + "rewards/rejected": -5.998033046722412, + "step": 366 + }, + { + "epoch": 4.34319526627219, + "grad_norm": 3.8677234721220275, + "learning_rate": 2.5390082595341816e-07, + "logits/chosen": -1.1186158657073975, + "logits/rejected": -1.1845828294754028, + "logps/chosen": -30.134925842285156, + "logps/rejected": -55.77180480957031, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2170889675617218, + "rewards/margins": 6.981093883514404, + "rewards/rejected": -6.764005661010742, + "step": 367 + }, + { + "epoch": 4.355029585798817, + "grad_norm": 6.159683736078311, + "learning_rate": 2.5260060926561604e-07, + "logits/chosen": -1.056211233139038, + "logits/rejected": -1.0722953081130981, + "logps/chosen": -38.946746826171875, + "logps/rejected": -69.67351531982422, + "loss": 0.0319, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.082647681236267, + "rewards/margins": 6.603187561035156, + "rewards/rejected": -7.685835361480713, + "step": 368 + }, + { + "epoch": 4.366863905325443, + "grad_norm": 6.5279554127734745, + "learning_rate": 2.5130032222199954e-07, + "logits/chosen": -1.2777047157287598, + "logits/rejected": -1.230380654335022, + "logps/chosen": -44.25871658325195, + "logps/rejected": -68.12265014648438, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8238017559051514, + "rewards/margins": 7.426830291748047, + "rewards/rejected": -9.250631332397461, + "step": 369 + }, + { + "epoch": 4.378698224852071, + "grad_norm": 5.951315298701114, + "learning_rate": 2.5e-07, + "logits/chosen": -1.1897178888320923, + "logits/rejected": -1.1760835647583008, + "logps/chosen": -50.80073928833008, + "logps/rejected": -85.21668243408203, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2594873905181885, + "rewards/margins": 10.383180618286133, + "rewards/rejected": -11.642667770385742, + "step": 370 + }, + { + "epoch": 4.390532544378698, + "grad_norm": 6.15592831885902, + "learning_rate": 2.4869967777800055e-07, + "logits/chosen": -0.6888167858123779, + "logits/rejected": -0.6404827833175659, + "logps/chosen": -41.95843505859375, + "logps/rejected": -54.23292922973633, + "loss": 0.0292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.676899790763855, + "rewards/margins": 4.1613569259643555, + "rewards/rejected": -4.8382568359375, + "step": 371 + }, + { + "epoch": 4.402366863905326, + "grad_norm": 6.891198413290914, + "learning_rate": 2.4739939073438393e-07, + "logits/chosen": -0.82053142786026, + "logits/rejected": -0.8729037046432495, + "logps/chosen": -42.464149475097656, + "logps/rejected": -56.32053756713867, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1332708597183228, + "rewards/margins": 5.776480674743652, + "rewards/rejected": -6.909751892089844, + "step": 372 + }, + { + "epoch": 4.414201183431953, + "grad_norm": 5.861758579748894, + "learning_rate": 2.460991740465819e-07, + "logits/chosen": -1.1168274879455566, + "logits/rejected": -1.2398712635040283, + "logps/chosen": -42.28007507324219, + "logps/rejected": -57.90584945678711, + "loss": 0.026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5763722658157349, + "rewards/margins": 5.462160110473633, + "rewards/rejected": -7.038532733917236, + "step": 373 + }, + { + "epoch": 4.42603550295858, + "grad_norm": 6.486927362087198, + "learning_rate": 2.4479906289012216e-07, + "logits/chosen": -1.2565290927886963, + "logits/rejected": -1.167330026626587, + "logps/chosen": -33.31645584106445, + "logps/rejected": -67.92594909667969, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6366786956787109, + "rewards/margins": 7.325068473815918, + "rewards/rejected": -7.961746692657471, + "step": 374 + }, + { + "epoch": 4.437869822485207, + "grad_norm": 7.945577732784187, + "learning_rate": 2.434990924376778e-07, + "logits/chosen": -1.3346067667007446, + "logits/rejected": -1.3618366718292236, + "logps/chosen": -40.330810546875, + "logps/rejected": -75.98135375976562, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.036665678024292, + "rewards/margins": 8.026451110839844, + "rewards/rejected": -9.063117027282715, + "step": 375 + }, + { + "epoch": 4.449704142011834, + "grad_norm": 4.958938820398859, + "learning_rate": 2.421992978581152e-07, + "logits/chosen": -0.9798704981803894, + "logits/rejected": -0.9934732913970947, + "logps/chosen": -55.855712890625, + "logps/rejected": -73.17900085449219, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.960662841796875, + "rewards/margins": 6.544261932373047, + "rewards/rejected": -9.504924774169922, + "step": 376 + }, + { + "epoch": 4.461538461538462, + "grad_norm": 7.067788432308369, + "learning_rate": 2.4089971431554287e-07, + "logits/chosen": -1.0426850318908691, + "logits/rejected": -1.1020509004592896, + "logps/chosen": -38.878639221191406, + "logps/rejected": -50.67585754394531, + "loss": 0.038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.268679678440094, + "rewards/margins": 6.866507053375244, + "rewards/rejected": -7.135187149047852, + "step": 377 + }, + { + "epoch": 4.4733727810650885, + "grad_norm": 6.288836568067266, + "learning_rate": 2.3960037696835987e-07, + "logits/chosen": -1.304239273071289, + "logits/rejected": -1.282547950744629, + "logps/chosen": -39.218841552734375, + "logps/rejected": -60.83658218383789, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.095663070678711, + "rewards/margins": 7.307448863983154, + "rewards/rejected": -8.403112411499023, + "step": 378 + }, + { + "epoch": 4.485207100591716, + "grad_norm": 6.123368205316238, + "learning_rate": 2.3830132096830475e-07, + "logits/chosen": -1.1514697074890137, + "logits/rejected": -1.2038421630859375, + "logps/chosen": -38.228721618652344, + "logps/rejected": -63.91864776611328, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9770803451538086, + "rewards/margins": 7.351076602935791, + "rewards/rejected": -8.328157424926758, + "step": 379 + }, + { + "epoch": 4.497041420118343, + "grad_norm": 6.073409726703173, + "learning_rate": 2.3700258145950493e-07, + "logits/chosen": -1.1492507457733154, + "logits/rejected": -1.1283336877822876, + "logps/chosen": -34.70807647705078, + "logps/rejected": -53.41638946533203, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4615098237991333, + "rewards/margins": 5.6219682693481445, + "rewards/rejected": -7.083477973937988, + "step": 380 + }, + { + "epoch": 4.508875739644971, + "grad_norm": 6.859789156730043, + "learning_rate": 2.3570419357752518e-07, + "logits/chosen": -1.1793534755706787, + "logits/rejected": -1.1703987121582031, + "logps/chosen": -35.941184997558594, + "logps/rejected": -62.35126495361328, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.987781286239624, + "rewards/margins": 7.1669921875, + "rewards/rejected": -9.154773712158203, + "step": 381 + }, + { + "epoch": 4.520710059171598, + "grad_norm": 5.867928411886104, + "learning_rate": 2.3440619244841794e-07, + "logits/chosen": -1.33339262008667, + "logits/rejected": -1.2492295503616333, + "logps/chosen": -34.28276062011719, + "logps/rejected": -59.37235641479492, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.813497006893158, + "rewards/margins": 6.642629623413086, + "rewards/rejected": -7.456126689910889, + "step": 382 + }, + { + "epoch": 4.5325443786982245, + "grad_norm": 7.114872273799716, + "learning_rate": 2.3310861318777214e-07, + "logits/chosen": -0.9742549657821655, + "logits/rejected": -0.9942373037338257, + "logps/chosen": -35.96149826049805, + "logps/rejected": -58.944549560546875, + "loss": 0.0363, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3453497886657715, + "rewards/margins": 6.538041114807129, + "rewards/rejected": -7.883390426635742, + "step": 383 + }, + { + "epoch": 4.544378698224852, + "grad_norm": 6.454893576265482, + "learning_rate": 2.3181149089976404e-07, + "logits/chosen": -0.9551500082015991, + "logits/rejected": -0.8459126949310303, + "logps/chosen": -35.49050521850586, + "logps/rejected": -59.616416931152344, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7028827667236328, + "rewards/margins": 5.758362293243408, + "rewards/rejected": -7.461245536804199, + "step": 384 + }, + { + "epoch": 4.556213017751479, + "grad_norm": 4.604917778075366, + "learning_rate": 2.30514860676207e-07, + "logits/chosen": -0.9264880418777466, + "logits/rejected": -0.8983692526817322, + "logps/chosen": -37.952232360839844, + "logps/rejected": -66.89598846435547, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.458532452583313, + "rewards/margins": 7.424208641052246, + "rewards/rejected": -8.88274097442627, + "step": 385 + }, + { + "epoch": 4.568047337278107, + "grad_norm": 5.972753790143483, + "learning_rate": 2.2921875759560207e-07, + "logits/chosen": -1.2438300848007202, + "logits/rejected": -1.4885196685791016, + "logps/chosen": -48.01713943481445, + "logps/rejected": -50.086448669433594, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3077518939971924, + "rewards/margins": 5.58108377456665, + "rewards/rejected": -6.888835906982422, + "step": 386 + }, + { + "epoch": 4.579881656804734, + "grad_norm": 5.6047301713387485, + "learning_rate": 2.2792321672218967e-07, + "logits/chosen": -1.1834698915481567, + "logits/rejected": -1.1805692911148071, + "logps/chosen": -37.24559020996094, + "logps/rejected": -68.1692886352539, + "loss": 0.024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3298698663711548, + "rewards/margins": 8.419723510742188, + "rewards/rejected": -9.749593734741211, + "step": 387 + }, + { + "epoch": 4.591715976331361, + "grad_norm": 7.525291987243171, + "learning_rate": 2.2662827310499995e-07, + "logits/chosen": -1.016861081123352, + "logits/rejected": -1.143229603767395, + "logps/chosen": -43.143497467041016, + "logps/rejected": -63.837677001953125, + "loss": 0.043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9594184756278992, + "rewards/margins": 6.90932559967041, + "rewards/rejected": -7.868744850158691, + "step": 388 + }, + { + "epoch": 4.603550295857988, + "grad_norm": 7.015133341827511, + "learning_rate": 2.2533396177690562e-07, + "logits/chosen": -1.2056879997253418, + "logits/rejected": -1.3042309284210205, + "logps/chosen": -46.522300720214844, + "logps/rejected": -61.494346618652344, + "loss": 0.0287, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9979819059371948, + "rewards/margins": 6.070592880249023, + "rewards/rejected": -7.068574905395508, + "step": 389 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 5.78008137850188, + "learning_rate": 2.2404031775367332e-07, + "logits/chosen": -1.2291995286941528, + "logits/rejected": -1.314921259880066, + "logps/chosen": -45.853309631347656, + "logps/rejected": -55.867095947265625, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3121535778045654, + "rewards/margins": 5.488016605377197, + "rewards/rejected": -6.800169944763184, + "step": 390 + }, + { + "epoch": 4.627218934911243, + "grad_norm": 6.252483858832655, + "learning_rate": 2.227473760330169e-07, + "logits/chosen": -1.239790439605713, + "logits/rejected": -1.1441867351531982, + "logps/chosen": -37.55449676513672, + "logps/rejected": -68.65978240966797, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0223238468170166, + "rewards/margins": 8.488706588745117, + "rewards/rejected": -9.511030197143555, + "step": 391 + }, + { + "epoch": 4.6390532544378695, + "grad_norm": 5.571831888079325, + "learning_rate": 2.2145517159365043e-07, + "logits/chosen": -1.147141456604004, + "logits/rejected": -1.0945308208465576, + "logps/chosen": -40.555084228515625, + "logps/rejected": -71.9697265625, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.045267105102539, + "rewards/margins": 7.398416519165039, + "rewards/rejected": -9.443683624267578, + "step": 392 + }, + { + "epoch": 4.650887573964497, + "grad_norm": 6.698497149981605, + "learning_rate": 2.2016373939434166e-07, + "logits/chosen": -1.2282354831695557, + "logits/rejected": -1.2330670356750488, + "logps/chosen": -41.47081756591797, + "logps/rejected": -56.92009353637695, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8390868902206421, + "rewards/margins": 5.762375831604004, + "rewards/rejected": -6.601463317871094, + "step": 393 + }, + { + "epoch": 4.662721893491124, + "grad_norm": 5.665101121799137, + "learning_rate": 2.1887311437296684e-07, + "logits/chosen": -1.3237837553024292, + "logits/rejected": -1.4168846607208252, + "logps/chosen": -43.45185470581055, + "logps/rejected": -69.13044738769531, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4442262649536133, + "rewards/margins": 8.553362846374512, + "rewards/rejected": -9.997589111328125, + "step": 394 + }, + { + "epoch": 4.674556213017752, + "grad_norm": 4.857938949164866, + "learning_rate": 2.175833314455647e-07, + "logits/chosen": -1.044704794883728, + "logits/rejected": -1.047666072845459, + "logps/chosen": -57.778099060058594, + "logps/rejected": -86.46309661865234, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1337398290634155, + "rewards/margins": 7.24013614654541, + "rewards/rejected": -8.373876571655273, + "step": 395 + }, + { + "epoch": 4.686390532544379, + "grad_norm": 4.452615646822525, + "learning_rate": 2.162944255053928e-07, + "logits/chosen": -1.4428414106369019, + "logits/rejected": -1.300477147102356, + "logps/chosen": -43.00053405761719, + "logps/rejected": -78.71292877197266, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2613164186477661, + "rewards/margins": 7.334977149963379, + "rewards/rejected": -8.596292495727539, + "step": 396 + }, + { + "epoch": 4.6982248520710055, + "grad_norm": 4.714776633335646, + "learning_rate": 2.1500643142198264e-07, + "logits/chosen": -1.146743893623352, + "logits/rejected": -1.1370177268981934, + "logps/chosen": -37.42723846435547, + "logps/rejected": -47.88990020751953, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5033150911331177, + "rewards/margins": 4.86285400390625, + "rewards/rejected": -6.366168975830078, + "step": 397 + }, + { + "epoch": 4.710059171597633, + "grad_norm": 6.0816315897712725, + "learning_rate": 2.137193840401968e-07, + "logits/chosen": -1.2402558326721191, + "logits/rejected": -1.2731809616088867, + "logps/chosen": -44.44050598144531, + "logps/rejected": -54.368404388427734, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6547656059265137, + "rewards/margins": 5.59221887588501, + "rewards/rejected": -6.246984481811523, + "step": 398 + }, + { + "epoch": 4.72189349112426, + "grad_norm": 5.278736402112801, + "learning_rate": 2.1243331817928643e-07, + "logits/chosen": -1.2595813274383545, + "logits/rejected": -1.263629674911499, + "logps/chosen": -44.460487365722656, + "logps/rejected": -73.69570922851562, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5480247735977173, + "rewards/margins": 8.522817611694336, + "rewards/rejected": -9.070842742919922, + "step": 399 + }, + { + "epoch": 4.733727810650888, + "grad_norm": 7.040411325600633, + "learning_rate": 2.1114826863194878e-07, + "logits/chosen": -1.3222355842590332, + "logits/rejected": -1.3484004735946655, + "logps/chosen": -34.907257080078125, + "logps/rejected": -60.81079864501953, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18377304077148438, + "rewards/margins": 8.166070938110352, + "rewards/rejected": -8.349843978881836, + "step": 400 + }, + { + "epoch": 4.745562130177515, + "grad_norm": 5.317019025575069, + "learning_rate": 2.0986427016338623e-07, + "logits/chosen": -0.9710571765899658, + "logits/rejected": -0.9310001730918884, + "logps/chosen": -39.45679473876953, + "logps/rejected": -56.64714050292969, + "loss": 0.0241, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3554608225822449, + "rewards/margins": 6.091450214385986, + "rewards/rejected": -6.446911811828613, + "step": 401 + }, + { + "epoch": 4.757396449704142, + "grad_norm": 6.760332535967884, + "learning_rate": 2.0858135751036568e-07, + "logits/chosen": -1.2228119373321533, + "logits/rejected": -1.1839139461517334, + "logps/chosen": -30.775981903076172, + "logps/rejected": -47.54846954345703, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.206694558262825, + "rewards/margins": 5.442242622375488, + "rewards/rejected": -5.23554801940918, + "step": 402 + }, + { + "epoch": 4.769230769230769, + "grad_norm": 5.929464844555724, + "learning_rate": 2.0729956538027904e-07, + "logits/chosen": -1.2554802894592285, + "logits/rejected": -1.1986385583877563, + "logps/chosen": -29.697330474853516, + "logps/rejected": -48.260414123535156, + "loss": 0.0266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7226145267486572, + "rewards/margins": 5.005193710327148, + "rewards/rejected": -5.727808475494385, + "step": 403 + }, + { + "epoch": 4.781065088757396, + "grad_norm": 7.814308902771925, + "learning_rate": 2.060189284502037e-07, + "logits/chosen": -1.1625741720199585, + "logits/rejected": -1.014739751815796, + "logps/chosen": -44.1807861328125, + "logps/rejected": -75.00509643554688, + "loss": 0.0417, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.093876600265503, + "rewards/margins": 7.723724842071533, + "rewards/rejected": -9.817602157592773, + "step": 404 + }, + { + "epoch": 4.792899408284024, + "grad_norm": 5.110756280509404, + "learning_rate": 2.0473948136596486e-07, + "logits/chosen": -1.2573013305664062, + "logits/rejected": -1.1042354106903076, + "logps/chosen": -37.25166320800781, + "logps/rejected": -62.03949737548828, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6967883110046387, + "rewards/margins": 7.865792751312256, + "rewards/rejected": -9.562580108642578, + "step": 405 + }, + { + "epoch": 4.804733727810651, + "grad_norm": 6.7687604973263635, + "learning_rate": 2.0346125874119838e-07, + "logits/chosen": -1.3532781600952148, + "logits/rejected": -1.336191177368164, + "logps/chosen": -39.134239196777344, + "logps/rejected": -60.64654541015625, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8724418878555298, + "rewards/margins": 7.370305061340332, + "rewards/rejected": -8.242748260498047, + "step": 406 + }, + { + "epoch": 4.816568047337278, + "grad_norm": 5.980695574776132, + "learning_rate": 2.0218429515641368e-07, + "logits/chosen": -0.9892873167991638, + "logits/rejected": -0.639751672744751, + "logps/chosen": -29.87440299987793, + "logps/rejected": -68.78720092773438, + "loss": 0.0305, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17198152840137482, + "rewards/margins": 8.037885665893555, + "rewards/rejected": -8.209866523742676, + "step": 407 + }, + { + "epoch": 4.828402366863905, + "grad_norm": 6.2455457286356815, + "learning_rate": 2.0090862515805895e-07, + "logits/chosen": -1.1471354961395264, + "logits/rejected": -1.0667625665664673, + "logps/chosen": -34.445716857910156, + "logps/rejected": -61.40612030029297, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5051982402801514, + "rewards/margins": 6.787008285522461, + "rewards/rejected": -8.292206764221191, + "step": 408 + }, + { + "epoch": 4.840236686390533, + "grad_norm": 6.051206947220886, + "learning_rate": 1.9963428325758613e-07, + "logits/chosen": -1.2832268476486206, + "logits/rejected": -1.2529618740081787, + "logps/chosen": -48.28749465942383, + "logps/rejected": -68.13203430175781, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.748518466949463, + "rewards/margins": 7.8393754959106445, + "rewards/rejected": -9.58789348602295, + "step": 409 + }, + { + "epoch": 4.85207100591716, + "grad_norm": 6.3012658749958765, + "learning_rate": 1.983613039305173e-07, + "logits/chosen": -1.134077787399292, + "logits/rejected": -1.1521306037902832, + "logps/chosen": -49.527854919433594, + "logps/rejected": -67.16100311279297, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6244655847549438, + "rewards/margins": 6.405965805053711, + "rewards/rejected": -8.030430793762207, + "step": 410 + }, + { + "epoch": 4.8639053254437865, + "grad_norm": 4.470631833677289, + "learning_rate": 1.9708972161551213e-07, + "logits/chosen": -1.132239580154419, + "logits/rejected": -1.0069799423217773, + "logps/chosen": -39.385215759277344, + "logps/rejected": -58.923709869384766, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6855692863464355, + "rewards/margins": 6.034358024597168, + "rewards/rejected": -6.7199273109436035, + "step": 411 + }, + { + "epoch": 4.875739644970414, + "grad_norm": 6.376861071339664, + "learning_rate": 1.9581957071343588e-07, + "logits/chosen": -0.9943496584892273, + "logits/rejected": -0.9143755435943604, + "logps/chosen": -48.05913543701172, + "logps/rejected": -69.44911193847656, + "loss": 0.0287, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8955947160720825, + "rewards/margins": 6.447237968444824, + "rewards/rejected": -8.342832565307617, + "step": 412 + }, + { + "epoch": 4.887573964497041, + "grad_norm": 6.431311497276744, + "learning_rate": 1.9455088558642932e-07, + "logits/chosen": -1.2816779613494873, + "logits/rejected": -1.1103105545043945, + "logps/chosen": -28.044443130493164, + "logps/rejected": -67.90215301513672, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3063042163848877, + "rewards/margins": 9.634206771850586, + "rewards/rejected": -10.940509796142578, + "step": 413 + }, + { + "epoch": 4.899408284023669, + "grad_norm": 4.779737157402348, + "learning_rate": 1.9328370055697832e-07, + "logits/chosen": -1.1393996477127075, + "logits/rejected": -1.108349323272705, + "logps/chosen": -34.88286590576172, + "logps/rejected": -57.27365493774414, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0275461673736572, + "rewards/margins": 6.514031410217285, + "rewards/rejected": -7.541577339172363, + "step": 414 + }, + { + "epoch": 4.911242603550296, + "grad_norm": 4.144080990787552, + "learning_rate": 1.9201804990698616e-07, + "logits/chosen": -1.0242797136306763, + "logits/rejected": -1.0819731950759888, + "logps/chosen": -62.003173828125, + "logps/rejected": -75.39320373535156, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2295634746551514, + "rewards/margins": 6.729301452636719, + "rewards/rejected": -8.95886516571045, + "step": 415 + }, + { + "epoch": 4.923076923076923, + "grad_norm": 7.28582515595878, + "learning_rate": 1.907539678768453e-07, + "logits/chosen": -1.2485833168029785, + "logits/rejected": -1.2470812797546387, + "logps/chosen": -33.02880859375, + "logps/rejected": -60.55603790283203, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4287913739681244, + "rewards/margins": 8.188567161560059, + "rewards/rejected": -8.617358207702637, + "step": 416 + }, + { + "epoch": 4.93491124260355, + "grad_norm": 5.63867046125462, + "learning_rate": 1.8949148866451152e-07, + "logits/chosen": -1.3377947807312012, + "logits/rejected": -1.2713524103164673, + "logps/chosen": -43.48049545288086, + "logps/rejected": -66.24922943115234, + "loss": 0.029, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6896387338638306, + "rewards/margins": 7.300614356994629, + "rewards/rejected": -8.990253448486328, + "step": 417 + }, + { + "epoch": 4.946745562130177, + "grad_norm": 7.296218456889154, + "learning_rate": 1.8823064642457876e-07, + "logits/chosen": -1.2259938716888428, + "logits/rejected": -1.05912184715271, + "logps/chosen": -27.457061767578125, + "logps/rejected": -54.369407653808594, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2313295304775238, + "rewards/margins": 6.432798862457275, + "rewards/rejected": -6.664128303527832, + "step": 418 + }, + { + "epoch": 4.958579881656805, + "grad_norm": 6.2793049860116215, + "learning_rate": 1.8697147526735466e-07, + "logits/chosen": -1.1357454061508179, + "logits/rejected": -1.172181487083435, + "logps/chosen": -39.59063720703125, + "logps/rejected": -70.14149475097656, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2256022691726685, + "rewards/margins": 7.56755256652832, + "rewards/rejected": -8.793155670166016, + "step": 419 + }, + { + "epoch": 4.970414201183432, + "grad_norm": 5.690239912229541, + "learning_rate": 1.8571400925793852e-07, + "logits/chosen": -1.1824370622634888, + "logits/rejected": -1.0679001808166504, + "logps/chosen": -39.977928161621094, + "logps/rejected": -63.33866500854492, + "loss": 0.0291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.93265700340271, + "rewards/margins": 7.5748395919799805, + "rewards/rejected": -8.507495880126953, + "step": 420 + }, + { + "epoch": 4.982248520710059, + "grad_norm": 6.961259730499676, + "learning_rate": 1.844582824152988e-07, + "logits/chosen": -1.2937397956848145, + "logits/rejected": -1.4227337837219238, + "logps/chosen": -47.419578552246094, + "logps/rejected": -62.564239501953125, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.698372483253479, + "rewards/margins": 8.21216869354248, + "rewards/rejected": -8.910541534423828, + "step": 421 + }, + { + "epoch": 4.994082840236686, + "grad_norm": 7.002318301330378, + "learning_rate": 1.8320432871135376e-07, + "logits/chosen": -1.068471074104309, + "logits/rejected": -0.9389731884002686, + "logps/chosen": -39.13528823852539, + "logps/rejected": -73.06786346435547, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0938169956207275, + "rewards/margins": 7.050632476806641, + "rewards/rejected": -8.144449234008789, + "step": 422 + }, + { + "epoch": 5.005917159763314, + "grad_norm": 6.3621819487176205, + "learning_rate": 1.8195218207005136e-07, + "logits/chosen": -1.3071154356002808, + "logits/rejected": -1.2660558223724365, + "logps/chosen": -41.71501541137695, + "logps/rejected": -58.077205657958984, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21319010853767395, + "rewards/margins": 5.4651103019714355, + "rewards/rejected": -5.678299903869629, + "step": 423 + }, + { + "epoch": 5.017751479289941, + "grad_norm": 5.688784125844276, + "learning_rate": 1.8070187636645237e-07, + "logits/chosen": -1.1194195747375488, + "logits/rejected": -1.1475024223327637, + "logps/chosen": -39.484375, + "logps/rejected": -53.20079803466797, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6331821084022522, + "rewards/margins": 5.97371768951416, + "rewards/rejected": -6.606899261474609, + "step": 424 + }, + { + "epoch": 5.029585798816568, + "grad_norm": 4.08606565715224, + "learning_rate": 1.7945344542581353e-07, + "logits/chosen": -1.2313172817230225, + "logits/rejected": -1.2285120487213135, + "logps/chosen": -34.221275329589844, + "logps/rejected": -71.1649398803711, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6170086860656738, + "rewards/margins": 9.6792573928833, + "rewards/rejected": -10.296265602111816, + "step": 425 + }, + { + "epoch": 5.041420118343195, + "grad_norm": 4.101168582532201, + "learning_rate": 1.782069230226725e-07, + "logits/chosen": -1.331200122833252, + "logits/rejected": -1.2716310024261475, + "logps/chosen": -32.71428298950195, + "logps/rejected": -57.31195068359375, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05249008536338806, + "rewards/margins": 6.174403190612793, + "rewards/rejected": -6.121913433074951, + "step": 426 + }, + { + "epoch": 5.053254437869822, + "grad_norm": 3.885129904982064, + "learning_rate": 1.7696234287993413e-07, + "logits/chosen": -1.0245940685272217, + "logits/rejected": -0.809888482093811, + "logps/chosen": -26.134567260742188, + "logps/rejected": -58.40096664428711, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.018941545858979225, + "rewards/margins": 6.3871893882751465, + "rewards/rejected": -6.3682475090026855, + "step": 427 + }, + { + "epoch": 5.06508875739645, + "grad_norm": 4.487578296923204, + "learning_rate": 1.7571973866795813e-07, + "logits/chosen": -1.1871979236602783, + "logits/rejected": -1.1301292181015015, + "logps/chosen": -47.12839889526367, + "logps/rejected": -72.7445068359375, + "loss": 0.0229, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4960553646087646, + "rewards/margins": 8.162572860717773, + "rewards/rejected": -10.658628463745117, + "step": 428 + }, + { + "epoch": 5.076923076923077, + "grad_norm": 4.259353249136821, + "learning_rate": 1.7447914400364833e-07, + "logits/chosen": -1.3441885709762573, + "logits/rejected": -1.4836678504943848, + "logps/chosen": -56.2959098815918, + "logps/rejected": -63.773292541503906, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.281679391860962, + "rewards/margins": 6.2985944747924805, + "rewards/rejected": -7.580273628234863, + "step": 429 + }, + { + "epoch": 5.088757396449704, + "grad_norm": 6.225232745781602, + "learning_rate": 1.7324059244954292e-07, + "logits/chosen": -1.179904580116272, + "logits/rejected": -1.0788750648498535, + "logps/chosen": -31.996553421020508, + "logps/rejected": -57.39022445678711, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4775885343551636, + "rewards/margins": 5.636683464050293, + "rewards/rejected": -7.114272117614746, + "step": 430 + }, + { + "epoch": 5.100591715976331, + "grad_norm": 5.489476203604287, + "learning_rate": 1.720041175129066e-07, + "logits/chosen": -1.0896636247634888, + "logits/rejected": -0.9166202545166016, + "logps/chosen": -37.66105651855469, + "logps/rejected": -66.37045288085938, + "loss": 0.0315, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2693698406219482, + "rewards/margins": 6.162668228149414, + "rewards/rejected": -7.432038307189941, + "step": 431 + }, + { + "epoch": 5.112426035502959, + "grad_norm": 5.243666393695587, + "learning_rate": 1.7076975264482433e-07, + "logits/chosen": -1.1870818138122559, + "logits/rejected": -1.1362037658691406, + "logps/chosen": -36.95043182373047, + "logps/rejected": -65.01616668701172, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3586033582687378, + "rewards/margins": 8.19964599609375, + "rewards/rejected": -9.558250427246094, + "step": 432 + }, + { + "epoch": 5.124260355029586, + "grad_norm": 5.459471747574669, + "learning_rate": 1.6953753123929595e-07, + "logits/chosen": -0.9043876528739929, + "logits/rejected": -1.0556137561798096, + "logps/chosen": -42.625064849853516, + "logps/rejected": -65.89883422851562, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0012390613555908, + "rewards/margins": 5.687243938446045, + "rewards/rejected": -6.688483238220215, + "step": 433 + }, + { + "epoch": 5.136094674556213, + "grad_norm": 4.0735562348358725, + "learning_rate": 1.6830748663233303e-07, + "logits/chosen": -1.276261806488037, + "logits/rejected": -1.059903860092163, + "logps/chosen": -37.19121551513672, + "logps/rejected": -67.30182647705078, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8689858913421631, + "rewards/margins": 7.131950378417969, + "rewards/rejected": -8.000936508178711, + "step": 434 + }, + { + "epoch": 5.14792899408284, + "grad_norm": 5.750153182234357, + "learning_rate": 1.6707965210105687e-07, + "logits/chosen": -1.0981320142745972, + "logits/rejected": -1.1993542909622192, + "logps/chosen": -45.06623077392578, + "logps/rejected": -59.895782470703125, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1987268924713135, + "rewards/margins": 7.3387932777404785, + "rewards/rejected": -8.537520408630371, + "step": 435 + }, + { + "epoch": 5.159763313609467, + "grad_norm": 4.092389515181816, + "learning_rate": 1.6585406086279846e-07, + "logits/chosen": -1.2612695693969727, + "logits/rejected": -1.0451221466064453, + "logps/chosen": -36.8587646484375, + "logps/rejected": -79.8933334350586, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8900283575057983, + "rewards/margins": 9.542950630187988, + "rewards/rejected": -10.432977676391602, + "step": 436 + }, + { + "epoch": 5.171597633136095, + "grad_norm": 5.775549033301964, + "learning_rate": 1.6463074607419942e-07, + "logits/chosen": -1.1313787698745728, + "logits/rejected": -1.1185730695724487, + "logps/chosen": -39.299861907958984, + "logps/rejected": -55.985435485839844, + "loss": 0.0253, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1983766555786133, + "rewards/margins": 5.455132484436035, + "rewards/rejected": -6.65350866317749, + "step": 437 + }, + { + "epoch": 5.183431952662722, + "grad_norm": 4.591857419480486, + "learning_rate": 1.6340974083031523e-07, + "logits/chosen": -1.1867667436599731, + "logits/rejected": -1.101377248764038, + "logps/chosen": -37.22975158691406, + "logps/rejected": -59.464149475097656, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3749149739742279, + "rewards/margins": 7.300636291503906, + "rewards/rejected": -7.675551414489746, + "step": 438 + }, + { + "epoch": 5.195266272189349, + "grad_norm": 4.200678818481589, + "learning_rate": 1.6219107816372024e-07, + "logits/chosen": -1.307371973991394, + "logits/rejected": -1.1307945251464844, + "logps/chosen": -45.63922119140625, + "logps/rejected": -96.90985107421875, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9187942743301392, + "rewards/margins": 10.127294540405273, + "rewards/rejected": -12.046089172363281, + "step": 439 + }, + { + "epoch": 5.207100591715976, + "grad_norm": 4.736427790505553, + "learning_rate": 1.6097479104361326e-07, + "logits/chosen": -1.4351165294647217, + "logits/rejected": -1.3055188655853271, + "logps/chosen": -33.065948486328125, + "logps/rejected": -62.625099182128906, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6623735427856445, + "rewards/margins": 7.489253044128418, + "rewards/rejected": -8.151626586914062, + "step": 440 + }, + { + "epoch": 5.218934911242604, + "grad_norm": 5.4353100679782385, + "learning_rate": 1.5976091237492634e-07, + "logits/chosen": -1.1784343719482422, + "logits/rejected": -1.1669944524765015, + "logps/chosen": -43.70185852050781, + "logps/rejected": -61.90728759765625, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9407563805580139, + "rewards/margins": 5.61828088760376, + "rewards/rejected": -6.559036731719971, + "step": 441 + }, + { + "epoch": 5.230769230769231, + "grad_norm": 5.5234881107996, + "learning_rate": 1.5854947499743413e-07, + "logits/chosen": -1.3166191577911377, + "logits/rejected": -1.2368249893188477, + "logps/chosen": -42.619972229003906, + "logps/rejected": -83.75607299804688, + "loss": 0.0312, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4741311073303223, + "rewards/margins": 8.95831298828125, + "rewards/rejected": -10.432443618774414, + "step": 442 + }, + { + "epoch": 5.242603550295858, + "grad_norm": 5.027875638098313, + "learning_rate": 1.573405116848656e-07, + "logits/chosen": -1.2968052625656128, + "logits/rejected": -1.1763187646865845, + "logps/chosen": -41.92218017578125, + "logps/rejected": -65.44755554199219, + "loss": 0.0218, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.754935622215271, + "rewards/margins": 5.887596607208252, + "rewards/rejected": -7.6425323486328125, + "step": 443 + }, + { + "epoch": 5.254437869822485, + "grad_norm": 4.9663450415072035, + "learning_rate": 1.5613405514401757e-07, + "logits/chosen": -1.1399455070495605, + "logits/rejected": -1.21956205368042, + "logps/chosen": -39.965789794921875, + "logps/rejected": -55.96144485473633, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.283247709274292, + "rewards/margins": 6.501648902893066, + "rewards/rejected": -7.7848968505859375, + "step": 444 + }, + { + "epoch": 5.266272189349112, + "grad_norm": 5.858342159433277, + "learning_rate": 1.5493013801386923e-07, + "logits/chosen": -1.2090262174606323, + "logits/rejected": -1.2344286441802979, + "logps/chosen": -46.805938720703125, + "logps/rejected": -75.00871276855469, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.800337791442871, + "rewards/margins": 8.601978302001953, + "rewards/rejected": -10.402315139770508, + "step": 445 + }, + { + "epoch": 5.27810650887574, + "grad_norm": 3.9282604177697107, + "learning_rate": 1.537287928647002e-07, + "logits/chosen": -1.1603989601135254, + "logits/rejected": -1.1315605640411377, + "logps/chosen": -56.917301177978516, + "logps/rejected": -87.98289489746094, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3748254776000977, + "rewards/margins": 9.121573448181152, + "rewards/rejected": -11.49639892578125, + "step": 446 + }, + { + "epoch": 5.289940828402367, + "grad_norm": 4.17827764711654, + "learning_rate": 1.525300521972082e-07, + "logits/chosen": -1.1303211450576782, + "logits/rejected": -1.0561895370483398, + "logps/chosen": -34.059898376464844, + "logps/rejected": -58.381256103515625, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6126248836517334, + "rewards/margins": 6.0588579177856445, + "rewards/rejected": -6.671482086181641, + "step": 447 + }, + { + "epoch": 5.3017751479289945, + "grad_norm": 5.0074680639809745, + "learning_rate": 1.513339484416309e-07, + "logits/chosen": -1.169161319732666, + "logits/rejected": -1.327141284942627, + "logps/chosen": -44.55299377441406, + "logps/rejected": -59.21006774902344, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9135736227035522, + "rewards/margins": 6.587090492248535, + "rewards/rejected": -7.500664710998535, + "step": 448 + }, + { + "epoch": 5.313609467455621, + "grad_norm": 5.786312787732582, + "learning_rate": 1.5014051395686766e-07, + "logits/chosen": -1.314427137374878, + "logits/rejected": -1.2852325439453125, + "logps/chosen": -48.46540832519531, + "logps/rejected": -74.1810531616211, + "loss": 0.026, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.380225658416748, + "rewards/margins": 7.730525016784668, + "rewards/rejected": -10.110750198364258, + "step": 449 + }, + { + "epoch": 5.325443786982248, + "grad_norm": 4.769640674846225, + "learning_rate": 1.489497810296046e-07, + "logits/chosen": -1.3386704921722412, + "logits/rejected": -1.4422738552093506, + "logps/chosen": -50.036563873291016, + "logps/rejected": -78.25895690917969, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.123276710510254, + "rewards/margins": 8.760514259338379, + "rewards/rejected": -11.883790969848633, + "step": 450 + }, + { + "epoch": 5.337278106508876, + "grad_norm": 4.460218619596584, + "learning_rate": 1.4776178187344105e-07, + "logits/chosen": -0.9677804112434387, + "logits/rejected": -1.0331177711486816, + "logps/chosen": -59.2320442199707, + "logps/rejected": -78.76850891113281, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6641674041748047, + "rewards/margins": 6.445460319519043, + "rewards/rejected": -10.109628677368164, + "step": 451 + }, + { + "epoch": 5.349112426035503, + "grad_norm": 3.802978212761357, + "learning_rate": 1.4657654862801797e-07, + "logits/chosen": -1.4019286632537842, + "logits/rejected": -1.3270961046218872, + "logps/chosen": -50.94618606567383, + "logps/rejected": -73.76014709472656, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.508474826812744, + "rewards/margins": 7.509592056274414, + "rewards/rejected": -10.01806640625, + "step": 452 + }, + { + "epoch": 5.3609467455621305, + "grad_norm": 5.615210502971369, + "learning_rate": 1.4539411335814866e-07, + "logits/chosen": -1.0637266635894775, + "logits/rejected": -1.0000033378601074, + "logps/chosen": -41.48188018798828, + "logps/rejected": -63.91345977783203, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4643651247024536, + "rewards/margins": 6.114117622375488, + "rewards/rejected": -6.578482627868652, + "step": 453 + }, + { + "epoch": 5.372781065088757, + "grad_norm": 5.198548513900638, + "learning_rate": 1.4421450805295082e-07, + "logits/chosen": -1.2260617017745972, + "logits/rejected": -1.0831642150878906, + "logps/chosen": -35.77007293701172, + "logps/rejected": -64.84323120117188, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.987048864364624, + "rewards/margins": 7.374147415161133, + "rewards/rejected": -8.361196517944336, + "step": 454 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 4.832767981066584, + "learning_rate": 1.4303776462498186e-07, + "logits/chosen": -1.160849690437317, + "logits/rejected": -1.2087219953536987, + "logps/chosen": -40.54812240600586, + "logps/rejected": -65.13203430175781, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0010249614715576, + "rewards/margins": 8.726207733154297, + "rewards/rejected": -10.727232933044434, + "step": 455 + }, + { + "epoch": 5.396449704142012, + "grad_norm": 5.177069682037774, + "learning_rate": 1.418639149093748e-07, + "logits/chosen": -1.1638591289520264, + "logits/rejected": -1.1559276580810547, + "logps/chosen": -40.623634338378906, + "logps/rejected": -68.79119873046875, + "loss": 0.0258, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1391851902008057, + "rewards/margins": 8.373973846435547, + "rewards/rejected": -9.513158798217773, + "step": 456 + }, + { + "epoch": 5.408284023668639, + "grad_norm": 4.365063167846994, + "learning_rate": 1.406929906629774e-07, + "logits/chosen": -1.0570735931396484, + "logits/rejected": -1.124732255935669, + "logps/chosen": -43.47991943359375, + "logps/rejected": -74.1714096069336, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.225761890411377, + "rewards/margins": 8.236499786376953, + "rewards/rejected": -10.462261199951172, + "step": 457 + }, + { + "epoch": 5.420118343195266, + "grad_norm": 3.589902288780942, + "learning_rate": 1.3952502356349323e-07, + "logits/chosen": -1.3269389867782593, + "logits/rejected": -1.233825922012329, + "logps/chosen": -34.30451202392578, + "logps/rejected": -52.173431396484375, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0334144830703735, + "rewards/margins": 5.583642482757568, + "rewards/rejected": -6.617057800292969, + "step": 458 + }, + { + "epoch": 5.431952662721893, + "grad_norm": 4.55181870052482, + "learning_rate": 1.38360045208624e-07, + "logits/chosen": -1.2689670324325562, + "logits/rejected": -1.2837406396865845, + "logps/chosen": -40.163639068603516, + "logps/rejected": -56.57490539550781, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6674104928970337, + "rewards/margins": 6.7150068283081055, + "rewards/rejected": -7.382417678833008, + "step": 459 + }, + { + "epoch": 5.443786982248521, + "grad_norm": 5.077146204999947, + "learning_rate": 1.371980871152157e-07, + "logits/chosen": -1.154836893081665, + "logits/rejected": -0.9715840816497803, + "logps/chosen": -45.965538024902344, + "logps/rejected": -83.77821350097656, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9867550134658813, + "rewards/margins": 7.073174953460693, + "rewards/rejected": -9.059929847717285, + "step": 460 + }, + { + "epoch": 5.455621301775148, + "grad_norm": 5.101492700995248, + "learning_rate": 1.3603918071840486e-07, + "logits/chosen": -1.316448450088501, + "logits/rejected": -1.2533689737319946, + "logps/chosen": -42.93837356567383, + "logps/rejected": -62.27436065673828, + "loss": 0.0223, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6039561033248901, + "rewards/margins": 8.3004732131958, + "rewards/rejected": -9.90442943572998, + "step": 461 + }, + { + "epoch": 5.4674556213017755, + "grad_norm": 3.581286281901749, + "learning_rate": 1.3488335737076911e-07, + "logits/chosen": -1.05559504032135, + "logits/rejected": -1.045451283454895, + "logps/chosen": -39.23100280761719, + "logps/rejected": -68.82288360595703, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3661481142044067, + "rewards/margins": 7.688545227050781, + "rewards/rejected": -9.054693222045898, + "step": 462 + }, + { + "epoch": 5.479289940828402, + "grad_norm": 4.630228404526749, + "learning_rate": 1.3373064834147817e-07, + "logits/chosen": -1.1598727703094482, + "logits/rejected": -1.264807939529419, + "logps/chosen": -34.88840103149414, + "logps/rejected": -62.14350891113281, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.034010887145996, + "rewards/margins": 8.205092430114746, + "rewards/rejected": -9.239103317260742, + "step": 463 + }, + { + "epoch": 5.491124260355029, + "grad_norm": 5.16650939084812, + "learning_rate": 1.3258108481544847e-07, + "logits/chosen": -1.1226065158843994, + "logits/rejected": -1.1048481464385986, + "logps/chosen": -43.532371520996094, + "logps/rejected": -56.43935775756836, + "loss": 0.023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7918784618377686, + "rewards/margins": 5.705506324768066, + "rewards/rejected": -7.497385025024414, + "step": 464 + }, + { + "epoch": 5.502958579881657, + "grad_norm": 3.8686941600847433, + "learning_rate": 1.314346978924994e-07, + "logits/chosen": -1.229318380355835, + "logits/rejected": -1.2423676252365112, + "logps/chosen": -39.24761199951172, + "logps/rejected": -61.67021179199219, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.169360876083374, + "rewards/margins": 7.238840103149414, + "rewards/rejected": -9.408201217651367, + "step": 465 + }, + { + "epoch": 5.514792899408284, + "grad_norm": 5.296943513917031, + "learning_rate": 1.3029151858651143e-07, + "logits/chosen": -1.3978345394134521, + "logits/rejected": -1.3712904453277588, + "logps/chosen": -33.012115478515625, + "logps/rejected": -56.899269104003906, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2505531311035156, + "rewards/margins": 6.717418670654297, + "rewards/rejected": -7.9679718017578125, + "step": 466 + }, + { + "epoch": 5.5266272189349115, + "grad_norm": 4.646575940746333, + "learning_rate": 1.2915157782458802e-07, + "logits/chosen": -1.171278715133667, + "logits/rejected": -1.205727458000183, + "logps/chosen": -47.129981994628906, + "logps/rejected": -68.88152313232422, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6739788055419922, + "rewards/margins": 7.506168842315674, + "rewards/rejected": -9.180148124694824, + "step": 467 + }, + { + "epoch": 5.538461538461538, + "grad_norm": 6.123565299064301, + "learning_rate": 1.2801490644621788e-07, + "logits/chosen": -1.1880264282226562, + "logits/rejected": -1.1200838088989258, + "logps/chosen": -38.7286491394043, + "logps/rejected": -48.82095718383789, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8554790616035461, + "rewards/margins": 4.912729740142822, + "rewards/rejected": -5.7682085037231445, + "step": 468 + }, + { + "epoch": 5.550295857988166, + "grad_norm": 4.461044775468357, + "learning_rate": 1.268815352024416e-07, + "logits/chosen": -1.3372142314910889, + "logits/rejected": -1.2087198495864868, + "logps/chosen": -39.35560607910156, + "logps/rejected": -66.68772888183594, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.420445203781128, + "rewards/margins": 7.610683441162109, + "rewards/rejected": -10.0311279296875, + "step": 469 + }, + { + "epoch": 5.562130177514793, + "grad_norm": 3.4572482970403624, + "learning_rate": 1.257514947550189e-07, + "logits/chosen": -1.3124825954437256, + "logits/rejected": -1.2549489736557007, + "logps/chosen": -34.87763214111328, + "logps/rejected": -71.9126968383789, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7214508652687073, + "rewards/margins": 9.442049026489258, + "rewards/rejected": -10.16349983215332, + "step": 470 + }, + { + "epoch": 5.57396449704142, + "grad_norm": 4.197929133254421, + "learning_rate": 1.2462481567559966e-07, + "logits/chosen": -1.253973126411438, + "logits/rejected": -1.0877537727355957, + "logps/chosen": -39.30159378051758, + "logps/rejected": -77.09483337402344, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9544857740402222, + "rewards/margins": 6.852291107177734, + "rewards/rejected": -7.806777000427246, + "step": 471 + }, + { + "epoch": 5.585798816568047, + "grad_norm": 5.704469040817279, + "learning_rate": 1.2350152844489688e-07, + "logits/chosen": -1.2464970350265503, + "logits/rejected": -1.2669868469238281, + "logps/chosen": -34.1337890625, + "logps/rejected": -61.517852783203125, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8590599298477173, + "rewards/margins": 8.024746894836426, + "rewards/rejected": -9.883807182312012, + "step": 472 + }, + { + "epoch": 5.597633136094674, + "grad_norm": 3.345069840685552, + "learning_rate": 1.2238166345186152e-07, + "logits/chosen": -1.1105680465698242, + "logits/rejected": -1.1130532026290894, + "logps/chosen": -44.979270935058594, + "logps/rejected": -61.89778137207031, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2353321313858032, + "rewards/margins": 6.803773880004883, + "rewards/rejected": -8.039106369018555, + "step": 473 + }, + { + "epoch": 5.609467455621302, + "grad_norm": 5.571254370649255, + "learning_rate": 1.2126525099286108e-07, + "logits/chosen": -1.5405449867248535, + "logits/rejected": -1.5201728343963623, + "logps/chosen": -35.71417236328125, + "logps/rejected": -76.58299255371094, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5148788690567017, + "rewards/margins": 8.865453720092773, + "rewards/rejected": -10.380332946777344, + "step": 474 + }, + { + "epoch": 5.621301775147929, + "grad_norm": 6.171366106305816, + "learning_rate": 1.201523212708593e-07, + "logits/chosen": -1.2863376140594482, + "logits/rejected": -1.0735771656036377, + "logps/chosen": -31.64728546142578, + "logps/rejected": -67.95651245117188, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8103072047233582, + "rewards/margins": 8.086475372314453, + "rewards/rejected": -8.896783828735352, + "step": 475 + }, + { + "epoch": 5.633136094674557, + "grad_norm": 5.562476478094527, + "learning_rate": 1.1904290439459971e-07, + "logits/chosen": -1.3088757991790771, + "logits/rejected": -1.3743656873703003, + "logps/chosen": -44.74784851074219, + "logps/rejected": -75.913818359375, + "loss": 0.0253, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7934825420379639, + "rewards/margins": 8.956192016601562, + "rewards/rejected": -10.749673843383789, + "step": 476 + }, + { + "epoch": 5.644970414201183, + "grad_norm": 4.9137843056433494, + "learning_rate": 1.1793703037779055e-07, + "logits/chosen": -1.2263046503067017, + "logits/rejected": -1.2299702167510986, + "logps/chosen": -52.14278793334961, + "logps/rejected": -83.45344543457031, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6467747688293457, + "rewards/margins": 9.627193450927734, + "rewards/rejected": -12.273969650268555, + "step": 477 + }, + { + "epoch": 5.65680473372781, + "grad_norm": 4.209643150852624, + "learning_rate": 1.1683472913829284e-07, + "logits/chosen": -1.1544851064682007, + "logits/rejected": -1.0651438236236572, + "logps/chosen": -43.3776741027832, + "logps/rejected": -73.96808624267578, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.589303970336914, + "rewards/margins": 7.712847709655762, + "rewards/rejected": -9.302152633666992, + "step": 478 + }, + { + "epoch": 5.668639053254438, + "grad_norm": 4.463200843657177, + "learning_rate": 1.1573603049731153e-07, + "logits/chosen": -1.2334978580474854, + "logits/rejected": -1.2861613035202026, + "logps/chosen": -46.0629997253418, + "logps/rejected": -64.01387023925781, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.542292058467865, + "rewards/margins": 7.245844841003418, + "rewards/rejected": -7.788136959075928, + "step": 479 + }, + { + "epoch": 5.680473372781065, + "grad_norm": 4.733353453788434, + "learning_rate": 1.146409641785882e-07, + "logits/chosen": -1.3681819438934326, + "logits/rejected": -1.1802641153335571, + "logps/chosen": -49.00510025024414, + "logps/rejected": -74.6863784790039, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9393169283866882, + "rewards/margins": 8.013870239257812, + "rewards/rejected": -8.953186988830566, + "step": 480 + }, + { + "epoch": 5.6923076923076925, + "grad_norm": 4.192888321849314, + "learning_rate": 1.1354955980759689e-07, + "logits/chosen": -1.2294647693634033, + "logits/rejected": -1.1312428712844849, + "logps/chosen": -32.66830825805664, + "logps/rejected": -62.29680252075195, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7126701474189758, + "rewards/margins": 7.077181816101074, + "rewards/rejected": -7.789851665496826, + "step": 481 + }, + { + "epoch": 5.704142011834319, + "grad_norm": 4.6911466794721255, + "learning_rate": 1.1246184691074314e-07, + "logits/chosen": -1.097620964050293, + "logits/rejected": -1.0427839756011963, + "logps/chosen": -36.93230056762695, + "logps/rejected": -56.27244567871094, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5761284232139587, + "rewards/margins": 6.3100385665893555, + "rewards/rejected": -6.886167049407959, + "step": 482 + }, + { + "epoch": 5.715976331360947, + "grad_norm": 4.510848016909919, + "learning_rate": 1.1137785491456453e-07, + "logits/chosen": -1.135402798652649, + "logits/rejected": -1.1884230375289917, + "logps/chosen": -42.09026336669922, + "logps/rejected": -65.27650451660156, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.159618377685547, + "rewards/margins": 7.496487617492676, + "rewards/rejected": -9.656106948852539, + "step": 483 + }, + { + "epoch": 5.727810650887574, + "grad_norm": 6.019538895167783, + "learning_rate": 1.1029761314493518e-07, + "logits/chosen": -1.4300427436828613, + "logits/rejected": -1.311929702758789, + "logps/chosen": -35.11430358886719, + "logps/rejected": -61.214385986328125, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1705330610275269, + "rewards/margins": 6.273542404174805, + "rewards/rejected": -7.444075584411621, + "step": 484 + }, + { + "epoch": 5.739644970414201, + "grad_norm": 4.484361080757079, + "learning_rate": 1.0922115082627196e-07, + "logits/chosen": -1.2917143106460571, + "logits/rejected": -1.2676193714141846, + "logps/chosen": -39.90666580200195, + "logps/rejected": -63.61042404174805, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.078958511352539, + "rewards/margins": 7.296936511993408, + "rewards/rejected": -9.375894546508789, + "step": 485 + }, + { + "epoch": 5.7514792899408285, + "grad_norm": 3.7963083798265678, + "learning_rate": 1.0814849708074414e-07, + "logits/chosen": -1.2180938720703125, + "logits/rejected": -1.2313947677612305, + "logps/chosen": -38.8764762878418, + "logps/rejected": -58.5401496887207, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2903863787651062, + "rewards/margins": 6.708767890930176, + "rewards/rejected": -6.999154090881348, + "step": 486 + }, + { + "epoch": 5.763313609467455, + "grad_norm": 3.9184087371836953, + "learning_rate": 1.070796809274853e-07, + "logits/chosen": -1.410796880722046, + "logits/rejected": -1.51241934299469, + "logps/chosen": -39.3426513671875, + "logps/rejected": -64.799560546875, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2968083620071411, + "rewards/margins": 8.182130813598633, + "rewards/rejected": -9.478939056396484, + "step": 487 + }, + { + "epoch": 5.775147928994083, + "grad_norm": 4.695034688833399, + "learning_rate": 1.0601473128180854e-07, + "logits/chosen": -1.1525176763534546, + "logits/rejected": -1.1728888750076294, + "logps/chosen": -29.180187225341797, + "logps/rejected": -47.42591094970703, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2968808710575104, + "rewards/margins": 5.646054267883301, + "rewards/rejected": -5.349173545837402, + "step": 488 + }, + { + "epoch": 5.78698224852071, + "grad_norm": 3.9629884838666793, + "learning_rate": 1.0495367695442392e-07, + "logits/chosen": -1.4741270542144775, + "logits/rejected": -1.4874193668365479, + "logps/chosen": -37.341251373291016, + "logps/rejected": -64.96741485595703, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7409987449645996, + "rewards/margins": 8.580930709838867, + "rewards/rejected": -10.321928977966309, + "step": 489 + }, + { + "epoch": 5.798816568047338, + "grad_norm": 4.579911167657576, + "learning_rate": 1.0389654665065908e-07, + "logits/chosen": -1.3020949363708496, + "logits/rejected": -1.1568399667739868, + "logps/chosen": -37.12276840209961, + "logps/rejected": -74.50867462158203, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.156496047973633, + "rewards/margins": 8.242414474487305, + "rewards/rejected": -10.398909568786621, + "step": 490 + }, + { + "epoch": 5.810650887573964, + "grad_norm": 3.8373743576852046, + "learning_rate": 1.0284336896968304e-07, + "logits/chosen": -1.4103869199752808, + "logits/rejected": -1.2831324338912964, + "logps/chosen": -27.218812942504883, + "logps/rejected": -64.12830352783203, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5227464437484741, + "rewards/margins": 7.4249677658081055, + "rewards/rejected": -7.947714328765869, + "step": 491 + }, + { + "epoch": 5.822485207100591, + "grad_norm": 3.95946268366364, + "learning_rate": 1.0179417240373182e-07, + "logits/chosen": -1.0852227210998535, + "logits/rejected": -1.0003684759140015, + "logps/chosen": -34.86237716674805, + "logps/rejected": -55.69224166870117, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.195955753326416, + "rewards/margins": 6.8462629318237305, + "rewards/rejected": -8.042219161987305, + "step": 492 + }, + { + "epoch": 5.834319526627219, + "grad_norm": 4.561320933520321, + "learning_rate": 1.0074898533733833e-07, + "logits/chosen": -1.3027982711791992, + "logits/rejected": -1.272012710571289, + "logps/chosen": -44.73501205444336, + "logps/rejected": -74.4944076538086, + "loss": 0.0233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.941219687461853, + "rewards/margins": 7.836876392364502, + "rewards/rejected": -8.778095245361328, + "step": 493 + }, + { + "epoch": 5.846153846153846, + "grad_norm": 3.4880073627324113, + "learning_rate": 9.970783604656383e-08, + "logits/chosen": -1.1432762145996094, + "logits/rejected": -1.182610034942627, + "logps/chosen": -45.404197692871094, + "logps/rejected": -68.64156341552734, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.063544273376465, + "rewards/margins": 7.183533668518066, + "rewards/rejected": -9.247077941894531, + "step": 494 + }, + { + "epoch": 5.8579881656804735, + "grad_norm": 4.263751098046147, + "learning_rate": 9.867075269823353e-08, + "logits/chosen": -1.2150077819824219, + "logits/rejected": -1.156544804573059, + "logps/chosen": -31.194244384765625, + "logps/rejected": -62.26982498168945, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5434834957122803, + "rewards/margins": 7.384872913360596, + "rewards/rejected": -7.928356647491455, + "step": 495 + }, + { + "epoch": 5.8698224852071, + "grad_norm": 3.5712261428305476, + "learning_rate": 9.763776334917398e-08, + "logits/chosen": -1.137619972229004, + "logits/rejected": -1.1010360717773438, + "logps/chosen": -42.5244140625, + "logps/rejected": -64.01480865478516, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5774519443511963, + "rewards/margins": 7.529844284057617, + "rewards/rejected": -9.107295989990234, + "step": 496 + }, + { + "epoch": 5.881656804733728, + "grad_norm": 3.9520695580072838, + "learning_rate": 9.660889594545469e-08, + "logits/chosen": -1.3965240716934204, + "logits/rejected": -1.2792103290557861, + "logps/chosen": -35.29607391357422, + "logps/rejected": -62.42978286743164, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.475753664970398, + "rewards/margins": 7.789628982543945, + "rewards/rejected": -9.265382766723633, + "step": 497 + }, + { + "epoch": 5.893491124260355, + "grad_norm": 2.9741347782819343, + "learning_rate": 9.558417832163162e-08, + "logits/chosen": -1.3339378833770752, + "logits/rejected": -1.3494333028793335, + "logps/chosen": -34.82639694213867, + "logps/rejected": -60.99176788330078, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.567624568939209, + "rewards/margins": 7.62171745300293, + "rewards/rejected": -8.189342498779297, + "step": 498 + }, + { + "epoch": 5.905325443786982, + "grad_norm": 3.571047214468529, + "learning_rate": 9.456363819999419e-08, + "logits/chosen": -1.3118352890014648, + "logits/rejected": -1.2693629264831543, + "logps/chosen": -41.321712493896484, + "logps/rejected": -56.57762145996094, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.858654797077179, + "rewards/margins": 6.5985426902771, + "rewards/rejected": -7.457197666168213, + "step": 499 + }, + { + "epoch": 5.9171597633136095, + "grad_norm": 5.851871018530032, + "learning_rate": 9.354730318981561e-08, + "logits/chosen": -1.362858772277832, + "logits/rejected": -1.2244151830673218, + "logps/chosen": -42.49793243408203, + "logps/rejected": -82.23344421386719, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.179011344909668, + "rewards/margins": 9.12831974029541, + "rewards/rejected": -11.307331085205078, + "step": 500 + }, + { + "epoch": 5.928994082840236, + "grad_norm": 5.383838413833312, + "learning_rate": 9.25352007866054e-08, + "logits/chosen": -1.2556869983673096, + "logits/rejected": -1.443664789199829, + "logps/chosen": -40.52064514160156, + "logps/rejected": -56.383453369140625, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.034975528717041, + "rewards/margins": 6.7539753913879395, + "rewards/rejected": -8.78895092010498, + "step": 501 + }, + { + "epoch": 5.940828402366864, + "grad_norm": 5.05266297652339, + "learning_rate": 9.15273583713663e-08, + "logits/chosen": -1.4486913681030273, + "logits/rejected": -1.5014913082122803, + "logps/chosen": -48.460594177246094, + "logps/rejected": -62.99694061279297, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5520663261413574, + "rewards/margins": 5.633842468261719, + "rewards/rejected": -7.185909271240234, + "step": 502 + }, + { + "epoch": 5.952662721893491, + "grad_norm": 3.687156022372964, + "learning_rate": 9.052380320985273e-08, + "logits/chosen": -1.378886103630066, + "logits/rejected": -1.4398740530014038, + "logps/chosen": -45.16747283935547, + "logps/rejected": -63.0939826965332, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6366732120513916, + "rewards/margins": 7.194758892059326, + "rewards/rejected": -8.83143138885498, + "step": 503 + }, + { + "epoch": 5.964497041420119, + "grad_norm": 4.578124297729498, + "learning_rate": 8.95245624518336e-08, + "logits/chosen": -1.1423323154449463, + "logits/rejected": -1.1454185247421265, + "logps/chosen": -36.498924255371094, + "logps/rejected": -51.14469528198242, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.339404821395874, + "rewards/margins": 5.312580585479736, + "rewards/rejected": -6.651985168457031, + "step": 504 + }, + { + "epoch": 5.976331360946745, + "grad_norm": 3.6118523674964518, + "learning_rate": 8.85296631303579e-08, + "logits/chosen": -1.2390810251235962, + "logits/rejected": -1.1867014169692993, + "logps/chosen": -32.07006072998047, + "logps/rejected": -58.290775299072266, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3951120972633362, + "rewards/margins": 6.040757656097412, + "rewards/rejected": -6.435870170593262, + "step": 505 + }, + { + "epoch": 5.988165680473373, + "grad_norm": 4.13649603264138, + "learning_rate": 8.753913216102285e-08, + "logits/chosen": -1.404435634613037, + "logits/rejected": -1.3154963254928589, + "logps/chosen": -47.064483642578125, + "logps/rejected": -74.79150390625, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.253176212310791, + "rewards/margins": 7.4068121910095215, + "rewards/rejected": -10.659988403320312, + "step": 506 + }, + { + "epoch": 6.0, + "grad_norm": 6.57455134237728, + "learning_rate": 8.655299634124646e-08, + "logits/chosen": -1.1584608554840088, + "logits/rejected": -1.1112971305847168, + "logps/chosen": -35.10588455200195, + "logps/rejected": -55.12195587158203, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5993782877922058, + "rewards/margins": 7.258352279663086, + "rewards/rejected": -7.857730865478516, + "step": 507 + }, + { + "epoch": 6.011834319526627, + "grad_norm": 5.072270968295128, + "learning_rate": 8.557128234954189e-08, + "logits/chosen": -1.2677009105682373, + "logits/rejected": -1.2384376525878906, + "logps/chosen": -26.764663696289062, + "logps/rejected": -54.153812408447266, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4589073061943054, + "rewards/margins": 7.230363845825195, + "rewards/rejected": -7.689270973205566, + "step": 508 + }, + { + "epoch": 6.023668639053255, + "grad_norm": 3.823553652256516, + "learning_rate": 8.459401674479594e-08, + "logits/chosen": -1.1779940128326416, + "logits/rejected": -1.098586082458496, + "logps/chosen": -37.93672180175781, + "logps/rejected": -65.87274932861328, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2578346729278564, + "rewards/margins": 7.301403999328613, + "rewards/rejected": -8.55923843383789, + "step": 509 + }, + { + "epoch": 6.035502958579881, + "grad_norm": 3.2726696014561516, + "learning_rate": 8.362122596555088e-08, + "logits/chosen": -1.3603450059890747, + "logits/rejected": -1.190643548965454, + "logps/chosen": -31.422203063964844, + "logps/rejected": -67.04301452636719, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9887487888336182, + "rewards/margins": 9.028081893920898, + "rewards/rejected": -10.016830444335938, + "step": 510 + }, + { + "epoch": 6.047337278106509, + "grad_norm": 3.455246506585187, + "learning_rate": 8.265293632928854e-08, + "logits/chosen": -1.155658483505249, + "logits/rejected": -1.0525039434432983, + "logps/chosen": -38.18634796142578, + "logps/rejected": -54.874427795410156, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.180433750152588, + "rewards/margins": 5.439917087554932, + "rewards/rejected": -6.6203508377075195, + "step": 511 + }, + { + "epoch": 6.059171597633136, + "grad_norm": 3.350614417751688, + "learning_rate": 8.16891740317189e-08, + "logits/chosen": -1.2656223773956299, + "logits/rejected": -1.3540056943893433, + "logps/chosen": -35.06914138793945, + "logps/rejected": -60.15445327758789, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5391483306884766, + "rewards/margins": 8.150188446044922, + "rewards/rejected": -9.689336776733398, + "step": 512 + }, + { + "epoch": 6.071005917159764, + "grad_norm": 3.484209239680023, + "learning_rate": 8.072996514607124e-08, + "logits/chosen": -1.2974562644958496, + "logits/rejected": -1.1921536922454834, + "logps/chosen": -55.431785583496094, + "logps/rejected": -79.14292907714844, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.612431526184082, + "rewards/margins": 9.743474960327148, + "rewards/rejected": -12.355907440185547, + "step": 513 + }, + { + "epoch": 6.0828402366863905, + "grad_norm": 4.446328914142042, + "learning_rate": 7.977533562238838e-08, + "logits/chosen": -1.277441143989563, + "logits/rejected": -1.2345993518829346, + "logps/chosen": -40.02715301513672, + "logps/rejected": -76.5589599609375, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1496319770812988, + "rewards/margins": 10.045013427734375, + "rewards/rejected": -11.194644927978516, + "step": 514 + }, + { + "epoch": 6.094674556213017, + "grad_norm": 3.4828911535899496, + "learning_rate": 7.882531128682538e-08, + "logits/chosen": -1.220058798789978, + "logits/rejected": -1.2398343086242676, + "logps/chosen": -48.92205810546875, + "logps/rejected": -75.846923828125, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.44828462600708, + "rewards/margins": 8.325919151306152, + "rewards/rejected": -10.77420425415039, + "step": 515 + }, + { + "epoch": 6.106508875739645, + "grad_norm": 3.974217510492179, + "learning_rate": 7.787991784094999e-08, + "logits/chosen": -1.1681749820709229, + "logits/rejected": -1.2502659559249878, + "logps/chosen": -40.98628616333008, + "logps/rejected": -69.3888168334961, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.623548746109009, + "rewards/margins": 9.416035652160645, + "rewards/rejected": -12.03958511352539, + "step": 516 + }, + { + "epoch": 6.118343195266272, + "grad_norm": 3.087635521774272, + "learning_rate": 7.693918086104825e-08, + "logits/chosen": -1.2360180616378784, + "logits/rejected": -1.1360399723052979, + "logps/chosen": -37.70630645751953, + "logps/rejected": -69.30216979980469, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35728567838668823, + "rewards/margins": 8.243356704711914, + "rewards/rejected": -8.600642204284668, + "step": 517 + }, + { + "epoch": 6.1301775147929, + "grad_norm": 4.697710852543094, + "learning_rate": 7.60031257974316e-08, + "logits/chosen": -1.4873653650283813, + "logits/rejected": -1.365291953086853, + "logps/chosen": -36.08180618286133, + "logps/rejected": -69.22196960449219, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.034714698791504, + "rewards/margins": 7.524087905883789, + "rewards/rejected": -9.558802604675293, + "step": 518 + }, + { + "epoch": 6.1420118343195265, + "grad_norm": 4.048751031295707, + "learning_rate": 7.507177797374927e-08, + "logits/chosen": -1.3827829360961914, + "logits/rejected": -1.363875389099121, + "logps/chosen": -43.49116516113281, + "logps/rejected": -78.3033447265625, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8128806352615356, + "rewards/margins": 10.288780212402344, + "rewards/rejected": -12.101661682128906, + "step": 519 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 3.0144143135282286, + "learning_rate": 7.414516258630244e-08, + "logits/chosen": -1.3672518730163574, + "logits/rejected": -1.294494390487671, + "logps/chosen": -28.87794303894043, + "logps/rejected": -49.833717346191406, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42372238636016846, + "rewards/margins": 6.666773796081543, + "rewards/rejected": -7.090496063232422, + "step": 520 + }, + { + "epoch": 6.165680473372781, + "grad_norm": 5.7375168904982745, + "learning_rate": 7.322330470336313e-08, + "logits/chosen": -1.1266202926635742, + "logits/rejected": -1.1897248029708862, + "logps/chosen": -36.070762634277344, + "logps/rejected": -67.00115203857422, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.276731491088867, + "rewards/margins": 8.913595199584961, + "rewards/rejected": -11.190326690673828, + "step": 521 + }, + { + "epoch": 6.177514792899408, + "grad_norm": 3.9341322070007227, + "learning_rate": 7.230622926449564e-08, + "logits/chosen": -1.2871345281600952, + "logits/rejected": -1.3150429725646973, + "logps/chosen": -32.48534393310547, + "logps/rejected": -56.244903564453125, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2191728353500366, + "rewards/margins": 7.289381980895996, + "rewards/rejected": -8.508554458618164, + "step": 522 + }, + { + "epoch": 6.189349112426036, + "grad_norm": 3.8384098316045776, + "learning_rate": 7.139396107988193e-08, + "logits/chosen": -1.0808664560317993, + "logits/rejected": -1.051466464996338, + "logps/chosen": -36.810298919677734, + "logps/rejected": -50.8199462890625, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.059516020119190216, + "rewards/margins": 5.3354082107543945, + "rewards/rejected": -5.27589225769043, + "step": 523 + }, + { + "epoch": 6.201183431952662, + "grad_norm": 3.7617803402555965, + "learning_rate": 7.048652482965078e-08, + "logits/chosen": -1.5526072978973389, + "logits/rejected": -1.4698734283447266, + "logps/chosen": -35.63499450683594, + "logps/rejected": -58.799835205078125, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1897969245910645, + "rewards/margins": 6.380156517028809, + "rewards/rejected": -7.569952964782715, + "step": 524 + }, + { + "epoch": 6.21301775147929, + "grad_norm": 6.280571337534133, + "learning_rate": 6.958394506320947e-08, + "logits/chosen": -1.082352876663208, + "logits/rejected": -0.9739059805870056, + "logps/chosen": -59.19343566894531, + "logps/rejected": -85.16358184814453, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.377283811569214, + "rewards/margins": 8.0195951461792, + "rewards/rejected": -11.396878242492676, + "step": 525 + }, + { + "epoch": 6.224852071005917, + "grad_norm": 3.6878257721370926, + "learning_rate": 6.868624619858021e-08, + "logits/chosen": -1.1773961782455444, + "logits/rejected": -1.069741129875183, + "logps/chosen": -35.47507095336914, + "logps/rejected": -71.48643493652344, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48855772614479065, + "rewards/margins": 9.38101863861084, + "rewards/rejected": -9.869575500488281, + "step": 526 + }, + { + "epoch": 6.236686390532545, + "grad_norm": 5.553600246448852, + "learning_rate": 6.779345252173906e-08, + "logits/chosen": -0.9772287011146545, + "logits/rejected": -1.2621002197265625, + "logps/chosen": -66.20740509033203, + "logps/rejected": -61.06924819946289, + "loss": 0.0241, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.746315598487854, + "rewards/margins": 5.9837775230407715, + "rewards/rejected": -6.730093002319336, + "step": 527 + }, + { + "epoch": 6.2485207100591715, + "grad_norm": 4.0981125572939785, + "learning_rate": 6.690558818595943e-08, + "logits/chosen": -1.3861331939697266, + "logits/rejected": -1.297480821609497, + "logps/chosen": -43.87446975708008, + "logps/rejected": -63.237091064453125, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5427628755569458, + "rewards/margins": 7.580643653869629, + "rewards/rejected": -9.123406410217285, + "step": 528 + }, + { + "epoch": 6.260355029585799, + "grad_norm": 3.147025133945202, + "learning_rate": 6.602267721115806e-08, + "logits/chosen": -1.410119891166687, + "logits/rejected": -1.324759840965271, + "logps/chosen": -42.89299011230469, + "logps/rejected": -70.10383605957031, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9887200593948364, + "rewards/margins": 6.539437770843506, + "rewards/rejected": -7.5281572341918945, + "step": 529 + }, + { + "epoch": 6.272189349112426, + "grad_norm": 3.4923988387654714, + "learning_rate": 6.514474348324581e-08, + "logits/chosen": -1.1285738945007324, + "logits/rejected": -1.0195645093917847, + "logps/chosen": -39.756072998046875, + "logps/rejected": -64.98381042480469, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.783644437789917, + "rewards/margins": 7.37436056137085, + "rewards/rejected": -9.158004760742188, + "step": 530 + }, + { + "epoch": 6.284023668639053, + "grad_norm": 4.081843420506292, + "learning_rate": 6.427181075348084e-08, + "logits/chosen": -1.1205132007598877, + "logits/rejected": -1.0184344053268433, + "logps/chosen": -29.305950164794922, + "logps/rejected": -60.268062591552734, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.907474160194397, + "rewards/margins": 6.502952575683594, + "rewards/rejected": -7.410426139831543, + "step": 531 + }, + { + "epoch": 6.295857988165681, + "grad_norm": 3.8839404909626585, + "learning_rate": 6.340390263782655e-08, + "logits/chosen": -1.3858798742294312, + "logits/rejected": -1.2407383918762207, + "logps/chosen": -41.1785888671875, + "logps/rejected": -76.4423828125, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.216499090194702, + "rewards/margins": 8.766088485717773, + "rewards/rejected": -10.982587814331055, + "step": 532 + }, + { + "epoch": 6.3076923076923075, + "grad_norm": 4.4263228598532365, + "learning_rate": 6.254104261631254e-08, + "logits/chosen": -1.1712279319763184, + "logits/rejected": -1.214058518409729, + "logps/chosen": -34.68932342529297, + "logps/rejected": -55.6457633972168, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2202101945877075, + "rewards/margins": 6.376070022583008, + "rewards/rejected": -7.596280574798584, + "step": 533 + }, + { + "epoch": 6.319526627218935, + "grad_norm": 3.9131998116199918, + "learning_rate": 6.168325403239913e-08, + "logits/chosen": -1.3168898820877075, + "logits/rejected": -1.4670816659927368, + "logps/chosen": -45.90602111816406, + "logps/rejected": -59.57246398925781, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.582594633102417, + "rewards/margins": 7.453899383544922, + "rewards/rejected": -9.036494255065918, + "step": 534 + }, + { + "epoch": 6.331360946745562, + "grad_norm": 4.764780460541576, + "learning_rate": 6.08305600923463e-08, + "logits/chosen": -1.1949267387390137, + "logits/rejected": -1.23961341381073, + "logps/chosen": -36.51057434082031, + "logps/rejected": -72.36233520507812, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.984785795211792, + "rewards/margins": 8.723003387451172, + "rewards/rejected": -11.707788467407227, + "step": 535 + }, + { + "epoch": 6.34319526627219, + "grad_norm": 4.149947688223677, + "learning_rate": 5.998298386458545e-08, + "logits/chosen": -1.1838600635528564, + "logits/rejected": -1.0856642723083496, + "logps/chosen": -42.33216857910156, + "logps/rejected": -66.8110122680664, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4710516929626465, + "rewards/margins": 6.801319599151611, + "rewards/rejected": -9.272371292114258, + "step": 536 + }, + { + "epoch": 6.355029585798817, + "grad_norm": 4.1571001554854465, + "learning_rate": 5.914054827909548e-08, + "logits/chosen": -1.094924807548523, + "logits/rejected": -1.1296252012252808, + "logps/chosen": -46.14147186279297, + "logps/rejected": -86.65098571777344, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7291374206542969, + "rewards/margins": 10.872190475463867, + "rewards/rejected": -12.601327896118164, + "step": 537 + }, + { + "epoch": 6.366863905325443, + "grad_norm": 4.127454820039479, + "learning_rate": 5.830327612678265e-08, + "logits/chosen": -1.187494158744812, + "logits/rejected": -1.201699137687683, + "logps/chosen": -43.62383270263672, + "logps/rejected": -69.97433471679688, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9448773860931396, + "rewards/margins": 9.503475189208984, + "rewards/rejected": -11.448352813720703, + "step": 538 + }, + { + "epoch": 6.378698224852071, + "grad_norm": 5.236200973476889, + "learning_rate": 5.747119005886361e-08, + "logits/chosen": -1.3696651458740234, + "logits/rejected": -1.2970082759857178, + "logps/chosen": -35.38874816894531, + "logps/rejected": -63.076866149902344, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.804175615310669, + "rewards/margins": 7.7187957763671875, + "rewards/rejected": -8.522972106933594, + "step": 539 + }, + { + "epoch": 6.390532544378698, + "grad_norm": 3.4361194305765226, + "learning_rate": 5.6644312586253044e-08, + "logits/chosen": -1.3181718587875366, + "logits/rejected": -1.4402356147766113, + "logps/chosen": -47.9390754699707, + "logps/rejected": -69.83265686035156, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2599098682403564, + "rewards/margins": 6.955543518066406, + "rewards/rejected": -8.215453147888184, + "step": 540 + }, + { + "epoch": 6.402366863905326, + "grad_norm": 5.247988440802347, + "learning_rate": 5.582266607895422e-08, + "logits/chosen": -1.2013980150222778, + "logits/rejected": -1.2277536392211914, + "logps/chosen": -32.803062438964844, + "logps/rejected": -51.46520233154297, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23354974389076233, + "rewards/margins": 7.179046154022217, + "rewards/rejected": -7.412596702575684, + "step": 541 + }, + { + "epoch": 6.414201183431953, + "grad_norm": 4.074351591350812, + "learning_rate": 5.5006272765454056e-08, + "logits/chosen": -1.6633051633834839, + "logits/rejected": -1.3203377723693848, + "logps/chosen": -43.05129623413086, + "logps/rejected": -89.4016342163086, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.711348056793213, + "rewards/margins": 10.217995643615723, + "rewards/rejected": -11.929344177246094, + "step": 542 + }, + { + "epoch": 6.42603550295858, + "grad_norm": 4.882941673599114, + "learning_rate": 5.419515473212191e-08, + "logits/chosen": -1.0465339422225952, + "logits/rejected": -0.9034765958786011, + "logps/chosen": -39.69237518310547, + "logps/rejected": -64.68145751953125, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7466580867767334, + "rewards/margins": 5.554471969604492, + "rewards/rejected": -7.301130294799805, + "step": 543 + }, + { + "epoch": 6.437869822485207, + "grad_norm": 3.1692395598442804, + "learning_rate": 5.338933392261158e-08, + "logits/chosen": -1.3215035200119019, + "logits/rejected": -1.341964602470398, + "logps/chosen": -40.1537971496582, + "logps/rejected": -63.64967346191406, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.646998405456543, + "rewards/margins": 8.139799118041992, + "rewards/rejected": -10.786796569824219, + "step": 544 + }, + { + "epoch": 6.449704142011834, + "grad_norm": 5.949554370120341, + "learning_rate": 5.258883213726828e-08, + "logits/chosen": -1.194183588027954, + "logits/rejected": -1.2076101303100586, + "logps/chosen": -44.26350402832031, + "logps/rejected": -64.31948852539062, + "loss": 0.0322, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1567471027374268, + "rewards/margins": 7.238821506500244, + "rewards/rejected": -8.39556884765625, + "step": 545 + }, + { + "epoch": 6.461538461538462, + "grad_norm": 3.7663787692141995, + "learning_rate": 5.1793671032538206e-08, + "logits/chosen": -1.4605358839035034, + "logits/rejected": -1.4506992101669312, + "logps/chosen": -35.8072395324707, + "logps/rejected": -57.40998458862305, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3689743280410767, + "rewards/margins": 7.345977783203125, + "rewards/rejected": -8.714951515197754, + "step": 546 + }, + { + "epoch": 6.4733727810650885, + "grad_norm": 2.9635523735572247, + "learning_rate": 5.100387212038324e-08, + "logits/chosen": -1.2727254629135132, + "logits/rejected": -1.374696135520935, + "logps/chosen": -39.98680114746094, + "logps/rejected": -69.32289123535156, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.05542254447937, + "rewards/margins": 8.780436515808105, + "rewards/rejected": -10.835859298706055, + "step": 547 + }, + { + "epoch": 6.485207100591716, + "grad_norm": 5.072194913958191, + "learning_rate": 5.021945676769859e-08, + "logits/chosen": -1.2008986473083496, + "logits/rejected": -1.1709994077682495, + "logps/chosen": -46.82018280029297, + "logps/rejected": -68.16090393066406, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5476022958755493, + "rewards/margins": 7.798556804656982, + "rewards/rejected": -9.346158981323242, + "step": 548 + }, + { + "epoch": 6.497041420118343, + "grad_norm": 4.114324411595974, + "learning_rate": 4.9440446195734817e-08, + "logits/chosen": -1.4089347124099731, + "logits/rejected": -1.346145749092102, + "logps/chosen": -31.944503784179688, + "logps/rejected": -61.80097198486328, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2490514516830444, + "rewards/margins": 8.163646697998047, + "rewards/rejected": -9.412696838378906, + "step": 549 + }, + { + "epoch": 6.508875739644971, + "grad_norm": 5.13609604269027, + "learning_rate": 4.866686147952387e-08, + "logits/chosen": -1.5707539319992065, + "logits/rejected": -1.5316340923309326, + "logps/chosen": -42.37779235839844, + "logps/rejected": -67.51331329345703, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2908955812454224, + "rewards/margins": 8.278606414794922, + "rewards/rejected": -9.569501876831055, + "step": 550 + }, + { + "epoch": 6.520710059171598, + "grad_norm": 5.255591810225031, + "learning_rate": 4.789872354730873e-08, + "logits/chosen": -1.5177299976348877, + "logits/rejected": -1.4273862838745117, + "logps/chosen": -32.705142974853516, + "logps/rejected": -63.322547912597656, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9528019428253174, + "rewards/margins": 7.921998023986816, + "rewards/rejected": -8.874799728393555, + "step": 551 + }, + { + "epoch": 6.5325443786982245, + "grad_norm": 3.6838956833536938, + "learning_rate": 4.71360531799774e-08, + "logits/chosen": -1.2748504877090454, + "logits/rejected": -1.2340025901794434, + "logps/chosen": -38.841758728027344, + "logps/rejected": -68.60884094238281, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4062817096710205, + "rewards/margins": 8.743720054626465, + "rewards/rejected": -10.150002479553223, + "step": 552 + }, + { + "epoch": 6.544378698224852, + "grad_norm": 3.742498897516179, + "learning_rate": 4.637887101050053e-08, + "logits/chosen": -1.331305742263794, + "logits/rejected": -1.2110060453414917, + "logps/chosen": -35.87771224975586, + "logps/rejected": -66.48898315429688, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5530174374580383, + "rewards/margins": 7.692140579223633, + "rewards/rejected": -8.245157241821289, + "step": 553 + }, + { + "epoch": 6.556213017751479, + "grad_norm": 4.145756635966609, + "learning_rate": 4.562719752337349e-08, + "logits/chosen": -1.241339087486267, + "logits/rejected": -1.238210916519165, + "logps/chosen": -39.84772491455078, + "logps/rejected": -65.00758361816406, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1539757251739502, + "rewards/margins": 7.031815528869629, + "rewards/rejected": -8.185791015625, + "step": 554 + }, + { + "epoch": 6.568047337278107, + "grad_norm": 4.724411466421152, + "learning_rate": 4.488105305406187e-08, + "logits/chosen": -1.0200409889221191, + "logits/rejected": -0.9774606823921204, + "logps/chosen": -38.79374694824219, + "logps/rejected": -61.03683090209961, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9451122283935547, + "rewards/margins": 7.298187732696533, + "rewards/rejected": -9.24329948425293, + "step": 555 + }, + { + "epoch": 6.579881656804734, + "grad_norm": 4.840401053211273, + "learning_rate": 4.4140457788451434e-08, + "logits/chosen": -1.3930317163467407, + "logits/rejected": -1.3531696796417236, + "logps/chosen": -51.00494384765625, + "logps/rejected": -69.20004272460938, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2470803260803223, + "rewards/margins": 7.987146854400635, + "rewards/rejected": -10.234228134155273, + "step": 556 + }, + { + "epoch": 6.591715976331361, + "grad_norm": 4.058919938184551, + "learning_rate": 4.340543176230232e-08, + "logits/chosen": -1.1343650817871094, + "logits/rejected": -1.149098515510559, + "logps/chosen": -37.745506286621094, + "logps/rejected": -62.56489562988281, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3520774841308594, + "rewards/margins": 7.742616176605225, + "rewards/rejected": -9.094694137573242, + "step": 557 + }, + { + "epoch": 6.603550295857988, + "grad_norm": 3.8922355564931865, + "learning_rate": 4.267599486070647e-08, + "logits/chosen": -1.1515233516693115, + "logits/rejected": -1.1255171298980713, + "logps/chosen": -33.81602478027344, + "logps/rejected": -61.61820983886719, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.054863691329956, + "rewards/margins": 8.12367057800293, + "rewards/rejected": -9.178534507751465, + "step": 558 + }, + { + "epoch": 6.615384615384615, + "grad_norm": 2.9916154212804025, + "learning_rate": 4.1952166817550176e-08, + "logits/chosen": -1.3873323202133179, + "logits/rejected": -1.270090937614441, + "logps/chosen": -37.5373649597168, + "logps/rejected": -70.85179138183594, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6242471933364868, + "rewards/margins": 9.16792106628418, + "rewards/rejected": -10.792167663574219, + "step": 559 + }, + { + "epoch": 6.627218934911243, + "grad_norm": 3.497280688298504, + "learning_rate": 4.1233967214979764e-08, + "logits/chosen": -1.2440836429595947, + "logits/rejected": -1.182302713394165, + "logps/chosen": -36.41077423095703, + "logps/rejected": -67.35704803466797, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9620167016983032, + "rewards/margins": 7.311346054077148, + "rewards/rejected": -8.27336311340332, + "step": 560 + }, + { + "epoch": 6.6390532544378695, + "grad_norm": 5.110607066618349, + "learning_rate": 4.05214154828723e-08, + "logits/chosen": -1.1929218769073486, + "logits/rejected": -1.1870311498641968, + "logps/chosen": -41.97100830078125, + "logps/rejected": -72.02296447753906, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.625248670578003, + "rewards/margins": 8.42895793914795, + "rewards/rejected": -10.054206848144531, + "step": 561 + }, + { + "epoch": 6.650887573964497, + "grad_norm": 4.697261168017771, + "learning_rate": 3.9814530898309356e-08, + "logits/chosen": -1.4104831218719482, + "logits/rejected": -1.1492834091186523, + "logps/chosen": -40.75250244140625, + "logps/rejected": -86.0766372680664, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9909330606460571, + "rewards/margins": 10.393043518066406, + "rewards/rejected": -12.383977890014648, + "step": 562 + }, + { + "epoch": 6.662721893491124, + "grad_norm": 3.86163495686982, + "learning_rate": 3.9113332585056166e-08, + "logits/chosen": -1.0711450576782227, + "logits/rejected": -1.1439751386642456, + "logps/chosen": -47.77074432373047, + "logps/rejected": -57.20773696899414, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2142012119293213, + "rewards/margins": 5.361945152282715, + "rewards/rejected": -7.576145648956299, + "step": 563 + }, + { + "epoch": 6.674556213017752, + "grad_norm": 3.142265753154405, + "learning_rate": 3.8417839513043646e-08, + "logits/chosen": -1.3805079460144043, + "logits/rejected": -1.2833032608032227, + "logps/chosen": -34.34812927246094, + "logps/rejected": -61.53770446777344, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6832209825515747, + "rewards/margins": 7.745155334472656, + "rewards/rejected": -9.428376197814941, + "step": 564 + }, + { + "epoch": 6.686390532544379, + "grad_norm": 4.537864993317601, + "learning_rate": 3.7728070497855594e-08, + "logits/chosen": -1.4104318618774414, + "logits/rejected": -1.3599334955215454, + "logps/chosen": -37.086971282958984, + "logps/rejected": -67.63109588623047, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2368394136428833, + "rewards/margins": 7.939535140991211, + "rewards/rejected": -8.176374435424805, + "step": 565 + }, + { + "epoch": 6.6982248520710055, + "grad_norm": 3.938202778272971, + "learning_rate": 3.704404420021956e-08, + "logits/chosen": -1.3482962846755981, + "logits/rejected": -1.3277989625930786, + "logps/chosen": -42.97172546386719, + "logps/rejected": -66.13438415527344, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.497891902923584, + "rewards/margins": 7.281721115112305, + "rewards/rejected": -9.77961254119873, + "step": 566 + }, + { + "epoch": 6.710059171597633, + "grad_norm": 4.295649983212432, + "learning_rate": 3.636577912550187e-08, + "logits/chosen": -1.3801615238189697, + "logits/rejected": -1.3534774780273438, + "logps/chosen": -48.245174407958984, + "logps/rejected": -75.80772399902344, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.976886510848999, + "rewards/margins": 8.30002212524414, + "rewards/rejected": -11.276908874511719, + "step": 567 + }, + { + "epoch": 6.72189349112426, + "grad_norm": 3.3047241439197497, + "learning_rate": 3.569329362320708e-08, + "logits/chosen": -1.323474407196045, + "logits/rejected": -1.2122368812561035, + "logps/chosen": -38.685638427734375, + "logps/rejected": -81.03096771240234, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7437925934791565, + "rewards/margins": 10.262330055236816, + "rewards/rejected": -11.006122589111328, + "step": 568 + }, + { + "epoch": 6.733727810650888, + "grad_norm": 3.1683374020519315, + "learning_rate": 3.5026605886481736e-08, + "logits/chosen": -1.4267765283584595, + "logits/rejected": -1.4384936094284058, + "logps/chosen": -40.923892974853516, + "logps/rejected": -66.135009765625, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4209558963775635, + "rewards/margins": 7.333900451660156, + "rewards/rejected": -8.75485610961914, + "step": 569 + }, + { + "epoch": 6.745562130177515, + "grad_norm": 4.91012737015821, + "learning_rate": 3.436573395162179e-08, + "logits/chosen": -1.4396589994430542, + "logits/rejected": -1.3906140327453613, + "logps/chosen": -34.09233856201172, + "logps/rejected": -70.2711181640625, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1198707818984985, + "rewards/margins": 8.739965438842773, + "rewards/rejected": -9.85983657836914, + "step": 570 + }, + { + "epoch": 6.757396449704142, + "grad_norm": 3.6007019387475703, + "learning_rate": 3.371069569758511e-08, + "logits/chosen": -1.251147747039795, + "logits/rejected": -1.1890754699707031, + "logps/chosen": -53.093597412109375, + "logps/rejected": -83.10395050048828, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4066200256347656, + "rewards/margins": 8.366990089416504, + "rewards/rejected": -9.773611068725586, + "step": 571 + }, + { + "epoch": 6.769230769230769, + "grad_norm": 3.8258738774626986, + "learning_rate": 3.306150884550732e-08, + "logits/chosen": -1.2099252939224243, + "logits/rejected": -1.2655442953109741, + "logps/chosen": -43.43336486816406, + "logps/rejected": -58.43470764160156, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2070910930633545, + "rewards/margins": 6.868325710296631, + "rewards/rejected": -9.075416564941406, + "step": 572 + }, + { + "epoch": 6.781065088757396, + "grad_norm": 4.510756393854719, + "learning_rate": 3.241819095822288e-08, + "logits/chosen": -1.2797162532806396, + "logits/rejected": -1.3578741550445557, + "logps/chosen": -37.647857666015625, + "logps/rejected": -63.263267517089844, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05109795928001404, + "rewards/margins": 7.741544246673584, + "rewards/rejected": -7.690445899963379, + "step": 573 + }, + { + "epoch": 6.792899408284024, + "grad_norm": 3.963518873210086, + "learning_rate": 3.17807594397895e-08, + "logits/chosen": -1.1429615020751953, + "logits/rejected": -1.1946382522583008, + "logps/chosen": -34.58401870727539, + "logps/rejected": -63.32178497314453, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.431036114692688, + "rewards/margins": 7.827291488647461, + "rewards/rejected": -9.25832748413086, + "step": 574 + }, + { + "epoch": 6.804733727810651, + "grad_norm": 3.969471879010277, + "learning_rate": 3.114923153501747e-08, + "logits/chosen": -1.2164993286132812, + "logits/rejected": -1.1609983444213867, + "logps/chosen": -31.950668334960938, + "logps/rejected": -61.592769622802734, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7955553531646729, + "rewards/margins": 7.445010185241699, + "rewards/rejected": -9.24056625366211, + "step": 575 + }, + { + "epoch": 6.816568047337278, + "grad_norm": 4.7113289076048, + "learning_rate": 3.052362432900332e-08, + "logits/chosen": -1.1342840194702148, + "logits/rejected": -1.0492384433746338, + "logps/chosen": -32.85274124145508, + "logps/rejected": -72.11370086669922, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3644051551818848, + "rewards/margins": 8.635282516479492, + "rewards/rejected": -9.999687194824219, + "step": 576 + }, + { + "epoch": 6.828402366863905, + "grad_norm": 3.7717255825498675, + "learning_rate": 2.990395474666724e-08, + "logits/chosen": -1.0524818897247314, + "logits/rejected": -0.9995134472846985, + "logps/chosen": -40.310935974121094, + "logps/rejected": -60.963748931884766, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0798089504241943, + "rewards/margins": 6.669445991516113, + "rewards/rejected": -8.74925422668457, + "step": 577 + }, + { + "epoch": 6.840236686390533, + "grad_norm": 4.294637735523631, + "learning_rate": 2.9290239552295538e-08, + "logits/chosen": -1.2513582706451416, + "logits/rejected": -1.1278815269470215, + "logps/chosen": -37.120304107666016, + "logps/rejected": -60.05486297607422, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5432335734367371, + "rewards/margins": 6.656371593475342, + "rewards/rejected": -7.1996049880981445, + "step": 578 + }, + { + "epoch": 6.85207100591716, + "grad_norm": 3.417783997535037, + "learning_rate": 2.8682495349086816e-08, + "logits/chosen": -1.1197659969329834, + "logits/rejected": -1.3288072347640991, + "logps/chosen": -47.85517883300781, + "logps/rejected": -61.156795501708984, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6630420684814453, + "rewards/margins": 7.849754810333252, + "rewards/rejected": -9.512797355651855, + "step": 579 + }, + { + "epoch": 6.8639053254437865, + "grad_norm": 3.9888601265802626, + "learning_rate": 2.8080738578703052e-08, + "logits/chosen": -1.3530460596084595, + "logits/rejected": -1.4569690227508545, + "logps/chosen": -54.98475646972656, + "logps/rejected": -76.0159912109375, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.483710765838623, + "rewards/margins": 9.008731842041016, + "rewards/rejected": -10.49244213104248, + "step": 580 + }, + { + "epoch": 6.875739644970414, + "grad_norm": 2.9654411597287273, + "learning_rate": 2.748498552082465e-08, + "logits/chosen": -1.2621766328811646, + "logits/rejected": -1.3557482957839966, + "logps/chosen": -41.09724044799805, + "logps/rejected": -64.43111419677734, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5274691581726074, + "rewards/margins": 7.362679481506348, + "rewards/rejected": -8.890148162841797, + "step": 581 + }, + { + "epoch": 6.887573964497041, + "grad_norm": 3.6343192832198272, + "learning_rate": 2.6895252292709974e-08, + "logits/chosen": -1.2823108434677124, + "logits/rejected": -1.2685920000076294, + "logps/chosen": -47.48131561279297, + "logps/rejected": -65.60801696777344, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3462042808532715, + "rewards/margins": 6.321001052856445, + "rewards/rejected": -7.667204856872559, + "step": 582 + }, + { + "epoch": 6.899408284023669, + "grad_norm": 4.413434221844061, + "learning_rate": 2.631155484875952e-08, + "logits/chosen": -1.2258765697479248, + "logits/rejected": -1.1181480884552002, + "logps/chosen": -43.612274169921875, + "logps/rejected": -72.54850769042969, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8452584743499756, + "rewards/margins": 8.232542991638184, + "rewards/rejected": -10.077801704406738, + "step": 583 + }, + { + "epoch": 6.911242603550296, + "grad_norm": 3.1918002353766037, + "learning_rate": 2.5733908980083984e-08, + "logits/chosen": -1.3073086738586426, + "logits/rejected": -1.2347233295440674, + "logps/chosen": -40.125244140625, + "logps/rejected": -69.67649841308594, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3976001739501953, + "rewards/margins": 8.126445770263672, + "rewards/rejected": -10.524045944213867, + "step": 584 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 3.672509775762771, + "learning_rate": 2.5162330314077385e-08, + "logits/chosen": -1.2911275625228882, + "logits/rejected": -1.332797646522522, + "logps/chosen": -60.97373580932617, + "logps/rejected": -87.76260375976562, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.335407733917236, + "rewards/margins": 8.518815994262695, + "rewards/rejected": -12.854223251342773, + "step": 585 + }, + { + "epoch": 6.93491124260355, + "grad_norm": 2.4576523514031123, + "learning_rate": 2.4596834313994037e-08, + "logits/chosen": -1.215404748916626, + "logits/rejected": -1.20293128490448, + "logps/chosen": -31.431991577148438, + "logps/rejected": -50.589115142822266, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9661941528320312, + "rewards/margins": 6.143428325653076, + "rewards/rejected": -7.109622955322266, + "step": 586 + }, + { + "epoch": 6.946745562130177, + "grad_norm": 4.1809106849701685, + "learning_rate": 2.403743627853039e-08, + "logits/chosen": -1.1825206279754639, + "logits/rejected": -1.193249225616455, + "logps/chosen": -36.14801788330078, + "logps/rejected": -66.2503662109375, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1976075172424316, + "rewards/margins": 9.384969711303711, + "rewards/rejected": -10.582576751708984, + "step": 587 + }, + { + "epoch": 6.958579881656805, + "grad_norm": 4.981046398430429, + "learning_rate": 2.3484151341411018e-08, + "logits/chosen": -1.0915954113006592, + "logits/rejected": -1.0409646034240723, + "logps/chosen": -41.00914001464844, + "logps/rejected": -60.438446044921875, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23959925770759583, + "rewards/margins": 7.114398956298828, + "rewards/rejected": -7.353998184204102, + "step": 588 + }, + { + "epoch": 6.970414201183432, + "grad_norm": 3.7291093865283065, + "learning_rate": 2.2936994470979188e-08, + "logits/chosen": -1.3922624588012695, + "logits/rejected": -1.4559788703918457, + "logps/chosen": -56.968406677246094, + "logps/rejected": -65.14087677001953, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4613820314407349, + "rewards/margins": 7.349767684936523, + "rewards/rejected": -8.811149597167969, + "step": 589 + }, + { + "epoch": 6.982248520710059, + "grad_norm": 4.305952133916051, + "learning_rate": 2.23959804697921e-08, + "logits/chosen": -1.133141040802002, + "logits/rejected": -1.262980341911316, + "logps/chosen": -45.40880584716797, + "logps/rejected": -58.749446868896484, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.105184316635132, + "rewards/margins": 7.048664093017578, + "rewards/rejected": -9.153848648071289, + "step": 590 + }, + { + "epoch": 6.994082840236686, + "grad_norm": 4.189520921835817, + "learning_rate": 2.1861123974220158e-08, + "logits/chosen": -1.2279367446899414, + "logits/rejected": -1.281379222869873, + "logps/chosen": -45.95500183105469, + "logps/rejected": -78.72282409667969, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1575565338134766, + "rewards/margins": 9.946080207824707, + "rewards/rejected": -12.103636741638184, + "step": 591 + }, + { + "epoch": 7.005917159763314, + "grad_norm": 3.8418221234553425, + "learning_rate": 2.1332439454051277e-08, + "logits/chosen": -1.03047513961792, + "logits/rejected": -0.9760541915893555, + "logps/chosen": -35.22906494140625, + "logps/rejected": -65.05894470214844, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0646643340587616, + "rewards/margins": 7.141489028930664, + "rewards/rejected": -7.206153869628906, + "step": 592 + }, + { + "epoch": 7.017751479289941, + "grad_norm": 4.37902481619465, + "learning_rate": 2.080994121209914e-08, + "logits/chosen": -1.2298972606658936, + "logits/rejected": -1.2462295293807983, + "logps/chosen": -52.52454376220703, + "logps/rejected": -73.28079223632812, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6756205558776855, + "rewards/margins": 7.350424766540527, + "rewards/rejected": -10.026044845581055, + "step": 593 + }, + { + "epoch": 7.029585798816568, + "grad_norm": 3.9841674822113564, + "learning_rate": 2.029364338381656e-08, + "logits/chosen": -1.2008388042449951, + "logits/rejected": -1.2781215906143188, + "logps/chosen": -46.399505615234375, + "logps/rejected": -59.717586517333984, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2308552265167236, + "rewards/margins": 6.461019515991211, + "rewards/rejected": -8.691874504089355, + "step": 594 + }, + { + "epoch": 7.041420118343195, + "grad_norm": 3.5149491748374957, + "learning_rate": 1.9783559936912773e-08, + "logits/chosen": -1.4331488609313965, + "logits/rejected": -1.3519785404205322, + "logps/chosen": -44.29254150390625, + "logps/rejected": -78.45376586914062, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.327043056488037, + "rewards/margins": 8.162921905517578, + "rewards/rejected": -9.489965438842773, + "step": 595 + }, + { + "epoch": 7.053254437869822, + "grad_norm": 3.509610689232701, + "learning_rate": 1.9279704670975726e-08, + "logits/chosen": -1.0991450548171997, + "logits/rejected": -1.106555700302124, + "logps/chosen": -48.36073303222656, + "logps/rejected": -64.15289306640625, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.800105631351471, + "rewards/margins": 6.824061393737793, + "rewards/rejected": -7.624166965484619, + "step": 596 + }, + { + "epoch": 7.06508875739645, + "grad_norm": 4.256443303454753, + "learning_rate": 1.8782091217098728e-08, + "logits/chosen": -1.4569830894470215, + "logits/rejected": -1.4492859840393066, + "logps/chosen": -38.11000061035156, + "logps/rejected": -70.66535949707031, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.615638017654419, + "rewards/margins": 8.343268394470215, + "rewards/rejected": -10.958906173706055, + "step": 597 + }, + { + "epoch": 7.076923076923077, + "grad_norm": 3.1992906689262, + "learning_rate": 1.829073303751172e-08, + "logits/chosen": -1.296358585357666, + "logits/rejected": -1.2894511222839355, + "logps/chosen": -49.65617370605469, + "logps/rejected": -63.472599029541016, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.235658884048462, + "rewards/margins": 6.7473063468933105, + "rewards/rejected": -9.982964515686035, + "step": 598 + }, + { + "epoch": 7.088757396449704, + "grad_norm": 3.941759552775281, + "learning_rate": 1.780564342521698e-08, + "logits/chosen": -1.2821180820465088, + "logits/rejected": -1.2152577638626099, + "logps/chosen": -43.84495544433594, + "logps/rejected": -70.52769470214844, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0584795475006104, + "rewards/margins": 7.640247344970703, + "rewards/rejected": -8.69872760772705, + "step": 599 + }, + { + "epoch": 7.100591715976331, + "grad_norm": 3.553599235582575, + "learning_rate": 1.732683550362954e-08, + "logits/chosen": -1.1523908376693726, + "logits/rejected": -1.2564313411712646, + "logps/chosen": -35.25387954711914, + "logps/rejected": -55.94652557373047, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38266000151634216, + "rewards/margins": 6.977762222290039, + "rewards/rejected": -7.360422134399414, + "step": 600 + }, + { + "epoch": 7.112426035502959, + "grad_norm": 3.8288450725810717, + "learning_rate": 1.6854322226222102e-08, + "logits/chosen": -1.2984129190444946, + "logits/rejected": -1.2461819648742676, + "logps/chosen": -46.678077697753906, + "logps/rejected": -75.3173828125, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.282292604446411, + "rewards/margins": 8.546947479248047, + "rewards/rejected": -10.829240798950195, + "step": 601 + }, + { + "epoch": 7.124260355029586, + "grad_norm": 4.219862245925515, + "learning_rate": 1.6388116376174765e-08, + "logits/chosen": -1.2928462028503418, + "logits/rejected": -1.1129931211471558, + "logps/chosen": -40.22349548339844, + "logps/rejected": -70.40786743164062, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8234593868255615, + "rewards/margins": 8.468381881713867, + "rewards/rejected": -10.291842460632324, + "step": 602 + }, + { + "epoch": 7.136094674556213, + "grad_norm": 3.8475070615621014, + "learning_rate": 1.5928230566028932e-08, + "logits/chosen": -1.4136502742767334, + "logits/rejected": -1.2762510776519775, + "logps/chosen": -35.088523864746094, + "logps/rejected": -61.09199523925781, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4712510108947754, + "rewards/margins": 7.1925201416015625, + "rewards/rejected": -8.663771629333496, + "step": 603 + }, + { + "epoch": 7.14792899408284, + "grad_norm": 3.206852988789503, + "learning_rate": 1.5474677237346468e-08, + "logits/chosen": -1.3350969552993774, + "logits/rejected": -1.2451387643814087, + "logps/chosen": -28.593502044677734, + "logps/rejected": -62.2099494934082, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.741662859916687, + "rewards/margins": 8.64741325378418, + "rewards/rejected": -9.389076232910156, + "step": 604 + }, + { + "epoch": 7.159763313609467, + "grad_norm": 3.844656802369703, + "learning_rate": 1.5027468660372604e-08, + "logits/chosen": -1.3713629245758057, + "logits/rejected": -1.378098964691162, + "logps/chosen": -47.545536041259766, + "logps/rejected": -69.47958374023438, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9896124601364136, + "rewards/margins": 8.07425308227539, + "rewards/rejected": -10.063865661621094, + "step": 605 + }, + { + "epoch": 7.171597633136095, + "grad_norm": 3.0241225907385165, + "learning_rate": 1.4586616933704527e-08, + "logits/chosen": -1.2971570491790771, + "logits/rejected": -1.245847225189209, + "logps/chosen": -38.302677154541016, + "logps/rejected": -68.13219451904297, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4747118949890137, + "rewards/margins": 8.88740062713623, + "rewards/rejected": -10.362112998962402, + "step": 606 + }, + { + "epoch": 7.183431952662722, + "grad_norm": 5.654903481243723, + "learning_rate": 1.4152133983963643e-08, + "logits/chosen": -1.3624151945114136, + "logits/rejected": -1.289166808128357, + "logps/chosen": -36.01699447631836, + "logps/rejected": -65.6912841796875, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6957451105117798, + "rewards/margins": 7.106748580932617, + "rewards/rejected": -8.80249309539795, + "step": 607 + }, + { + "epoch": 7.195266272189349, + "grad_norm": 2.9794480072644394, + "learning_rate": 1.372403156547311e-08, + "logits/chosen": -1.2158567905426025, + "logits/rejected": -1.2747105360031128, + "logps/chosen": -34.01419448852539, + "logps/rejected": -57.90458297729492, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9422094821929932, + "rewards/margins": 7.069450378417969, + "rewards/rejected": -8.011659622192383, + "step": 608 + }, + { + "epoch": 7.207100591715976, + "grad_norm": 4.261973188621215, + "learning_rate": 1.330232125993988e-08, + "logits/chosen": -1.3627076148986816, + "logits/rejected": -1.3639800548553467, + "logps/chosen": -48.03813552856445, + "logps/rejected": -64.04691314697266, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8824987411499023, + "rewards/margins": 7.479591369628906, + "rewards/rejected": -10.362090110778809, + "step": 609 + }, + { + "epoch": 7.218934911242604, + "grad_norm": 4.803739657888739, + "learning_rate": 1.2887014476141212e-08, + "logits/chosen": -1.4838051795959473, + "logits/rejected": -1.3311843872070312, + "logps/chosen": -43.892982482910156, + "logps/rejected": -81.17485046386719, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9665614366531372, + "rewards/margins": 9.852771759033203, + "rewards/rejected": -11.81933307647705, + "step": 610 + }, + { + "epoch": 7.230769230769231, + "grad_norm": 4.0756670658237475, + "learning_rate": 1.2478122449616212e-08, + "logits/chosen": -1.1747201681137085, + "logits/rejected": -1.2890151739120483, + "logps/chosen": -56.25837707519531, + "logps/rejected": -65.45867156982422, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2632644176483154, + "rewards/margins": 7.053145885467529, + "rewards/rejected": -9.316410064697266, + "step": 611 + }, + { + "epoch": 7.242603550295858, + "grad_norm": 4.38053938987994, + "learning_rate": 1.2075656242361732e-08, + "logits/chosen": -1.0797568559646606, + "logits/rejected": -1.0762195587158203, + "logps/chosen": -33.57958221435547, + "logps/rejected": -71.72637176513672, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2206077575683594, + "rewards/margins": 10.526464462280273, + "rewards/rejected": -12.74707317352295, + "step": 612 + }, + { + "epoch": 7.254437869822485, + "grad_norm": 2.925185904249556, + "learning_rate": 1.16796267425332e-08, + "logits/chosen": -1.4559069871902466, + "logits/rejected": -1.3925589323043823, + "logps/chosen": -40.79030990600586, + "logps/rejected": -67.69854736328125, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6704390048980713, + "rewards/margins": 7.342378616333008, + "rewards/rejected": -10.0128173828125, + "step": 613 + }, + { + "epoch": 7.266272189349112, + "grad_norm": 4.199453917170064, + "learning_rate": 1.1290044664149873e-08, + "logits/chosen": -1.323655128479004, + "logits/rejected": -1.2902326583862305, + "logps/chosen": -43.997676849365234, + "logps/rejected": -70.29308319091797, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0053741931915283, + "rewards/margins": 7.789745330810547, + "rewards/rejected": -9.795119285583496, + "step": 614 + }, + { + "epoch": 7.27810650887574, + "grad_norm": 2.9644130977906675, + "learning_rate": 1.0906920546805253e-08, + "logits/chosen": -1.331533670425415, + "logits/rejected": -1.318424940109253, + "logps/chosen": -41.30125045776367, + "logps/rejected": -69.17133331298828, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8695695400238037, + "rewards/margins": 8.263007164001465, + "rewards/rejected": -10.132576942443848, + "step": 615 + }, + { + "epoch": 7.289940828402367, + "grad_norm": 2.9111429661875294, + "learning_rate": 1.0530264755381824e-08, + "logits/chosen": -1.5518194437026978, + "logits/rejected": -1.2643704414367676, + "logps/chosen": -25.555744171142578, + "logps/rejected": -70.7457275390625, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22286051511764526, + "rewards/margins": 10.125096321105957, + "rewards/rejected": -10.347957611083984, + "step": 616 + }, + { + "epoch": 7.3017751479289945, + "grad_norm": 3.222903401246695, + "learning_rate": 1.0160087479770513e-08, + "logits/chosen": -1.147964358329773, + "logits/rejected": -1.1484017372131348, + "logps/chosen": -36.42656326293945, + "logps/rejected": -54.97199249267578, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2413222789764404, + "rewards/margins": 6.4268999099731445, + "rewards/rejected": -7.668221950531006, + "step": 617 + }, + { + "epoch": 7.313609467455621, + "grad_norm": 4.567760348222854, + "learning_rate": 9.796398734595284e-09, + "logits/chosen": -1.261488437652588, + "logits/rejected": -1.1865301132202148, + "logps/chosen": -33.513763427734375, + "logps/rejected": -80.79141235351562, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4354597330093384, + "rewards/margins": 9.321052551269531, + "rewards/rejected": -10.756511688232422, + "step": 618 + }, + { + "epoch": 7.325443786982248, + "grad_norm": 3.241502517680726, + "learning_rate": 9.439208358941907e-09, + "logits/chosen": -1.1972136497497559, + "logits/rejected": -1.074997901916504, + "logps/chosen": -47.66037368774414, + "logps/rejected": -74.0272445678711, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2272040843963623, + "rewards/margins": 8.553009033203125, + "rewards/rejected": -10.78021240234375, + "step": 619 + }, + { + "epoch": 7.337278106508876, + "grad_norm": 5.389252767817157, + "learning_rate": 9.088526016092141e-09, + "logits/chosen": -1.5381629467010498, + "logits/rejected": -1.3760014772415161, + "logps/chosen": -40.557369232177734, + "logps/rejected": -73.04681396484375, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1969547271728516, + "rewards/margins": 8.717408180236816, + "rewards/rejected": -11.914361953735352, + "step": 620 + }, + { + "epoch": 7.349112426035503, + "grad_norm": 4.492355419112391, + "learning_rate": 8.744361193261912e-09, + "logits/chosen": -1.2798972129821777, + "logits/rejected": -1.2866759300231934, + "logps/chosen": -46.820777893066406, + "logps/rejected": -71.25297546386719, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2585588693618774, + "rewards/margins": 8.543848037719727, + "rewards/rejected": -9.802406311035156, + "step": 621 + }, + { + "epoch": 7.3609467455621305, + "grad_norm": 2.590234218369342, + "learning_rate": 8.40672320134489e-09, + "logits/chosen": -1.3868498802185059, + "logits/rejected": -1.3360377550125122, + "logps/chosen": -32.945518493652344, + "logps/rejected": -53.556705474853516, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3330631256103516, + "rewards/margins": 6.077226638793945, + "rewards/rejected": -8.410289764404297, + "step": 622 + }, + { + "epoch": 7.372781065088757, + "grad_norm": 4.999566857886415, + "learning_rate": 8.075621174660625e-09, + "logits/chosen": -1.1354734897613525, + "logits/rejected": -1.1073994636535645, + "logps/chosen": -46.984012603759766, + "logps/rejected": -67.13949584960938, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1461362838745117, + "rewards/margins": 7.404544353485107, + "rewards/rejected": -9.550680160522461, + "step": 623 + }, + { + "epoch": 7.384615384615385, + "grad_norm": 4.442848421246316, + "learning_rate": 7.751064070707247e-09, + "logits/chosen": -0.9358320236206055, + "logits/rejected": -1.0718661546707153, + "logps/chosen": -57.682579040527344, + "logps/rejected": -67.44542694091797, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3646697998046875, + "rewards/margins": 7.940317153930664, + "rewards/rejected": -9.304986953735352, + "step": 624 + }, + { + "epoch": 7.396449704142012, + "grad_norm": 3.6393760430903463, + "learning_rate": 7.4330606699193055e-09, + "logits/chosen": -1.3721582889556885, + "logits/rejected": -1.194779634475708, + "logps/chosen": -41.03765869140625, + "logps/rejected": -67.66683959960938, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.100832939147949, + "rewards/margins": 7.313269138336182, + "rewards/rejected": -9.414101600646973, + "step": 625 + }, + { + "epoch": 7.408284023668639, + "grad_norm": 2.8173995600402995, + "learning_rate": 7.12161957543006e-09, + "logits/chosen": -1.1632230281829834, + "logits/rejected": -1.1450843811035156, + "logps/chosen": -45.309226989746094, + "logps/rejected": -67.70977783203125, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5633859634399414, + "rewards/margins": 7.859521865844727, + "rewards/rejected": -9.422907829284668, + "step": 626 + }, + { + "epoch": 7.420118343195266, + "grad_norm": 3.6516772499433054, + "learning_rate": 6.816749212839007e-09, + "logits/chosen": -1.2808470726013184, + "logits/rejected": -1.3078573942184448, + "logps/chosen": -44.84081268310547, + "logps/rejected": -70.39693450927734, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.452430009841919, + "rewards/margins": 7.464086532592773, + "rewards/rejected": -9.91651725769043, + "step": 627 + }, + { + "epoch": 7.431952662721893, + "grad_norm": 3.889072407283795, + "learning_rate": 6.518457829983559e-09, + "logits/chosen": -1.185568928718567, + "logits/rejected": -1.052988052368164, + "logps/chosen": -41.724830627441406, + "logps/rejected": -69.43457794189453, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4893982410430908, + "rewards/margins": 6.81561803817749, + "rewards/rejected": -8.30501651763916, + "step": 628 + }, + { + "epoch": 7.443786982248521, + "grad_norm": 2.887200349157875, + "learning_rate": 6.226753496716253e-09, + "logits/chosen": -1.2950429916381836, + "logits/rejected": -1.252742052078247, + "logps/chosen": -30.59502410888672, + "logps/rejected": -50.878868103027344, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0032743215560913086, + "rewards/margins": 6.188243865966797, + "rewards/rejected": -6.1915178298950195, + "step": 629 + }, + { + "epoch": 7.455621301775148, + "grad_norm": 4.244222893379881, + "learning_rate": 5.9416441046862555e-09, + "logits/chosen": -1.301514983177185, + "logits/rejected": -1.3513267040252686, + "logps/chosen": -39.38859558105469, + "logps/rejected": -67.55226135253906, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7557300329208374, + "rewards/margins": 9.34812068939209, + "rewards/rejected": -11.103851318359375, + "step": 630 + }, + { + "epoch": 7.4674556213017755, + "grad_norm": 4.290461165134473, + "learning_rate": 5.663137367125898e-09, + "logits/chosen": -1.3903230428695679, + "logits/rejected": -1.3901610374450684, + "logps/chosen": -46.33057403564453, + "logps/rejected": -58.841392517089844, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6279226541519165, + "rewards/margins": 7.242958068847656, + "rewards/rejected": -8.870880126953125, + "step": 631 + }, + { + "epoch": 7.479289940828402, + "grad_norm": 3.439301636458819, + "learning_rate": 5.3912408186420064e-09, + "logits/chosen": -1.1362035274505615, + "logits/rejected": -1.0982844829559326, + "logps/chosen": -37.11709213256836, + "logps/rejected": -61.59524154663086, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.149167537689209, + "rewards/margins": 7.633286952972412, + "rewards/rejected": -8.782454490661621, + "step": 632 + }, + { + "epoch": 7.491124260355029, + "grad_norm": 4.470095918148292, + "learning_rate": 5.12596181501207e-09, + "logits/chosen": -1.2651134729385376, + "logits/rejected": -1.3249282836914062, + "logps/chosen": -30.522804260253906, + "logps/rejected": -51.84233856201172, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16683726012706757, + "rewards/margins": 7.101442813873291, + "rewards/rejected": -7.268280029296875, + "step": 633 + }, + { + "epoch": 7.502958579881657, + "grad_norm": 3.3611637574552025, + "learning_rate": 4.867307532985227e-09, + "logits/chosen": -1.364112377166748, + "logits/rejected": -1.2576634883880615, + "logps/chosen": -49.3748664855957, + "logps/rejected": -83.98435974121094, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.741516590118408, + "rewards/margins": 9.956401824951172, + "rewards/rejected": -12.697917938232422, + "step": 634 + }, + { + "epoch": 7.514792899408284, + "grad_norm": 2.9574639791748036, + "learning_rate": 4.615284970088173e-09, + "logits/chosen": -1.1360864639282227, + "logits/rejected": -1.1513174772262573, + "logps/chosen": -36.475341796875, + "logps/rejected": -70.95317077636719, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0532495975494385, + "rewards/margins": 8.06146240234375, + "rewards/rejected": -10.114712715148926, + "step": 635 + }, + { + "epoch": 7.5266272189349115, + "grad_norm": 3.4730054907543497, + "learning_rate": 4.369900944435734e-09, + "logits/chosen": -1.369525671005249, + "logits/rejected": -1.2865912914276123, + "logps/chosen": -40.689971923828125, + "logps/rejected": -69.06320190429688, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.378922700881958, + "rewards/margins": 8.550222396850586, + "rewards/rejected": -10.929145812988281, + "step": 636 + }, + { + "epoch": 7.538461538461538, + "grad_norm": 4.223323554636038, + "learning_rate": 4.131162094546531e-09, + "logits/chosen": -1.3048131465911865, + "logits/rejected": -1.3768178224563599, + "logps/chosen": -57.49602508544922, + "logps/rejected": -68.12883758544922, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1675872802734375, + "rewards/margins": 7.353907585144043, + "rewards/rejected": -11.521493911743164, + "step": 637 + }, + { + "epoch": 7.550295857988166, + "grad_norm": 3.593337678884347, + "learning_rate": 3.899074879163244e-09, + "logits/chosen": -0.9969456195831299, + "logits/rejected": -1.053671956062317, + "logps/chosen": -44.34745788574219, + "logps/rejected": -60.61125183105469, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0102906227111816, + "rewards/margins": 7.186006546020508, + "rewards/rejected": -8.196297645568848, + "step": 638 + }, + { + "epoch": 7.562130177514793, + "grad_norm": 4.313259960588259, + "learning_rate": 3.6736455770781104e-09, + "logits/chosen": -1.0693228244781494, + "logits/rejected": -1.02896249294281, + "logps/chosen": -38.95392608642578, + "logps/rejected": -63.7398796081543, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6491146087646484, + "rewards/margins": 5.766778945922852, + "rewards/rejected": -7.4158935546875, + "step": 639 + }, + { + "epoch": 7.57396449704142, + "grad_norm": 3.606040906516179, + "learning_rate": 3.4548802869627804e-09, + "logits/chosen": -1.2556226253509521, + "logits/rejected": -1.1845530271530151, + "logps/chosen": -42.52489471435547, + "logps/rejected": -69.6248550415039, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0042614936828613, + "rewards/margins": 8.077693939208984, + "rewards/rejected": -11.081954956054688, + "step": 640 + }, + { + "epoch": 7.585798816568047, + "grad_norm": 3.105559539498645, + "learning_rate": 3.2427849272035067e-09, + "logits/chosen": -0.9435504674911499, + "logits/rejected": -0.9759422540664673, + "logps/chosen": -40.1533203125, + "logps/rejected": -66.1624526977539, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.300183653831482, + "rewards/margins": 7.85737943649292, + "rewards/rejected": -9.157563209533691, + "step": 641 + }, + { + "epoch": 7.597633136094674, + "grad_norm": 3.558021048910989, + "learning_rate": 3.037365235741024e-09, + "logits/chosen": -1.1166036128997803, + "logits/rejected": -1.038915753364563, + "logps/chosen": -46.17367935180664, + "logps/rejected": -77.80493927001953, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5460665225982666, + "rewards/margins": 9.366143226623535, + "rewards/rejected": -11.912210464477539, + "step": 642 + }, + { + "epoch": 7.609467455621302, + "grad_norm": 3.2180872359588033, + "learning_rate": 2.8386267699152256e-09, + "logits/chosen": -1.32170832157135, + "logits/rejected": -1.1445355415344238, + "logps/chosen": -33.306297302246094, + "logps/rejected": -64.81905364990234, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5187739133834839, + "rewards/margins": 7.3708953857421875, + "rewards/rejected": -8.889669418334961, + "step": 643 + }, + { + "epoch": 7.621301775147929, + "grad_norm": 4.516310865033977, + "learning_rate": 2.6465749063149245e-09, + "logits/chosen": -1.3887925148010254, + "logits/rejected": -1.5802987813949585, + "logps/chosen": -41.122772216796875, + "logps/rejected": -60.960174560546875, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43830376863479614, + "rewards/margins": 8.353026390075684, + "rewards/rejected": -8.79133129119873, + "step": 644 + }, + { + "epoch": 7.633136094674557, + "grad_norm": 3.522662435537469, + "learning_rate": 2.461214840632331e-09, + "logits/chosen": -1.2272520065307617, + "logits/rejected": -1.2148579359054565, + "logps/chosen": -40.47775650024414, + "logps/rejected": -66.96976470947266, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9865707159042358, + "rewards/margins": 8.534340858459473, + "rewards/rejected": -9.520912170410156, + "step": 645 + }, + { + "epoch": 7.644970414201183, + "grad_norm": 5.08495729529155, + "learning_rate": 2.282551587522441e-09, + "logits/chosen": -1.2286673784255981, + "logits/rejected": -1.187401294708252, + "logps/chosen": -38.408416748046875, + "logps/rejected": -66.8298110961914, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.147416591644287, + "rewards/margins": 8.060477256774902, + "rewards/rejected": -10.207894325256348, + "step": 646 + }, + { + "epoch": 7.65680473372781, + "grad_norm": 3.2465540149343206, + "learning_rate": 2.1105899804675363e-09, + "logits/chosen": -1.167961597442627, + "logits/rejected": -1.1866670846939087, + "logps/chosen": -45.849952697753906, + "logps/rejected": -69.32218170166016, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9047467708587646, + "rewards/margins": 8.320653915405273, + "rewards/rejected": -10.225400924682617, + "step": 647 + }, + { + "epoch": 7.668639053254438, + "grad_norm": 3.0376394347131352, + "learning_rate": 1.9453346716462316e-09, + "logits/chosen": -1.45213782787323, + "logits/rejected": -1.4141486883163452, + "logps/chosen": -36.55447006225586, + "logps/rejected": -63.271644592285156, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9351499080657959, + "rewards/margins": 8.255223274230957, + "rewards/rejected": -9.190373420715332, + "step": 648 + }, + { + "epoch": 7.680473372781065, + "grad_norm": 4.323080858658896, + "learning_rate": 1.7867901318077695e-09, + "logits/chosen": -1.2777527570724487, + "logits/rejected": -1.1896119117736816, + "logps/chosen": -50.66539764404297, + "logps/rejected": -77.70809936523438, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.193788766860962, + "rewards/margins": 8.507574081420898, + "rewards/rejected": -11.701362609863281, + "step": 649 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 5.437515225024147, + "learning_rate": 1.6349606501509794e-09, + "logits/chosen": -1.4479446411132812, + "logits/rejected": -1.4918639659881592, + "logps/chosen": -37.68550109863281, + "logps/rejected": -60.039100646972656, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.733797550201416, + "rewards/margins": 7.636280059814453, + "rewards/rejected": -9.370077133178711, + "step": 650 + }, + { + "epoch": 7.704142011834319, + "grad_norm": 4.271376085946802, + "learning_rate": 1.489850334208259e-09, + "logits/chosen": -1.4708189964294434, + "logits/rejected": -1.3335071802139282, + "logps/chosen": -36.9536247253418, + "logps/rejected": -68.65011596679688, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.618829369544983, + "rewards/margins": 7.648519515991211, + "rewards/rejected": -9.267349243164062, + "step": 651 + }, + { + "epoch": 7.715976331360947, + "grad_norm": 4.326704771148402, + "learning_rate": 1.351463109734441e-09, + "logits/chosen": -1.3178719282150269, + "logits/rejected": -1.2290101051330566, + "logps/chosen": -39.71061325073242, + "logps/rejected": -64.45901489257812, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.014697790145874, + "rewards/margins": 7.242136478424072, + "rewards/rejected": -8.256834030151367, + "step": 652 + }, + { + "epoch": 7.727810650887574, + "grad_norm": 3.7356892625464018, + "learning_rate": 1.2198027206006822e-09, + "logits/chosen": -1.1409059762954712, + "logits/rejected": -1.035496711730957, + "logps/chosen": -44.18361282348633, + "logps/rejected": -72.97080993652344, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7735768556594849, + "rewards/margins": 8.308616638183594, + "rewards/rejected": -10.082193374633789, + "step": 653 + }, + { + "epoch": 7.739644970414201, + "grad_norm": 3.888887440172467, + "learning_rate": 1.0948727286930192e-09, + "logits/chosen": -1.0706766843795776, + "logits/rejected": -1.0967326164245605, + "logps/chosen": -52.38780212402344, + "logps/rejected": -74.34490966796875, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6245856285095215, + "rewards/margins": 8.349431037902832, + "rewards/rejected": -10.974016189575195, + "step": 654 + }, + { + "epoch": 7.7514792899408285, + "grad_norm": 4.910196712545756, + "learning_rate": 9.766765138160827e-10, + "logits/chosen": -1.3771216869354248, + "logits/rejected": -1.3673535585403442, + "logps/chosen": -26.542402267456055, + "logps/rejected": -53.72931671142578, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7858462929725647, + "rewards/margins": 8.258723258972168, + "rewards/rejected": -9.044569969177246, + "step": 655 + }, + { + "epoch": 7.763313609467455, + "grad_norm": 3.6303736143642094, + "learning_rate": 8.652172736017816e-10, + "logits/chosen": -1.2556291818618774, + "logits/rejected": -1.3194972276687622, + "logps/chosen": -45.683570861816406, + "logps/rejected": -59.34172821044922, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3988208770751953, + "rewards/margins": 7.259479522705078, + "rewards/rejected": -8.658300399780273, + "step": 656 + }, + { + "epoch": 7.775147928994083, + "grad_norm": 4.139059122264169, + "learning_rate": 7.604980234225122e-10, + "logits/chosen": -1.1913983821868896, + "logits/rejected": -1.2692341804504395, + "logps/chosen": -34.41987228393555, + "logps/rejected": -62.04487609863281, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5674678087234497, + "rewards/margins": 9.807378768920898, + "rewards/rejected": -10.374847412109375, + "step": 657 + }, + { + "epoch": 7.78698224852071, + "grad_norm": 3.618597489407487, + "learning_rate": 6.625215963098896e-10, + "logits/chosen": -1.019304871559143, + "logits/rejected": -1.0009970664978027, + "logps/chosen": -41.301448822021484, + "logps/rejected": -56.97692108154297, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1815185546875, + "rewards/margins": 5.260707855224609, + "rewards/rejected": -6.442225933074951, + "step": 658 + }, + { + "epoch": 7.798816568047338, + "grad_norm": 3.336765309826322, + "learning_rate": 5.712906428778919e-10, + "logits/chosen": -1.4606183767318726, + "logits/rejected": -1.3886492252349854, + "logps/chosen": -27.533708572387695, + "logps/rejected": -57.605125427246094, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1746628284454346, + "rewards/margins": 7.080971717834473, + "rewards/rejected": -8.255634307861328, + "step": 659 + }, + { + "epoch": 7.810650887573964, + "grad_norm": 3.5093648749750477, + "learning_rate": 4.868076312512515e-10, + "logits/chosen": -1.306321144104004, + "logits/rejected": -1.302777886390686, + "logps/chosen": -35.620086669921875, + "logps/rejected": -84.12001037597656, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.335205078125, + "rewards/margins": 9.759173393249512, + "rewards/rejected": -11.094379425048828, + "step": 660 + }, + { + "epoch": 7.822485207100591, + "grad_norm": 4.227789623534498, + "learning_rate": 4.090748469986471e-10, + "logits/chosen": -1.2036335468292236, + "logits/rejected": -1.2865405082702637, + "logps/chosen": -35.319541931152344, + "logps/rejected": -55.186180114746094, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08307170867919922, + "rewards/margins": 6.8585710525512695, + "rewards/rejected": -6.77549934387207, + "step": 661 + }, + { + "epoch": 7.834319526627219, + "grad_norm": 3.6041816520598307, + "learning_rate": 3.3809439307086463e-10, + "logits/chosen": -1.1416850090026855, + "logits/rejected": -1.2989619970321655, + "logps/chosen": -46.875675201416016, + "logps/rejected": -77.26806640625, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.14378023147583, + "rewards/margins": 8.901089668273926, + "rewards/rejected": -11.044870376586914, + "step": 662 + }, + { + "epoch": 7.846153846153846, + "grad_norm": 4.065861079364059, + "learning_rate": 2.7386818974395323e-10, + "logits/chosen": -1.4301998615264893, + "logits/rejected": -1.3408135175704956, + "logps/chosen": -44.26204299926758, + "logps/rejected": -67.56184387207031, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9295194149017334, + "rewards/margins": 6.951984882354736, + "rewards/rejected": -9.88150405883789, + "step": 663 + }, + { + "epoch": 7.8579881656804735, + "grad_norm": 3.4612690772571497, + "learning_rate": 2.1639797456723952e-10, + "logits/chosen": -1.22926664352417, + "logits/rejected": -1.2551913261413574, + "logps/chosen": -35.541709899902344, + "logps/rejected": -57.685508728027344, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7379496693611145, + "rewards/margins": 7.399726867675781, + "rewards/rejected": -8.137676239013672, + "step": 664 + }, + { + "epoch": 7.8698224852071, + "grad_norm": 3.9801474006233795, + "learning_rate": 1.6568530231628185e-10, + "logits/chosen": -1.4825738668441772, + "logits/rejected": -1.4905058145523071, + "logps/chosen": -37.85133361816406, + "logps/rejected": -63.87050247192383, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4308565855026245, + "rewards/margins": 8.55210018157959, + "rewards/rejected": -9.982955932617188, + "step": 665 + }, + { + "epoch": 7.881656804733728, + "grad_norm": 4.104029377641207, + "learning_rate": 1.21731544950876e-10, + "logits/chosen": -1.226098656654358, + "logits/rejected": -1.371010422706604, + "logps/chosen": -48.73766326904297, + "logps/rejected": -67.00294494628906, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1735711097717285, + "rewards/margins": 7.302567481994629, + "rewards/rejected": -9.476139068603516, + "step": 666 + }, + { + "epoch": 7.893491124260355, + "grad_norm": 4.0448161239485145, + "learning_rate": 8.453789157794599e-11, + "logits/chosen": -1.1803443431854248, + "logits/rejected": -1.2014808654785156, + "logps/chosen": -38.218624114990234, + "logps/rejected": -63.20378112792969, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.791195273399353, + "rewards/margins": 8.358329772949219, + "rewards/rejected": -9.14952564239502, + "step": 667 + }, + { + "epoch": 7.905325443786982, + "grad_norm": 4.422193721862127, + "learning_rate": 5.4105348419264394e-11, + "logits/chosen": -1.3956677913665771, + "logits/rejected": -1.1959235668182373, + "logps/chosen": -37.42644119262695, + "logps/rejected": -67.5661392211914, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9582293629646301, + "rewards/margins": 8.263663291931152, + "rewards/rejected": -9.221893310546875, + "step": 668 + }, + { + "epoch": 7.9171597633136095, + "grad_norm": 2.7605966616715523, + "learning_rate": 3.043473878436287e-11, + "logits/chosen": -1.191653847694397, + "logits/rejected": -1.153673768043518, + "logps/chosen": -44.5345573425293, + "logps/rejected": -70.56991577148438, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7923838496208191, + "rewards/margins": 8.033748626708984, + "rewards/rejected": -8.826131820678711, + "step": 669 + }, + { + "epoch": 7.928994082840236, + "grad_norm": 3.739721658753547, + "learning_rate": 1.3526703048216682e-11, + "logits/chosen": -1.5901668071746826, + "logits/rejected": -1.452099323272705, + "logps/chosen": -38.17021942138672, + "logps/rejected": -63.99696350097656, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6206248998641968, + "rewards/margins": 6.980554580688477, + "rewards/rejected": -8.601179122924805, + "step": 670 + }, + { + "epoch": 7.940828402366864, + "grad_norm": 4.73168147418833, + "learning_rate": 3.3816986338142117e-12, + "logits/chosen": -1.312976598739624, + "logits/rejected": -1.323945164680481, + "logps/chosen": -33.24012756347656, + "logps/rejected": -62.29106140136719, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5721168518066406, + "rewards/margins": 8.698356628417969, + "rewards/rejected": -10.27047348022461, + "step": 671 + }, + { + "epoch": 7.952662721893491, + "grad_norm": 4.043153951778064, + "learning_rate": 0.0, + "logits/chosen": -1.2264971733093262, + "logits/rejected": -1.2341418266296387, + "logps/chosen": -45.756378173828125, + "logps/rejected": -61.860496520996094, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.606259822845459, + "rewards/margins": 6.51239013671875, + "rewards/rejected": -9.118650436401367, + "step": 672 + }, + { + "epoch": 7.952662721893491, + "step": 672, + "total_flos": 0.0, + "train_loss": 0.13648563410000256, + "train_runtime": 6735.2566, + "train_samples_per_second": 12.822, + "train_steps_per_second": 0.1 + } + ], + "logging_steps": 1, + "max_steps": 672, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 300, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}