diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7158 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.988679245283019, + "eval_steps": 500, + "global_step": 396, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "grad_norm": 8.418300067215691, + "learning_rate": 1.25e-08, + "logps/chosen": -39.02219009399414, + "logps/rejected": -45.12399673461914, + "loss": 0.6931, + "losses/dpo": 0.6931471824645996, + "losses/sft": 1.552122950553894, + "losses/total": 0.6931471824645996, + "ref_logps/chosen": -39.02219009399414, + "ref_logps/rejected": -45.12399673461914, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.02, + "grad_norm": 7.6721075942113535, + "learning_rate": 2.5e-08, + "logps/chosen": -37.21428680419922, + "logps/rejected": -44.4819221496582, + "loss": 0.6931, + "losses/dpo": 0.6931471824645996, + "losses/sft": 1.6663763523101807, + "losses/total": 0.6931471824645996, + "ref_logps/chosen": -37.21428680419922, + "ref_logps/rejected": -44.4819221496582, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.02, + "grad_norm": 7.948340327346078, + "learning_rate": 3.75e-08, + "logps/chosen": -41.46142578125, + "logps/rejected": -52.18663024902344, + "loss": 0.6926, + "losses/dpo": 0.6867616176605225, + "losses/sft": 1.7890703678131104, + "losses/total": 0.6867616176605225, + "ref_logps/chosen": -41.46522903442383, + "ref_logps/rejected": -52.1768798828125, + "rewards/accuracies": 0.515625, + "rewards/chosen": 0.0003804098814725876, + "rewards/margins": 0.0013556077610701323, + "rewards/rejected": -0.0009751979378052056, + "step": 3 + }, + { + "epoch": 0.03, + "grad_norm": 8.038993728778431, + "learning_rate": 5e-08, + "logps/chosen": -39.45478057861328, + "logps/rejected": -45.85334014892578, + "loss": 0.6936, + "losses/dpo": 0.6930198073387146, + "losses/sft": 1.6549196243286133, + "losses/total": 0.6930198073387146, + "ref_logps/chosen": -39.42698287963867, + "ref_logps/rejected": -45.83390426635742, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.002779680071398616, + "rewards/margins": -0.0008358716731891036, + "rewards/rejected": -0.001943808514624834, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 7.749366133023054, + "learning_rate": 6.25e-08, + "logps/chosen": -44.29286193847656, + "logps/rejected": -51.08875274658203, + "loss": 0.695, + "losses/dpo": 0.6971508264541626, + "losses/sft": 1.3629276752471924, + "losses/total": 0.6971508264541626, + "ref_logps/chosen": -44.301361083984375, + "ref_logps/rejected": -51.132286071777344, + "rewards/accuracies": 0.4453125, + "rewards/chosen": 0.0008498989045619965, + "rewards/margins": -0.003503247397020459, + "rewards/rejected": 0.004353146068751812, + "step": 5 + }, + { + "epoch": 0.05, + "grad_norm": 7.545034174633802, + "learning_rate": 7.5e-08, + "logps/chosen": -37.192138671875, + "logps/rejected": -44.56536865234375, + "loss": 0.693, + "losses/dpo": 0.6901252269744873, + "losses/sft": 1.235260248184204, + "losses/total": 0.6901252269744873, + "ref_logps/chosen": -37.197486877441406, + "ref_logps/rejected": -44.56662368774414, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.000535178929567337, + "rewards/margins": 0.000409391475841403, + "rewards/rejected": 0.00012578748282976449, + "step": 6 + }, + { + "epoch": 0.05, + "grad_norm": 7.645026557219469, + "learning_rate": 8.75e-08, + "logps/chosen": -40.067909240722656, + "logps/rejected": -46.251487731933594, + "loss": 0.6941, + "losses/dpo": 0.695063054561615, + "losses/sft": 1.8211989402770996, + "losses/total": 0.695063054561615, + "ref_logps/chosen": -40.05988311767578, + "ref_logps/rejected": -46.26015090942383, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.0008021063404157758, + "rewards/margins": -0.001668928423896432, + "rewards/rejected": 0.000866822199895978, + "step": 7 + }, + { + "epoch": 0.06, + "grad_norm": 8.091132576285203, + "learning_rate": 1e-07, + "logps/chosen": -44.73344802856445, + "logps/rejected": -46.818824768066406, + "loss": 0.6949, + "losses/dpo": 0.6943204402923584, + "losses/sft": 1.470657229423523, + "losses/total": 0.6943204402923584, + "ref_logps/chosen": -44.7131233215332, + "ref_logps/rejected": -46.83186340332031, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.0020323917269706726, + "rewards/margins": -0.0033366940915584564, + "rewards/rejected": 0.0013043024810031056, + "step": 8 + }, + { + "epoch": 0.07, + "grad_norm": 7.987567478058529, + "learning_rate": 1.125e-07, + "logps/chosen": -40.069969177246094, + "logps/rejected": -50.12396240234375, + "loss": 0.6944, + "losses/dpo": 0.6880265474319458, + "losses/sft": 1.1660749912261963, + "losses/total": 0.6880265474319458, + "ref_logps/chosen": -40.05424118041992, + "ref_logps/rejected": -50.13201904296875, + "rewards/accuracies": 0.4765625, + "rewards/chosen": -0.0015729822916910052, + "rewards/margins": -0.002378995530307293, + "rewards/rejected": 0.000806013063993305, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 7.458190702935405, + "learning_rate": 1.25e-07, + "logps/chosen": -36.18950271606445, + "logps/rejected": -45.11402130126953, + "loss": 0.694, + "losses/dpo": 0.6928867697715759, + "losses/sft": 1.605255365371704, + "losses/total": 0.6928867697715759, + "ref_logps/chosen": -36.18606185913086, + "ref_logps/rejected": -45.12499237060547, + "rewards/accuracies": 0.4765625, + "rewards/chosen": -0.00034411592059768736, + "rewards/margins": -0.0014411872252821922, + "rewards/rejected": 0.0010970717994496226, + "step": 10 + }, + { + "epoch": 0.08, + "grad_norm": 7.498431737676318, + "learning_rate": 1.375e-07, + "logps/chosen": -38.895912170410156, + "logps/rejected": -44.2772216796875, + "loss": 0.6922, + "losses/dpo": 0.6932664513587952, + "losses/sft": 1.4097728729248047, + "losses/total": 0.6932664513587952, + "ref_logps/chosen": -38.896759033203125, + "ref_logps/rejected": -44.25825881958008, + "rewards/accuracies": 0.53125, + "rewards/chosen": 8.490856271237135e-05, + "rewards/margins": 0.0019812812097370625, + "rewards/rejected": -0.001896372647024691, + "step": 11 + }, + { + "epoch": 0.09, + "grad_norm": 7.699329532824058, + "learning_rate": 1.5e-07, + "logps/chosen": -41.140281677246094, + "logps/rejected": -45.357364654541016, + "loss": 0.6914, + "losses/dpo": 0.6933699250221252, + "losses/sft": 1.6783134937286377, + "losses/total": 0.6933699250221252, + "ref_logps/chosen": -41.16625213623047, + "ref_logps/rejected": -45.34600830078125, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.002597447484731674, + "rewards/margins": 0.0037334603257477283, + "rewards/rejected": -0.0011360126081854105, + "step": 12 + }, + { + "epoch": 0.1, + "grad_norm": 7.69326977444187, + "learning_rate": 1.625e-07, + "logps/chosen": -40.31540298461914, + "logps/rejected": -50.180397033691406, + "loss": 0.6925, + "losses/dpo": 0.6910371780395508, + "losses/sft": 1.366438865661621, + "losses/total": 0.6910371780395508, + "ref_logps/chosen": -40.30924987792969, + "ref_logps/rejected": -50.16073989868164, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.0006153000867925584, + "rewards/margins": 0.0013502361252903938, + "rewards/rejected": -0.0019655367359519005, + "step": 13 + }, + { + "epoch": 0.11, + "grad_norm": 8.114372207024019, + "learning_rate": 1.75e-07, + "logps/chosen": -37.29108428955078, + "logps/rejected": -44.525848388671875, + "loss": 0.6911, + "losses/dpo": 0.6899442076683044, + "losses/sft": 1.4768216609954834, + "losses/total": 0.6899442076683044, + "ref_logps/chosen": -37.311187744140625, + "ref_logps/rejected": -44.50410079956055, + "rewards/accuracies": 0.5859375, + "rewards/chosen": 0.0020105522125959396, + "rewards/margins": 0.004185608588159084, + "rewards/rejected": -0.0021750556770712137, + "step": 14 + }, + { + "epoch": 0.11, + "grad_norm": 7.835230950103234, + "learning_rate": 1.875e-07, + "logps/chosen": -38.33734893798828, + "logps/rejected": -43.93443298339844, + "loss": 0.6932, + "losses/dpo": 0.6937546133995056, + "losses/sft": 1.4263617992401123, + "losses/total": 0.6937546133995056, + "ref_logps/chosen": -38.31421661376953, + "ref_logps/rejected": -43.90996170043945, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.0023131906054913998, + "rewards/margins": 0.00013422727352008224, + "rewards/rejected": -0.002447417937219143, + "step": 15 + }, + { + "epoch": 0.12, + "grad_norm": 7.58949645380321, + "learning_rate": 2e-07, + "logps/chosen": -40.26841735839844, + "logps/rejected": -43.40159225463867, + "loss": 0.6934, + "losses/dpo": 0.7032474279403687, + "losses/sft": 1.5701673030853271, + "losses/total": 0.7032474279403687, + "ref_logps/chosen": -40.24923324584961, + "ref_logps/rejected": -43.38625717163086, + "rewards/accuracies": 0.4921875, + "rewards/chosen": -0.0019177356734871864, + "rewards/margins": -0.0003846373874694109, + "rewards/rejected": -0.0015330985188484192, + "step": 16 + }, + { + "epoch": 0.13, + "grad_norm": 7.829467009021115, + "learning_rate": 2.1249999999999998e-07, + "logps/chosen": -41.63703536987305, + "logps/rejected": -46.70919418334961, + "loss": 0.6952, + "losses/dpo": 0.6850873231887817, + "losses/sft": 1.4479947090148926, + "losses/total": 0.6850873231887817, + "ref_logps/chosen": -41.58295822143555, + "ref_logps/rejected": -46.69389343261719, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.005406979937106371, + "rewards/margins": -0.003876863745972514, + "rewards/rejected": -0.0015301161911338568, + "step": 17 + }, + { + "epoch": 0.14, + "grad_norm": 7.485609028305383, + "learning_rate": 2.25e-07, + "logps/chosen": -40.4469108581543, + "logps/rejected": -44.94635009765625, + "loss": 0.6927, + "losses/dpo": 0.6907854080200195, + "losses/sft": 1.1833800077438354, + "losses/total": 0.6907854080200195, + "ref_logps/chosen": -40.407257080078125, + "ref_logps/rejected": -44.894813537597656, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.003965577110648155, + "rewards/margins": 0.0011882353574037552, + "rewards/rejected": -0.005153812933713198, + "step": 18 + }, + { + "epoch": 0.14, + "grad_norm": 7.404661140565325, + "learning_rate": 2.3749999999999998e-07, + "logps/chosen": -35.739524841308594, + "logps/rejected": -46.330265045166016, + "loss": 0.6923, + "losses/dpo": 0.6935074329376221, + "losses/sft": 1.8608835935592651, + "losses/total": 0.6935074329376221, + "ref_logps/chosen": -35.708274841308594, + "ref_logps/rejected": -46.27949523925781, + "rewards/accuracies": 0.5078125, + "rewards/chosen": -0.003125070594251156, + "rewards/margins": 0.001951692276634276, + "rewards/rejected": -0.00507676275447011, + "step": 19 + }, + { + "epoch": 0.15, + "grad_norm": 7.514252016049334, + "learning_rate": 2.5e-07, + "logps/chosen": -40.884029388427734, + "logps/rejected": -47.1005859375, + "loss": 0.6939, + "losses/dpo": 0.6919451355934143, + "losses/sft": 1.3290549516677856, + "losses/total": 0.6919451355934143, + "ref_logps/chosen": -40.82601547241211, + "ref_logps/rejected": -47.054588317871094, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.0058018057607114315, + "rewards/margins": -0.001202343963086605, + "rewards/rejected": -0.004599461797624826, + "step": 20 + }, + { + "epoch": 0.16, + "grad_norm": 11.774765376903847, + "learning_rate": 2.625e-07, + "logps/chosen": -40.38345718383789, + "logps/rejected": -51.66474914550781, + "loss": 0.6922, + "losses/dpo": 0.6949824690818787, + "losses/sft": 1.548210620880127, + "losses/total": 0.6949824690818787, + "ref_logps/chosen": -40.31751251220703, + "ref_logps/rejected": -51.57673645019531, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.006594239268451929, + "rewards/margins": 0.002207120880484581, + "rewards/rejected": -0.008801360614597797, + "step": 21 + }, + { + "epoch": 0.17, + "grad_norm": 7.542951860866026, + "learning_rate": 2.75e-07, + "logps/chosen": -38.18585205078125, + "logps/rejected": -47.13993835449219, + "loss": 0.6924, + "losses/dpo": 0.6899946928024292, + "losses/sft": 1.1579391956329346, + "losses/total": 0.6899946928024292, + "ref_logps/chosen": -38.091922760009766, + "ref_logps/rejected": -47.029541015625, + "rewards/accuracies": 0.4765625, + "rewards/chosen": -0.009392979554831982, + "rewards/margins": 0.0016468917019665241, + "rewards/rejected": -0.011039872653782368, + "step": 22 + }, + { + "epoch": 0.17, + "grad_norm": 7.876385596657873, + "learning_rate": 2.8749999999999995e-07, + "logps/chosen": -38.30439376831055, + "logps/rejected": -49.61843490600586, + "loss": 0.6895, + "losses/dpo": 0.6851339936256409, + "losses/sft": 1.5843318700790405, + "losses/total": 0.6851339936256409, + "ref_logps/chosen": -38.242801666259766, + "ref_logps/rejected": -49.4824104309082, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.006159077398478985, + "rewards/margins": 0.007443387992680073, + "rewards/rejected": -0.013602466322481632, + "step": 23 + }, + { + "epoch": 0.18, + "grad_norm": 7.668314504955077, + "learning_rate": 3e-07, + "logps/chosen": -40.48552322387695, + "logps/rejected": -46.48503494262695, + "loss": 0.6895, + "losses/dpo": 0.6920894980430603, + "losses/sft": 1.4189947843551636, + "losses/total": 0.6920894980430603, + "ref_logps/chosen": -40.395687103271484, + "ref_logps/rejected": -46.31945037841797, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.008983338251709938, + "rewards/margins": 0.007574939634650946, + "rewards/rejected": -0.01655827835202217, + "step": 24 + }, + { + "epoch": 0.19, + "grad_norm": 7.344435357199023, + "learning_rate": 3.1249999999999997e-07, + "logps/chosen": -39.81501388549805, + "logps/rejected": -45.954071044921875, + "loss": 0.6914, + "losses/dpo": 0.6927582621574402, + "losses/sft": 1.6129871606826782, + "losses/total": 0.6927582621574402, + "ref_logps/chosen": -39.700050354003906, + "ref_logps/rejected": -45.80275344848633, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.011496355757117271, + "rewards/margins": 0.0036351331509649754, + "rewards/rejected": -0.01513148844242096, + "step": 25 + }, + { + "epoch": 0.2, + "grad_norm": 7.977738556938539, + "learning_rate": 3.25e-07, + "logps/chosen": -39.41866683959961, + "logps/rejected": -47.94341278076172, + "loss": 0.6919, + "losses/dpo": 0.6823984384536743, + "losses/sft": 1.1218098402023315, + "losses/total": 0.6823984384536743, + "ref_logps/chosen": -39.244632720947266, + "ref_logps/rejected": -47.742191314697266, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.017403149977326393, + "rewards/margins": 0.002718748524785042, + "rewards/rejected": -0.020121898502111435, + "step": 26 + }, + { + "epoch": 0.2, + "grad_norm": 7.137045291395353, + "learning_rate": 3.375e-07, + "logps/chosen": -36.09528350830078, + "logps/rejected": -43.786441802978516, + "loss": 0.6934, + "losses/dpo": 0.7064226269721985, + "losses/sft": 1.185333251953125, + "losses/total": 0.7064226269721985, + "ref_logps/chosen": -35.91387939453125, + "ref_logps/rejected": -43.607757568359375, + "rewards/accuracies": 0.5390625, + "rewards/chosen": -0.01814102753996849, + "rewards/margins": -0.0002719040203373879, + "rewards/rejected": -0.017869124189019203, + "step": 27 + }, + { + "epoch": 0.21, + "grad_norm": 7.379899822037054, + "learning_rate": 3.5e-07, + "logps/chosen": -44.72355270385742, + "logps/rejected": -47.34676742553711, + "loss": 0.6909, + "losses/dpo": 0.6887847185134888, + "losses/sft": 1.6178021430969238, + "losses/total": 0.6887847185134888, + "ref_logps/chosen": -44.496578216552734, + "ref_logps/rejected": -47.07260513305664, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.02269744500517845, + "rewards/margins": 0.004718274809420109, + "rewards/rejected": -0.027415720745921135, + "step": 28 + }, + { + "epoch": 0.22, + "grad_norm": 8.0485668325091, + "learning_rate": 3.6249999999999997e-07, + "logps/chosen": -41.700931549072266, + "logps/rejected": -50.03131103515625, + "loss": 0.6893, + "losses/dpo": 0.6823500394821167, + "losses/sft": 1.4876271486282349, + "losses/total": 0.6823500394821167, + "ref_logps/chosen": -41.42716979980469, + "ref_logps/rejected": -49.678436279296875, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.027376368641853333, + "rewards/margins": 0.007911860011518002, + "rewards/rejected": -0.03528822585940361, + "step": 29 + }, + { + "epoch": 0.23, + "grad_norm": 7.828639871809106, + "learning_rate": 3.75e-07, + "logps/chosen": -41.79233932495117, + "logps/rejected": -48.521629333496094, + "loss": 0.6884, + "losses/dpo": 0.6861717700958252, + "losses/sft": 1.3226033449172974, + "losses/total": 0.6861717700958252, + "ref_logps/chosen": -41.542991638183594, + "ref_logps/rejected": -48.17402648925781, + "rewards/accuracies": 0.5546875, + "rewards/chosen": -0.024935012683272362, + "rewards/margins": 0.009825671091675758, + "rewards/rejected": -0.03476068750023842, + "step": 30 + }, + { + "epoch": 0.23, + "grad_norm": 8.365578824729102, + "learning_rate": 3.875e-07, + "logps/chosen": -41.142433166503906, + "logps/rejected": -48.93161392211914, + "loss": 0.6954, + "losses/dpo": 0.6871266961097717, + "losses/sft": 1.0435420274734497, + "losses/total": 0.6871266961097717, + "ref_logps/chosen": -40.73477554321289, + "ref_logps/rejected": -48.56393814086914, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.04076562076807022, + "rewards/margins": -0.00399819714948535, + "rewards/rejected": -0.03676741570234299, + "step": 31 + }, + { + "epoch": 0.24, + "grad_norm": 7.2403699625390585, + "learning_rate": 4e-07, + "logps/chosen": -37.81801223754883, + "logps/rejected": -46.747371673583984, + "loss": 0.6899, + "losses/dpo": 0.696927011013031, + "losses/sft": 1.7572060823440552, + "losses/total": 0.696927011013031, + "ref_logps/chosen": -37.4744758605957, + "ref_logps/rejected": -46.330135345458984, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.03435356542468071, + "rewards/margins": 0.007370149716734886, + "rewards/rejected": -0.04172371327877045, + "step": 32 + }, + { + "epoch": 0.25, + "grad_norm": 7.310694198303384, + "learning_rate": 4.1249999999999997e-07, + "logps/chosen": -35.633243560791016, + "logps/rejected": -41.00613021850586, + "loss": 0.6852, + "losses/dpo": 0.6834661960601807, + "losses/sft": 1.3767448663711548, + "losses/total": 0.6834661960601807, + "ref_logps/chosen": -35.318153381347656, + "ref_logps/rejected": -40.5264892578125, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.03150848299264908, + "rewards/margins": 0.016455503180623055, + "rewards/rejected": -0.04796398803591728, + "step": 33 + }, + { + "epoch": 0.26, + "grad_norm": 7.510894622614614, + "learning_rate": 4.2499999999999995e-07, + "logps/chosen": -42.22370529174805, + "logps/rejected": -48.45228576660156, + "loss": 0.688, + "losses/dpo": 0.6784626841545105, + "losses/sft": 1.7890020608901978, + "losses/total": 0.6784626841545105, + "ref_logps/chosen": -41.76036834716797, + "ref_logps/rejected": -47.876914978027344, + "rewards/accuracies": 0.5546875, + "rewards/chosen": -0.046334076672792435, + "rewards/margins": 0.011203275993466377, + "rewards/rejected": -0.05753735080361366, + "step": 34 + }, + { + "epoch": 0.26, + "grad_norm": 7.524260365013066, + "learning_rate": 4.375e-07, + "logps/chosen": -41.117164611816406, + "logps/rejected": -47.30539321899414, + "loss": 0.6874, + "losses/dpo": 0.6890352368354797, + "losses/sft": 1.9127196073532104, + "losses/total": 0.6890352368354797, + "ref_logps/chosen": -40.611358642578125, + "ref_logps/rejected": -46.675148010253906, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.0505804568529129, + "rewards/margins": 0.01244389358907938, + "rewards/rejected": -0.0630243569612503, + "step": 35 + }, + { + "epoch": 0.27, + "grad_norm": 7.835502878067996, + "learning_rate": 4.5e-07, + "logps/chosen": -42.31553649902344, + "logps/rejected": -48.77557373046875, + "loss": 0.6846, + "losses/dpo": 0.6801737546920776, + "losses/sft": 1.0419285297393799, + "losses/total": 0.6801737546920776, + "ref_logps/chosen": -41.76920700073242, + "ref_logps/rejected": -48.04461669921875, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.054632995277643204, + "rewards/margins": 0.0184622872620821, + "rewards/rejected": -0.07309528440237045, + "step": 36 + }, + { + "epoch": 0.28, + "grad_norm": 10.089436620525504, + "learning_rate": 4.625e-07, + "logps/chosen": -40.23670196533203, + "logps/rejected": -44.72167205810547, + "loss": 0.6856, + "losses/dpo": 0.6820752620697021, + "losses/sft": 1.6919959783554077, + "losses/total": 0.6820752620697021, + "ref_logps/chosen": -39.70112991333008, + "ref_logps/rejected": -44.0257682800293, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.05355698987841606, + "rewards/margins": 0.016033286228775978, + "rewards/rejected": -0.06959027796983719, + "step": 37 + }, + { + "epoch": 0.29, + "grad_norm": 7.4705752998877974, + "learning_rate": 4.7499999999999995e-07, + "logps/chosen": -40.608951568603516, + "logps/rejected": -46.77935791015625, + "loss": 0.6846, + "losses/dpo": 0.7083909511566162, + "losses/sft": 1.3596407175064087, + "losses/total": 0.7083909511566162, + "ref_logps/chosen": -40.02342987060547, + "ref_logps/rejected": -46.008670806884766, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.05855226144194603, + "rewards/margins": 0.018516112118959427, + "rewards/rejected": -0.07706836611032486, + "step": 38 + }, + { + "epoch": 0.29, + "grad_norm": 7.4834962101051445, + "learning_rate": 4.875e-07, + "logps/chosen": -38.924591064453125, + "logps/rejected": -44.15880584716797, + "loss": 0.6806, + "losses/dpo": 0.6712931394577026, + "losses/sft": 1.4741321802139282, + "losses/total": 0.6712931394577026, + "ref_logps/chosen": -38.32358932495117, + "ref_logps/rejected": -43.294708251953125, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.060100097209215164, + "rewards/margins": 0.02630985900759697, + "rewards/rejected": -0.08640995621681213, + "step": 39 + }, + { + "epoch": 0.3, + "grad_norm": 7.593289226892247, + "learning_rate": 5e-07, + "logps/chosen": -36.14179611206055, + "logps/rejected": -43.69697952270508, + "loss": 0.6893, + "losses/dpo": 0.6905455589294434, + "losses/sft": 1.8340303897857666, + "losses/total": 0.6905455589294434, + "ref_logps/chosen": -35.46891784667969, + "ref_logps/rejected": -42.935638427734375, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.06728792935609818, + "rewards/margins": 0.008846651762723923, + "rewards/rejected": -0.0761345773935318, + "step": 40 + }, + { + "epoch": 0.31, + "grad_norm": 7.184019422431451, + "learning_rate": 4.985955056179775e-07, + "logps/chosen": -36.04156494140625, + "logps/rejected": -44.501773834228516, + "loss": 0.6765, + "losses/dpo": 0.6632527112960815, + "losses/sft": 1.562534213066101, + "losses/total": 0.6632527112960815, + "ref_logps/chosen": -35.38131332397461, + "ref_logps/rejected": -43.47880935668945, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.06602565199136734, + "rewards/margins": 0.036271147429943085, + "rewards/rejected": -0.10229679197072983, + "step": 41 + }, + { + "epoch": 0.32, + "grad_norm": 7.817004450242549, + "learning_rate": 4.97191011235955e-07, + "logps/chosen": -40.78254699707031, + "logps/rejected": -48.181861877441406, + "loss": 0.6803, + "losses/dpo": 0.7099467515945435, + "losses/sft": 1.8783167600631714, + "losses/total": 0.7099467515945435, + "ref_logps/chosen": -40.004154205322266, + "ref_logps/rejected": -47.115760803222656, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.07783940434455872, + "rewards/margins": 0.028770849108695984, + "rewards/rejected": -0.1066102534532547, + "step": 42 + }, + { + "epoch": 0.32, + "grad_norm": 7.433654746121009, + "learning_rate": 4.957865168539325e-07, + "logps/chosen": -40.71480941772461, + "logps/rejected": -47.88724136352539, + "loss": 0.6803, + "losses/dpo": 0.7072566151618958, + "losses/sft": 1.6432607173919678, + "losses/total": 0.7072566151618958, + "ref_logps/chosen": -39.739933013916016, + "ref_logps/rejected": -46.61668014526367, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.09748795628547668, + "rewards/margins": 0.029567349702119827, + "rewards/rejected": -0.1270553022623062, + "step": 43 + }, + { + "epoch": 0.33, + "grad_norm": 7.64577773243097, + "learning_rate": 4.943820224719101e-07, + "logps/chosen": -36.590328216552734, + "logps/rejected": -45.61329650878906, + "loss": 0.6824, + "losses/dpo": 0.6843121647834778, + "losses/sft": 2.019310235977173, + "losses/total": 0.6843121647834778, + "ref_logps/chosen": -35.54472732543945, + "ref_logps/rejected": -44.32211685180664, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.10456002503633499, + "rewards/margins": 0.024558255448937416, + "rewards/rejected": -0.12911829352378845, + "step": 44 + }, + { + "epoch": 0.34, + "grad_norm": 7.451674659941506, + "learning_rate": 4.929775280898877e-07, + "logps/chosen": -39.82986068725586, + "logps/rejected": -44.68933868408203, + "loss": 0.6767, + "losses/dpo": 0.6984357833862305, + "losses/sft": 1.321048617362976, + "losses/total": 0.6984357833862305, + "ref_logps/chosen": -38.77911376953125, + "ref_logps/rejected": -43.251163482666016, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.10507487505674362, + "rewards/margins": 0.038742441684007645, + "rewards/rejected": -0.14381732046604156, + "step": 45 + }, + { + "epoch": 0.35, + "grad_norm": 7.481580488082584, + "learning_rate": 4.915730337078651e-07, + "logps/chosen": -40.547637939453125, + "logps/rejected": -48.37934112548828, + "loss": 0.6784, + "losses/dpo": 0.682788610458374, + "losses/sft": 1.305440902709961, + "losses/total": 0.682788610458374, + "ref_logps/chosen": -39.39177703857422, + "ref_logps/rejected": -46.888671875, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.11558566987514496, + "rewards/margins": 0.03348149359226227, + "rewards/rejected": -0.14906716346740723, + "step": 46 + }, + { + "epoch": 0.35, + "grad_norm": 7.503690704863454, + "learning_rate": 4.901685393258427e-07, + "logps/chosen": -43.58844757080078, + "logps/rejected": -46.27735137939453, + "loss": 0.6796, + "losses/dpo": 0.6725805997848511, + "losses/sft": 1.7845996618270874, + "losses/total": 0.6725805997848511, + "ref_logps/chosen": -42.20465850830078, + "ref_logps/rejected": -44.57255554199219, + "rewards/accuracies": 0.5546875, + "rewards/chosen": -0.13837924599647522, + "rewards/margins": 0.03210053965449333, + "rewards/rejected": -0.17047978937625885, + "step": 47 + }, + { + "epoch": 0.36, + "grad_norm": 7.6005277154002275, + "learning_rate": 4.887640449438202e-07, + "logps/chosen": -40.19657897949219, + "logps/rejected": -46.2965087890625, + "loss": 0.6806, + "losses/dpo": 0.6915363669395447, + "losses/sft": 1.4134502410888672, + "losses/total": 0.6915363669395447, + "ref_logps/chosen": -38.93235778808594, + "ref_logps/rejected": -44.731178283691406, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.12642225623130798, + "rewards/margins": 0.030110429972410202, + "rewards/rejected": -0.15653270483016968, + "step": 48 + }, + { + "epoch": 0.37, + "grad_norm": 7.342034157176263, + "learning_rate": 4.873595505617978e-07, + "logps/chosen": -35.246055603027344, + "logps/rejected": -45.013092041015625, + "loss": 0.671, + "losses/dpo": 0.6273987293243408, + "losses/sft": 1.2200208902359009, + "losses/total": 0.6273987293243408, + "ref_logps/chosen": -33.989261627197266, + "ref_logps/rejected": -43.26504898071289, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.12567944824695587, + "rewards/margins": 0.04912441223859787, + "rewards/rejected": -0.17480388283729553, + "step": 49 + }, + { + "epoch": 0.38, + "grad_norm": 7.768856876362336, + "learning_rate": 4.859550561797752e-07, + "logps/chosen": -41.647483825683594, + "logps/rejected": -49.59557342529297, + "loss": 0.6668, + "losses/dpo": 0.6410457491874695, + "losses/sft": 2.0004844665527344, + "losses/total": 0.6410457491874695, + "ref_logps/chosen": -40.150108337402344, + "ref_logps/rejected": -47.4974365234375, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.1497381180524826, + "rewards/margins": 0.06007564440369606, + "rewards/rejected": -0.20981375873088837, + "step": 50 + }, + { + "epoch": 0.38, + "grad_norm": 7.482808682633612, + "learning_rate": 4.845505617977528e-07, + "logps/chosen": -42.28547668457031, + "logps/rejected": -46.57417297363281, + "loss": 0.6686, + "losses/dpo": 0.6821735501289368, + "losses/sft": 1.643945336341858, + "losses/total": 0.6821735501289368, + "ref_logps/chosen": -40.781097412109375, + "ref_logps/rejected": -44.47871017456055, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15043821930885315, + "rewards/margins": 0.05910744518041611, + "rewards/rejected": -0.20954564213752747, + "step": 51 + }, + { + "epoch": 0.39, + "grad_norm": 7.444552223013138, + "learning_rate": 4.831460674157303e-07, + "logps/chosen": -38.653770446777344, + "logps/rejected": -47.96025848388672, + "loss": 0.6696, + "losses/dpo": 0.653758704662323, + "losses/sft": 1.9075889587402344, + "losses/total": 0.653758704662323, + "ref_logps/chosen": -37.04439163208008, + "ref_logps/rejected": -45.76567840576172, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.16093730926513672, + "rewards/margins": 0.05852021649479866, + "rewards/rejected": -0.2194575071334839, + "step": 52 + }, + { + "epoch": 0.4, + "grad_norm": 7.051622049892774, + "learning_rate": 4.817415730337078e-07, + "logps/chosen": -36.511940002441406, + "logps/rejected": -42.634193420410156, + "loss": 0.672, + "losses/dpo": 0.6566299200057983, + "losses/sft": 1.6063774824142456, + "losses/total": 0.6566299200057983, + "ref_logps/chosen": -34.99435806274414, + "ref_logps/rejected": -40.612342834472656, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.15175840258598328, + "rewards/margins": 0.05042674392461777, + "rewards/rejected": -0.20218515396118164, + "step": 53 + }, + { + "epoch": 0.41, + "grad_norm": 7.752968590362967, + "learning_rate": 4.803370786516854e-07, + "logps/chosen": -43.374481201171875, + "logps/rejected": -46.1808967590332, + "loss": 0.664, + "losses/dpo": 0.5915548801422119, + "losses/sft": 1.5764846801757812, + "losses/total": 0.5915548801422119, + "ref_logps/chosen": -41.619239807128906, + "ref_logps/rejected": -43.757720947265625, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.17552456259727478, + "rewards/margins": 0.06679282337427139, + "rewards/rejected": -0.24231737852096558, + "step": 54 + }, + { + "epoch": 0.42, + "grad_norm": 7.375135165111918, + "learning_rate": 4.789325842696629e-07, + "logps/chosen": -40.775726318359375, + "logps/rejected": -45.556365966796875, + "loss": 0.6783, + "losses/dpo": 0.6768916845321655, + "losses/sft": 1.3732706308364868, + "losses/total": 0.6768916845321655, + "ref_logps/chosen": -39.01034927368164, + "ref_logps/rejected": -43.436553955078125, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.17653760313987732, + "rewards/margins": 0.03544352203607559, + "rewards/rejected": -0.21198111772537231, + "step": 55 + }, + { + "epoch": 0.42, + "grad_norm": 7.238231049231885, + "learning_rate": 4.775280898876405e-07, + "logps/chosen": -39.11316680908203, + "logps/rejected": -45.04530334472656, + "loss": 0.6642, + "losses/dpo": 0.6528148651123047, + "losses/sft": 1.159528136253357, + "losses/total": 0.6528148651123047, + "ref_logps/chosen": -37.2640266418457, + "ref_logps/rejected": -42.495338439941406, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.18491369485855103, + "rewards/margins": 0.07008323073387146, + "rewards/rejected": -0.2549969553947449, + "step": 56 + }, + { + "epoch": 0.43, + "grad_norm": 7.652181625277677, + "learning_rate": 4.7612359550561797e-07, + "logps/chosen": -43.244380950927734, + "logps/rejected": -49.415409088134766, + "loss": 0.6679, + "losses/dpo": 0.6176864504814148, + "losses/sft": 1.796196460723877, + "losses/total": 0.6176864504814148, + "ref_logps/chosen": -41.167381286621094, + "ref_logps/rejected": -46.70917892456055, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.20770025253295898, + "rewards/margins": 0.06292243301868439, + "rewards/rejected": -0.2706226706504822, + "step": 57 + }, + { + "epoch": 0.44, + "grad_norm": 8.012193852457372, + "learning_rate": 4.747191011235955e-07, + "logps/chosen": -38.95054626464844, + "logps/rejected": -45.45573043823242, + "loss": 0.6545, + "losses/dpo": 0.721019983291626, + "losses/sft": 1.6278411149978638, + "losses/total": 0.721019983291626, + "ref_logps/chosen": -37.17967987060547, + "ref_logps/rejected": -42.77497100830078, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.17708644270896912, + "rewards/margins": 0.09098967909812927, + "rewards/rejected": -0.2680761218070984, + "step": 58 + }, + { + "epoch": 0.45, + "grad_norm": 7.745152630727936, + "learning_rate": 4.7331460674157303e-07, + "logps/chosen": -41.84577560424805, + "logps/rejected": -54.23434066772461, + "loss": 0.654, + "losses/dpo": 0.601816713809967, + "losses/sft": 1.5886242389678955, + "losses/total": 0.601816713809967, + "ref_logps/chosen": -39.931434631347656, + "ref_logps/rejected": -51.34343719482422, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.19143418967723846, + "rewards/margins": 0.09765592962503433, + "rewards/rejected": -0.2890901267528534, + "step": 59 + }, + { + "epoch": 0.45, + "grad_norm": 8.104292528879256, + "learning_rate": 4.7191011235955054e-07, + "logps/chosen": -40.5402717590332, + "logps/rejected": -48.11115264892578, + "loss": 0.6612, + "losses/dpo": 0.6307883858680725, + "losses/sft": 1.6475903987884521, + "losses/total": 0.6307883858680725, + "ref_logps/chosen": -38.59111022949219, + "ref_logps/rejected": -45.39031219482422, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.19491644203662872, + "rewards/margins": 0.07716768234968185, + "rewards/rejected": -0.27208411693573, + "step": 60 + }, + { + "epoch": 0.46, + "grad_norm": 8.24076971848322, + "learning_rate": 4.705056179775281e-07, + "logps/chosen": -41.55039978027344, + "logps/rejected": -51.43959426879883, + "loss": 0.6677, + "losses/dpo": 0.7012457251548767, + "losses/sft": 2.175475597381592, + "losses/total": 0.7012457251548767, + "ref_logps/chosen": -39.29528045654297, + "ref_logps/rejected": -48.50824737548828, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.22551202774047852, + "rewards/margins": 0.06762254983186722, + "rewards/rejected": -0.2931345999240875, + "step": 61 + }, + { + "epoch": 0.47, + "grad_norm": 8.35481489409821, + "learning_rate": 4.691011235955056e-07, + "logps/chosen": -43.0926399230957, + "logps/rejected": -47.44728088378906, + "loss": 0.6462, + "losses/dpo": 0.6053961515426636, + "losses/sft": 1.459052324295044, + "losses/total": 0.6053961515426636, + "ref_logps/chosen": -40.96197509765625, + "ref_logps/rejected": -44.2166633605957, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.2130661904811859, + "rewards/margins": 0.1099955290555954, + "rewards/rejected": -0.3230617344379425, + "step": 62 + }, + { + "epoch": 0.48, + "grad_norm": 7.797774561384425, + "learning_rate": 4.6769662921348315e-07, + "logps/chosen": -38.055320739746094, + "logps/rejected": -47.66813659667969, + "loss": 0.652, + "losses/dpo": 0.632691502571106, + "losses/sft": 1.4375559091567993, + "losses/total": 0.632691502571106, + "ref_logps/chosen": -36.05341720581055, + "ref_logps/rejected": -44.64914321899414, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.20018979907035828, + "rewards/margins": 0.10170910507440567, + "rewards/rejected": -0.30189892649650574, + "step": 63 + }, + { + "epoch": 0.48, + "grad_norm": 7.28638958987082, + "learning_rate": 4.662921348314606e-07, + "logps/chosen": -39.81006622314453, + "logps/rejected": -46.78810501098633, + "loss": 0.662, + "losses/dpo": 0.6003807783126831, + "losses/sft": 1.3374682664871216, + "losses/total": 0.6003807783126831, + "ref_logps/chosen": -37.332698822021484, + "ref_logps/rejected": -43.507659912109375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.247736856341362, + "rewards/margins": 0.0803074836730957, + "rewards/rejected": -0.3280443251132965, + "step": 64 + }, + { + "epoch": 0.49, + "grad_norm": 7.798097069372596, + "learning_rate": 4.6488764044943816e-07, + "logps/chosen": -45.76461410522461, + "logps/rejected": -50.199825286865234, + "loss": 0.6814, + "losses/dpo": 0.6105685234069824, + "losses/sft": 1.8878819942474365, + "losses/total": 0.6105685234069824, + "ref_logps/chosen": -42.697021484375, + "ref_logps/rejected": -46.67133331298828, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.30675944685935974, + "rewards/margins": 0.046089351177215576, + "rewards/rejected": -0.35284876823425293, + "step": 65 + }, + { + "epoch": 0.5, + "grad_norm": 7.825603469401775, + "learning_rate": 4.634831460674157e-07, + "logps/chosen": -43.35639953613281, + "logps/rejected": -53.741355895996094, + "loss": 0.6436, + "losses/dpo": 0.6293699741363525, + "losses/sft": 1.4026882648468018, + "losses/total": 0.6293699741363525, + "ref_logps/chosen": -40.74187469482422, + "ref_logps/rejected": -49.880897521972656, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.2614526152610779, + "rewards/margins": 0.12459328025579453, + "rewards/rejected": -0.3860458731651306, + "step": 66 + }, + { + "epoch": 0.51, + "grad_norm": 7.635556989764748, + "learning_rate": 4.620786516853932e-07, + "logps/chosen": -40.68864440917969, + "logps/rejected": -46.91835021972656, + "loss": 0.6539, + "losses/dpo": 0.7432792782783508, + "losses/sft": 1.5986056327819824, + "losses/total": 0.7432792782783508, + "ref_logps/chosen": -38.21084213256836, + "ref_logps/rejected": -43.418067932128906, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.24778038263320923, + "rewards/margins": 0.10224790126085281, + "rewards/rejected": -0.35002827644348145, + "step": 67 + }, + { + "epoch": 0.51, + "grad_norm": 7.6861933707333305, + "learning_rate": 4.606741573033708e-07, + "logps/chosen": -42.49998092651367, + "logps/rejected": -49.80910873413086, + "loss": 0.6241, + "losses/dpo": 0.5949782133102417, + "losses/sft": 1.2951277494430542, + "losses/total": 0.5949782133102417, + "ref_logps/chosen": -40.09019088745117, + "ref_logps/rejected": -45.69980239868164, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.24097900092601776, + "rewards/margins": 0.16995173692703247, + "rewards/rejected": -0.4109307527542114, + "step": 68 + }, + { + "epoch": 0.52, + "grad_norm": 7.5671933847566555, + "learning_rate": 4.592696629213483e-07, + "logps/chosen": -42.11725616455078, + "logps/rejected": -52.82745361328125, + "loss": 0.6438, + "losses/dpo": 0.6235805749893188, + "losses/sft": 1.4768122434616089, + "losses/total": 0.6235805749893188, + "ref_logps/chosen": -39.234642028808594, + "ref_logps/rejected": -48.657501220703125, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.2882614731788635, + "rewards/margins": 0.12873350083827972, + "rewards/rejected": -0.41699495911598206, + "step": 69 + }, + { + "epoch": 0.53, + "grad_norm": 7.380869348497186, + "learning_rate": 4.5786516853932584e-07, + "logps/chosen": -40.33063507080078, + "logps/rejected": -47.34225845336914, + "loss": 0.6519, + "losses/dpo": 0.6829323768615723, + "losses/sft": 1.6434234380722046, + "losses/total": 0.6829323768615723, + "ref_logps/chosen": -37.294776916503906, + "ref_logps/rejected": -43.10985565185547, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.30358612537384033, + "rewards/margins": 0.11965445429086685, + "rewards/rejected": -0.4232405722141266, + "step": 70 + }, + { + "epoch": 0.54, + "grad_norm": 7.505680485493994, + "learning_rate": 4.5646067415730334e-07, + "logps/chosen": -40.74094009399414, + "logps/rejected": -49.552616119384766, + "loss": 0.6423, + "losses/dpo": 0.6549758315086365, + "losses/sft": 1.5461335182189941, + "losses/total": 0.6549758315086365, + "ref_logps/chosen": -37.58620071411133, + "ref_logps/rejected": -45.091209411621094, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.3154744505882263, + "rewards/margins": 0.1306663304567337, + "rewards/rejected": -0.4461407959461212, + "step": 71 + }, + { + "epoch": 0.54, + "grad_norm": 7.805459051678804, + "learning_rate": 4.550561797752809e-07, + "logps/chosen": -42.94220733642578, + "logps/rejected": -54.635963439941406, + "loss": 0.6384, + "losses/dpo": 0.589752733707428, + "losses/sft": 1.55972421169281, + "losses/total": 0.589752733707428, + "ref_logps/chosen": -39.39100646972656, + "ref_logps/rejected": -49.64848709106445, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.35512006282806396, + "rewards/margins": 0.1436270773410797, + "rewards/rejected": -0.49874716997146606, + "step": 72 + }, + { + "epoch": 0.55, + "grad_norm": 8.048352395094511, + "learning_rate": 4.536516853932584e-07, + "logps/chosen": -41.640235900878906, + "logps/rejected": -53.20794677734375, + "loss": 0.621, + "losses/dpo": 0.6062641143798828, + "losses/sft": 1.0970079898834229, + "losses/total": 0.6062641143798828, + "ref_logps/chosen": -38.45201873779297, + "ref_logps/rejected": -48.13432312011719, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.3188212513923645, + "rewards/margins": 0.18854106962680817, + "rewards/rejected": -0.5073623061180115, + "step": 73 + }, + { + "epoch": 0.56, + "grad_norm": 12.072882817554818, + "learning_rate": 4.522471910112359e-07, + "logps/chosen": -43.404823303222656, + "logps/rejected": -50.17894744873047, + "loss": 0.6527, + "losses/dpo": 0.5960186719894409, + "losses/sft": 1.444412112236023, + "losses/total": 0.5960186719894409, + "ref_logps/chosen": -39.74496841430664, + "ref_logps/rejected": -45.32371139526367, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3659852147102356, + "rewards/margins": 0.11953801661729813, + "rewards/rejected": -0.4855232238769531, + "step": 74 + }, + { + "epoch": 0.57, + "grad_norm": 7.650328731492573, + "learning_rate": 4.5084269662921347e-07, + "logps/chosen": -43.5896110534668, + "logps/rejected": -51.086971282958984, + "loss": 0.6409, + "losses/dpo": 0.6057982444763184, + "losses/sft": 1.4658453464508057, + "losses/total": 0.6057982444763184, + "ref_logps/chosen": -39.98381423950195, + "ref_logps/rejected": -45.86697769165039, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.3605796992778778, + "rewards/margins": 0.16141945123672485, + "rewards/rejected": -0.521999180316925, + "step": 75 + }, + { + "epoch": 0.57, + "grad_norm": 7.945574535694735, + "learning_rate": 4.4943820224719097e-07, + "logps/chosen": -44.07162094116211, + "logps/rejected": -50.66339874267578, + "loss": 0.6556, + "losses/dpo": 0.6170323491096497, + "losses/sft": 1.8739807605743408, + "losses/total": 0.6170323491096497, + "ref_logps/chosen": -40.06593322753906, + "ref_logps/rejected": -45.41100311279297, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.4005686044692993, + "rewards/margins": 0.12467078864574432, + "rewards/rejected": -0.5252394080162048, + "step": 76 + }, + { + "epoch": 0.58, + "grad_norm": 9.606133876539282, + "learning_rate": 4.4803370786516853e-07, + "logps/chosen": -42.20341110229492, + "logps/rejected": -50.49468231201172, + "loss": 0.6503, + "losses/dpo": 0.586646556854248, + "losses/sft": 1.5989439487457275, + "losses/total": 0.586646556854248, + "ref_logps/chosen": -38.3419075012207, + "ref_logps/rejected": -45.38350296020508, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.3861507773399353, + "rewards/margins": 0.12496703863143921, + "rewards/rejected": -0.5111178159713745, + "step": 77 + }, + { + "epoch": 0.59, + "grad_norm": 8.32429224169226, + "learning_rate": 4.4662921348314603e-07, + "logps/chosen": -42.94104766845703, + "logps/rejected": -54.27581787109375, + "loss": 0.6335, + "losses/dpo": 0.629318356513977, + "losses/sft": 1.5925976037979126, + "losses/total": 0.629318356513977, + "ref_logps/chosen": -38.90857696533203, + "ref_logps/rejected": -48.47710037231445, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.40324676036834717, + "rewards/margins": 0.1766246110200882, + "rewards/rejected": -0.5798712968826294, + "step": 78 + }, + { + "epoch": 0.6, + "grad_norm": 7.898667752271484, + "learning_rate": 4.452247191011236e-07, + "logps/chosen": -42.00642395019531, + "logps/rejected": -51.58437728881836, + "loss": 0.6326, + "losses/dpo": 0.6813949346542358, + "losses/sft": 1.5958709716796875, + "losses/total": 0.6813949346542358, + "ref_logps/chosen": -38.2513427734375, + "ref_logps/rejected": -46.194374084472656, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.3755083978176117, + "rewards/margins": 0.16349197924137115, + "rewards/rejected": -0.5390004515647888, + "step": 79 + }, + { + "epoch": 0.6, + "grad_norm": 8.4984309671094, + "learning_rate": 4.438202247191011e-07, + "logps/chosen": -47.29401397705078, + "logps/rejected": -54.29883575439453, + "loss": 0.642, + "losses/dpo": 0.6930491924285889, + "losses/sft": 1.8281772136688232, + "losses/total": 0.6930491924285889, + "ref_logps/chosen": -42.54518127441406, + "ref_logps/rejected": -47.80082321166992, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.47488299012184143, + "rewards/margins": 0.17491832375526428, + "rewards/rejected": -0.6498013138771057, + "step": 80 + }, + { + "epoch": 0.61, + "grad_norm": 8.385277776330648, + "learning_rate": 4.4241573033707865e-07, + "logps/chosen": -48.636085510253906, + "logps/rejected": -54.089210510253906, + "loss": 0.6105, + "losses/dpo": 0.5880630612373352, + "losses/sft": 1.647892951965332, + "losses/total": 0.5880630612373352, + "ref_logps/chosen": -43.907169342041016, + "ref_logps/rejected": -47.025569915771484, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.472891628742218, + "rewards/margins": 0.2334723323583603, + "rewards/rejected": -0.7063639760017395, + "step": 81 + }, + { + "epoch": 0.62, + "grad_norm": 10.98551780733234, + "learning_rate": 4.410112359550562e-07, + "logps/chosen": -44.03840637207031, + "logps/rejected": -52.65192413330078, + "loss": 0.622, + "losses/dpo": 0.5322688817977905, + "losses/sft": 1.9210578203201294, + "losses/total": 0.5322688817977905, + "ref_logps/chosen": -39.61579132080078, + "ref_logps/rejected": -46.18141174316406, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.4422611594200134, + "rewards/margins": 0.2047904133796692, + "rewards/rejected": -0.6470515727996826, + "step": 82 + }, + { + "epoch": 0.63, + "grad_norm": 7.7041080998180425, + "learning_rate": 4.3960674157303366e-07, + "logps/chosen": -44.86250305175781, + "logps/rejected": -52.133052825927734, + "loss": 0.6265, + "losses/dpo": 0.5972993969917297, + "losses/sft": 1.4703764915466309, + "losses/total": 0.5972993969917297, + "ref_logps/chosen": -40.2207145690918, + "ref_logps/rejected": -45.56148910522461, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.46417874097824097, + "rewards/margins": 0.192977637052536, + "rewards/rejected": -0.6571563482284546, + "step": 83 + }, + { + "epoch": 0.63, + "grad_norm": 7.893501393776045, + "learning_rate": 4.382022471910112e-07, + "logps/chosen": -45.97453689575195, + "logps/rejected": -55.243316650390625, + "loss": 0.6289, + "losses/dpo": 0.5659444332122803, + "losses/sft": 1.6541783809661865, + "losses/total": 0.5659444332122803, + "ref_logps/chosen": -40.886837005615234, + "ref_logps/rejected": -48.23194885253906, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.5087698698043823, + "rewards/margins": 0.19236721098423004, + "rewards/rejected": -0.7011370062828064, + "step": 84 + }, + { + "epoch": 0.64, + "grad_norm": 9.509139423826628, + "learning_rate": 4.367977528089887e-07, + "logps/chosen": -44.12831115722656, + "logps/rejected": -54.7608642578125, + "loss": 0.6193, + "losses/dpo": 0.5432471036911011, + "losses/sft": 1.8381226062774658, + "losses/total": 0.5432471036911011, + "ref_logps/chosen": -39.457069396972656, + "ref_logps/rejected": -47.7266731262207, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.46712398529052734, + "rewards/margins": 0.23629523813724518, + "rewards/rejected": -0.7034192681312561, + "step": 85 + }, + { + "epoch": 0.65, + "grad_norm": 7.887107642093141, + "learning_rate": 4.353932584269663e-07, + "logps/chosen": -45.145904541015625, + "logps/rejected": -55.40123748779297, + "loss": 0.6092, + "losses/dpo": 0.5780594944953918, + "losses/sft": 1.9163440465927124, + "losses/total": 0.5780594944953918, + "ref_logps/chosen": -40.329139709472656, + "ref_logps/rejected": -48.183074951171875, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.4816761910915375, + "rewards/margins": 0.24014019966125488, + "rewards/rejected": -0.7218164205551147, + "step": 86 + }, + { + "epoch": 0.66, + "grad_norm": 8.39204893477319, + "learning_rate": 4.339887640449438e-07, + "logps/chosen": -46.145851135253906, + "logps/rejected": -53.961036682128906, + "loss": 0.6403, + "losses/dpo": 0.60181725025177, + "losses/sft": 1.625700831413269, + "losses/total": 0.60181725025177, + "ref_logps/chosen": -40.71726608276367, + "ref_logps/rejected": -46.57926940917969, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.5428579449653625, + "rewards/margins": 0.19531863927841187, + "rewards/rejected": -0.7381765842437744, + "step": 87 + }, + { + "epoch": 0.66, + "grad_norm": 8.63628479914341, + "learning_rate": 4.3258426966292134e-07, + "logps/chosen": -48.23160171508789, + "logps/rejected": -53.496604919433594, + "loss": 0.6382, + "losses/dpo": 0.5790094137191772, + "losses/sft": 1.4581667184829712, + "losses/total": 0.5790094137191772, + "ref_logps/chosen": -42.78254699707031, + "ref_logps/rejected": -46.28538131713867, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.5449056029319763, + "rewards/margins": 0.1762169450521469, + "rewards/rejected": -0.7211225628852844, + "step": 88 + }, + { + "epoch": 0.67, + "grad_norm": 7.682264878764254, + "learning_rate": 4.311797752808989e-07, + "logps/chosen": -44.9177360534668, + "logps/rejected": -57.826866149902344, + "loss": 0.5789, + "losses/dpo": 0.5269919037818909, + "losses/sft": 1.9525985717773438, + "losses/total": 0.5269919037818909, + "ref_logps/chosen": -39.326385498046875, + "ref_logps/rejected": -48.92866134643555, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5591354370117188, + "rewards/margins": 0.33068495988845825, + "rewards/rejected": -0.889820396900177, + "step": 89 + }, + { + "epoch": 0.68, + "grad_norm": 9.993714538070773, + "learning_rate": 4.297752808988764e-07, + "logps/chosen": -47.74348068237305, + "logps/rejected": -51.28035354614258, + "loss": 0.6687, + "losses/dpo": 0.6379462480545044, + "losses/sft": 1.61128830909729, + "losses/total": 0.6379462480545044, + "ref_logps/chosen": -41.72521209716797, + "ref_logps/rejected": -44.09041976928711, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.6018266677856445, + "rewards/margins": 0.11716663837432861, + "rewards/rejected": -0.7189933061599731, + "step": 90 + }, + { + "epoch": 0.69, + "grad_norm": 9.79439579593073, + "learning_rate": 4.2837078651685396e-07, + "logps/chosen": -46.20167541503906, + "logps/rejected": -54.01323318481445, + "loss": 0.6199, + "losses/dpo": 0.4756242632865906, + "losses/sft": 1.6830320358276367, + "losses/total": 0.4756242632865906, + "ref_logps/chosen": -40.16909408569336, + "ref_logps/rejected": -45.69634246826172, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.6032581329345703, + "rewards/margins": 0.22843076288700104, + "rewards/rejected": -0.8316888809204102, + "step": 91 + }, + { + "epoch": 0.69, + "grad_norm": 8.115443534609298, + "learning_rate": 4.269662921348314e-07, + "logps/chosen": -49.6710319519043, + "logps/rejected": -60.84608840942383, + "loss": 0.5948, + "losses/dpo": 0.6356014013290405, + "losses/sft": 1.7809040546417236, + "losses/total": 0.6356014013290405, + "ref_logps/chosen": -43.4056282043457, + "ref_logps/rejected": -51.69749069213867, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.6265405416488647, + "rewards/margins": 0.2883196473121643, + "rewards/rejected": -0.914860188961029, + "step": 92 + }, + { + "epoch": 0.7, + "grad_norm": 8.161770613063224, + "learning_rate": 4.2556179775280896e-07, + "logps/chosen": -44.44715881347656, + "logps/rejected": -52.76496124267578, + "loss": 0.6499, + "losses/dpo": 0.5320106744766235, + "losses/sft": 1.5193849802017212, + "losses/total": 0.5320106744766235, + "ref_logps/chosen": -38.260562896728516, + "ref_logps/rejected": -44.902198791503906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.618659496307373, + "rewards/margins": 0.1676165610551834, + "rewards/rejected": -0.78627610206604, + "step": 93 + }, + { + "epoch": 0.71, + "grad_norm": 11.63146009410558, + "learning_rate": 4.2415730337078647e-07, + "logps/chosen": -48.569419860839844, + "logps/rejected": -55.14238357543945, + "loss": 0.6309, + "losses/dpo": 0.5968553423881531, + "losses/sft": 1.5720221996307373, + "losses/total": 0.5968553423881531, + "ref_logps/chosen": -42.18726348876953, + "ref_logps/rejected": -46.23759460449219, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6382158994674683, + "rewards/margins": 0.2522626221179962, + "rewards/rejected": -0.8904784917831421, + "step": 94 + }, + { + "epoch": 0.72, + "grad_norm": 7.826574210546138, + "learning_rate": 4.22752808988764e-07, + "logps/chosen": -48.970035552978516, + "logps/rejected": -56.224632263183594, + "loss": 0.6209, + "losses/dpo": 0.566143810749054, + "losses/sft": 1.7626792192459106, + "losses/total": 0.566143810749054, + "ref_logps/chosen": -42.288108825683594, + "ref_logps/rejected": -47.209964752197266, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.6681923270225525, + "rewards/margins": 0.23327398300170898, + "rewards/rejected": -0.9014662504196167, + "step": 95 + }, + { + "epoch": 0.72, + "grad_norm": 7.984390488888068, + "learning_rate": 4.2134831460674153e-07, + "logps/chosen": -44.690216064453125, + "logps/rejected": -57.38431930541992, + "loss": 0.5916, + "losses/dpo": 0.5278609395027161, + "losses/sft": 1.7001551389694214, + "losses/total": 0.5278609395027161, + "ref_logps/chosen": -39.018001556396484, + "ref_logps/rejected": -48.73064422607422, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.567221999168396, + "rewards/margins": 0.2981455326080322, + "rewards/rejected": -0.8653674125671387, + "step": 96 + }, + { + "epoch": 0.73, + "grad_norm": 8.140376307221391, + "learning_rate": 4.199438202247191e-07, + "logps/chosen": -45.94129180908203, + "logps/rejected": -57.70640563964844, + "loss": 0.5843, + "losses/dpo": 0.6323425769805908, + "losses/sft": 1.9729546308517456, + "losses/total": 0.6323425769805908, + "ref_logps/chosen": -39.79209899902344, + "ref_logps/rejected": -47.78965377807617, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.6149196624755859, + "rewards/margins": 0.37675485014915466, + "rewards/rejected": -0.991674542427063, + "step": 97 + }, + { + "epoch": 0.74, + "grad_norm": 8.940775697669213, + "learning_rate": 4.1853932584269664e-07, + "logps/chosen": -49.44526672363281, + "logps/rejected": -54.83664321899414, + "loss": 0.6672, + "losses/dpo": 0.6795445084571838, + "losses/sft": 1.7624062299728394, + "losses/total": 0.6795445084571838, + "ref_logps/chosen": -42.365447998046875, + "ref_logps/rejected": -46.211761474609375, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.7079817652702332, + "rewards/margins": 0.15450690686702728, + "rewards/rejected": -0.8624885678291321, + "step": 98 + }, + { + "epoch": 0.75, + "grad_norm": 7.425234946687916, + "learning_rate": 4.1713483146067415e-07, + "logps/chosen": -43.5301513671875, + "logps/rejected": -52.020164489746094, + "loss": 0.5957, + "losses/dpo": 0.6246699690818787, + "losses/sft": 1.7049494981765747, + "losses/total": 0.6246699690818787, + "ref_logps/chosen": -37.423370361328125, + "ref_logps/rejected": -42.76348876953125, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.6106777191162109, + "rewards/margins": 0.31499022245407104, + "rewards/rejected": -0.9256680607795715, + "step": 99 + }, + { + "epoch": 0.75, + "grad_norm": 9.179756029554161, + "learning_rate": 4.157303370786517e-07, + "logps/chosen": -49.9049072265625, + "logps/rejected": -51.58677673339844, + "loss": 0.6817, + "losses/dpo": 0.4553123712539673, + "losses/sft": 1.8781102895736694, + "losses/total": 0.4553123712539673, + "ref_logps/chosen": -42.09438705444336, + "ref_logps/rejected": -42.1667594909668, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.7810521721839905, + "rewards/margins": 0.16094914078712463, + "rewards/rejected": -0.9420013427734375, + "step": 100 + }, + { + "epoch": 0.76, + "grad_norm": 8.48875168411848, + "learning_rate": 4.1432584269662915e-07, + "logps/chosen": -43.814697265625, + "logps/rejected": -56.10200881958008, + "loss": 0.5895, + "losses/dpo": 0.6046161651611328, + "losses/sft": 1.7643048763275146, + "losses/total": 0.6046161651611328, + "ref_logps/chosen": -37.67271423339844, + "ref_logps/rejected": -46.54473114013672, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6141979694366455, + "rewards/margins": 0.3415302038192749, + "rewards/rejected": -0.9557281732559204, + "step": 101 + }, + { + "epoch": 0.77, + "grad_norm": 8.913022100408304, + "learning_rate": 4.129213483146067e-07, + "logps/chosen": -43.33885955810547, + "logps/rejected": -49.23841094970703, + "loss": 0.6399, + "losses/dpo": 0.5780912637710571, + "losses/sft": 2.1572117805480957, + "losses/total": 0.5780912637710571, + "ref_logps/chosen": -37.323486328125, + "ref_logps/rejected": -41.119110107421875, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.6015373468399048, + "rewards/margins": 0.21039217710494995, + "rewards/rejected": -0.8119295239448547, + "step": 102 + }, + { + "epoch": 0.78, + "grad_norm": 8.206141020123177, + "learning_rate": 4.115168539325842e-07, + "logps/chosen": -48.0225830078125, + "logps/rejected": -54.95545196533203, + "loss": 0.6109, + "losses/dpo": 0.5648887753486633, + "losses/sft": 1.740958571434021, + "losses/total": 0.5648887753486633, + "ref_logps/chosen": -41.10425567626953, + "ref_logps/rejected": -45.241817474365234, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.6918322443962097, + "rewards/margins": 0.27953118085861206, + "rewards/rejected": -0.9713634848594666, + "step": 103 + }, + { + "epoch": 0.78, + "grad_norm": 7.999449117068295, + "learning_rate": 4.1011235955056177e-07, + "logps/chosen": -48.33061981201172, + "logps/rejected": -56.14019775390625, + "loss": 0.6036, + "losses/dpo": 0.5798739194869995, + "losses/sft": 1.6369647979736328, + "losses/total": 0.5798739194869995, + "ref_logps/chosen": -40.94407653808594, + "ref_logps/rejected": -45.9286003112793, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.7386540174484253, + "rewards/margins": 0.28250569105148315, + "rewards/rejected": -1.0211596488952637, + "step": 104 + }, + { + "epoch": 0.79, + "grad_norm": 7.961208032843563, + "learning_rate": 4.0870786516853933e-07, + "logps/chosen": -43.01828384399414, + "logps/rejected": -55.2956657409668, + "loss": 0.5814, + "losses/dpo": 0.5777114629745483, + "losses/sft": 2.0679547786712646, + "losses/total": 0.5777114629745483, + "ref_logps/chosen": -36.25102233886719, + "ref_logps/rejected": -45.063175201416016, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6767261624336243, + "rewards/margins": 0.3465230464935303, + "rewards/rejected": -1.0232491493225098, + "step": 105 + }, + { + "epoch": 0.8, + "grad_norm": 8.162855072558866, + "learning_rate": 4.0730337078651683e-07, + "logps/chosen": -38.68614959716797, + "logps/rejected": -51.58649444580078, + "loss": 0.6392, + "losses/dpo": 0.6045973896980286, + "losses/sft": 1.7759897708892822, + "losses/total": 0.6045973896980286, + "ref_logps/chosen": -32.37638854980469, + "ref_logps/rejected": -42.94057083129883, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.6309766173362732, + "rewards/margins": 0.2336157262325287, + "rewards/rejected": -0.8645923733711243, + "step": 106 + }, + { + "epoch": 0.81, + "grad_norm": 10.312576606575677, + "learning_rate": 4.058988764044944e-07, + "logps/chosen": -50.928245544433594, + "logps/rejected": -59.36943054199219, + "loss": 0.654, + "losses/dpo": 0.735203206539154, + "losses/sft": 2.190847158432007, + "losses/total": 0.735203206539154, + "ref_logps/chosen": -42.11771774291992, + "ref_logps/rejected": -48.740692138671875, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.8810529112815857, + "rewards/margins": 0.18182089924812317, + "rewards/rejected": -1.0628738403320312, + "step": 107 + }, + { + "epoch": 0.82, + "grad_norm": 8.648201545330428, + "learning_rate": 4.044943820224719e-07, + "logps/chosen": -47.14788818359375, + "logps/rejected": -54.653385162353516, + "loss": 0.63, + "losses/dpo": 0.6501352787017822, + "losses/sft": 2.0834176540374756, + "losses/total": 0.6501352787017822, + "ref_logps/chosen": -39.38991928100586, + "ref_logps/rejected": -44.46839904785156, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.7757968902587891, + "rewards/margins": 0.24270157516002655, + "rewards/rejected": -1.0184985399246216, + "step": 108 + }, + { + "epoch": 0.82, + "grad_norm": 9.8578017889453, + "learning_rate": 4.0308988764044945e-07, + "logps/chosen": -48.94048309326172, + "logps/rejected": -56.453369140625, + "loss": 0.6293, + "losses/dpo": 0.7049952149391174, + "losses/sft": 2.280228614807129, + "losses/total": 0.7049952149391174, + "ref_logps/chosen": -41.544471740722656, + "ref_logps/rejected": -46.083526611328125, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.7396014332771301, + "rewards/margins": 0.29738324880599976, + "rewards/rejected": -1.0369845628738403, + "step": 109 + }, + { + "epoch": 0.83, + "grad_norm": 8.394061989951249, + "learning_rate": 4.0168539325842696e-07, + "logps/chosen": -45.53207015991211, + "logps/rejected": -54.01716613769531, + "loss": 0.6652, + "losses/dpo": 0.5625556111335754, + "losses/sft": 2.00128436088562, + "losses/total": 0.5625556111335754, + "ref_logps/chosen": -37.465904235839844, + "ref_logps/rejected": -44.16294860839844, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.8066164255142212, + "rewards/margins": 0.17880576848983765, + "rewards/rejected": -0.9854221940040588, + "step": 110 + }, + { + "epoch": 0.84, + "grad_norm": 8.76540952525436, + "learning_rate": 4.0028089887640446e-07, + "logps/chosen": -46.494407653808594, + "logps/rejected": -56.70625686645508, + "loss": 0.6355, + "losses/dpo": 0.816941499710083, + "losses/sft": 2.149186372756958, + "losses/total": 0.816941499710083, + "ref_logps/chosen": -38.6019401550293, + "ref_logps/rejected": -46.42439270019531, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.7892469167709351, + "rewards/margins": 0.23893946409225464, + "rewards/rejected": -1.028186321258545, + "step": 111 + }, + { + "epoch": 0.85, + "grad_norm": 8.706675493262694, + "learning_rate": 3.9887640449438196e-07, + "logps/chosen": -48.493709564208984, + "logps/rejected": -58.21279525756836, + "loss": 0.6039, + "losses/dpo": 0.49378710985183716, + "losses/sft": 1.3456647396087646, + "losses/total": 0.49378710985183716, + "ref_logps/chosen": -41.08473205566406, + "ref_logps/rejected": -47.72603225708008, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.7408972978591919, + "rewards/margins": 0.30777889490127563, + "rewards/rejected": -1.0486761331558228, + "step": 112 + }, + { + "epoch": 0.85, + "grad_norm": 8.390537463814029, + "learning_rate": 3.974719101123595e-07, + "logps/chosen": -45.94505310058594, + "logps/rejected": -59.44303894042969, + "loss": 0.5863, + "losses/dpo": 0.7109102606773376, + "losses/sft": 1.734868049621582, + "losses/total": 0.7109102606773376, + "ref_logps/chosen": -39.46669387817383, + "ref_logps/rejected": -49.24740219116211, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6478357911109924, + "rewards/margins": 0.37172842025756836, + "rewards/rejected": -1.0195642709732056, + "step": 113 + }, + { + "epoch": 0.86, + "grad_norm": 7.999605487961454, + "learning_rate": 3.960674157303371e-07, + "logps/chosen": -43.57410430908203, + "logps/rejected": -53.34558868408203, + "loss": 0.6172, + "losses/dpo": 0.5053269863128662, + "losses/sft": 1.450685739517212, + "losses/total": 0.5053269863128662, + "ref_logps/chosen": -36.833072662353516, + "ref_logps/rejected": -43.94281768798828, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.6741028428077698, + "rewards/margins": 0.26617470383644104, + "rewards/rejected": -0.9402774572372437, + "step": 114 + }, + { + "epoch": 0.87, + "grad_norm": 9.415075126299298, + "learning_rate": 3.946629213483146e-07, + "logps/chosen": -46.565555572509766, + "logps/rejected": -57.010372161865234, + "loss": 0.5955, + "losses/dpo": 0.5628423690795898, + "losses/sft": 1.7239865064620972, + "losses/total": 0.5628423690795898, + "ref_logps/chosen": -39.13492202758789, + "ref_logps/rejected": -46.0317497253418, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.743063747882843, + "rewards/margins": 0.3547991216182709, + "rewards/rejected": -1.097862958908081, + "step": 115 + }, + { + "epoch": 0.88, + "grad_norm": 9.600908058178481, + "learning_rate": 3.9325842696629214e-07, + "logps/chosen": -50.78204345703125, + "logps/rejected": -56.04452896118164, + "loss": 0.6852, + "losses/dpo": 0.635813295841217, + "losses/sft": 1.73310124874115, + "losses/total": 0.635813295841217, + "ref_logps/chosen": -42.01323318481445, + "ref_logps/rejected": -45.830718994140625, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.8768815994262695, + "rewards/margins": 0.14449933171272278, + "rewards/rejected": -1.02138090133667, + "step": 116 + }, + { + "epoch": 0.88, + "grad_norm": 8.373342286084757, + "learning_rate": 3.9185393258426964e-07, + "logps/chosen": -47.82256317138672, + "logps/rejected": -54.956302642822266, + "loss": 0.6136, + "losses/dpo": 0.6849408149719238, + "losses/sft": 2.210094451904297, + "losses/total": 0.6849408149719238, + "ref_logps/chosen": -40.209930419921875, + "ref_logps/rejected": -44.30906295776367, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.7612636089324951, + "rewards/margins": 0.3034607172012329, + "rewards/rejected": -1.0647242069244385, + "step": 117 + }, + { + "epoch": 0.89, + "grad_norm": 9.637403138990082, + "learning_rate": 3.904494382022472e-07, + "logps/chosen": -48.02635955810547, + "logps/rejected": -57.71215057373047, + "loss": 0.5835, + "losses/dpo": 0.6933724880218506, + "losses/sft": 1.4713902473449707, + "losses/total": 0.6933724880218506, + "ref_logps/chosen": -40.86788558959961, + "ref_logps/rejected": -47.20375442504883, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.7158471345901489, + "rewards/margins": 0.33499258756637573, + "rewards/rejected": -1.0508397817611694, + "step": 118 + }, + { + "epoch": 0.9, + "grad_norm": 8.531064508491891, + "learning_rate": 3.890449438202247e-07, + "logps/chosen": -46.2841796875, + "logps/rejected": -54.74563980102539, + "loss": 0.6334, + "losses/dpo": 0.702929675579071, + "losses/sft": 1.360397219657898, + "losses/total": 0.702929675579071, + "ref_logps/chosen": -39.4974365234375, + "ref_logps/rejected": -45.235111236572266, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.6786739826202393, + "rewards/margins": 0.2723783850669861, + "rewards/rejected": -0.9510524272918701, + "step": 119 + }, + { + "epoch": 0.91, + "grad_norm": 41.566258002848386, + "learning_rate": 3.876404494382022e-07, + "logps/chosen": -44.022640228271484, + "logps/rejected": -55.926021575927734, + "loss": 0.596, + "losses/dpo": 0.5612522959709167, + "losses/sft": 1.8725919723510742, + "losses/total": 0.5612522959709167, + "ref_logps/chosen": -37.003326416015625, + "ref_logps/rejected": -45.59555435180664, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.701931357383728, + "rewards/margins": 0.3311149477958679, + "rewards/rejected": -1.0330464839935303, + "step": 120 + }, + { + "epoch": 0.91, + "grad_norm": 8.107636440875833, + "learning_rate": 3.8623595505617977e-07, + "logps/chosen": -46.37071990966797, + "logps/rejected": -57.88187026977539, + "loss": 0.5981, + "losses/dpo": 0.5683261156082153, + "losses/sft": 1.2367005348205566, + "losses/total": 0.5683261156082153, + "ref_logps/chosen": -40.31574630737305, + "ref_logps/rejected": -48.33371353149414, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.6054975986480713, + "rewards/margins": 0.3493175208568573, + "rewards/rejected": -0.9548150300979614, + "step": 121 + }, + { + "epoch": 0.92, + "grad_norm": 7.976666948784577, + "learning_rate": 3.8483146067415727e-07, + "logps/chosen": -49.71910095214844, + "logps/rejected": -61.38713073730469, + "loss": 0.5628, + "losses/dpo": 0.6633545756340027, + "losses/sft": 2.0874011516571045, + "losses/total": 0.6633545756340027, + "ref_logps/chosen": -42.128177642822266, + "ref_logps/rejected": -49.5493278503418, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.7590923309326172, + "rewards/margins": 0.42468804121017456, + "rewards/rejected": -1.1837804317474365, + "step": 122 + }, + { + "epoch": 0.93, + "grad_norm": 8.273120167281697, + "learning_rate": 3.834269662921348e-07, + "logps/chosen": -44.52275848388672, + "logps/rejected": -56.69093322753906, + "loss": 0.5973, + "losses/dpo": 0.7617586851119995, + "losses/sft": 1.4152177572250366, + "losses/total": 0.7617586851119995, + "ref_logps/chosen": -38.226234436035156, + "ref_logps/rejected": -47.10552215576172, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.6296522617340088, + "rewards/margins": 0.32888925075531006, + "rewards/rejected": -0.9585415124893188, + "step": 123 + }, + { + "epoch": 0.94, + "grad_norm": 8.426019615665346, + "learning_rate": 3.8202247191011233e-07, + "logps/chosen": -45.17867660522461, + "logps/rejected": -52.477989196777344, + "loss": 0.5956, + "losses/dpo": 0.8184994459152222, + "losses/sft": 2.1767871379852295, + "losses/total": 0.8184994459152222, + "ref_logps/chosen": -39.22052001953125, + "ref_logps/rejected": -42.9237060546875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5958150625228882, + "rewards/margins": 0.35961273312568665, + "rewards/rejected": -0.9554278254508972, + "step": 124 + }, + { + "epoch": 0.94, + "grad_norm": 9.445037206868065, + "learning_rate": 3.806179775280899e-07, + "logps/chosen": -49.57510757446289, + "logps/rejected": -57.41535568237305, + "loss": 0.626, + "losses/dpo": 0.5964499711990356, + "losses/sft": 1.574311375617981, + "losses/total": 0.5964499711990356, + "ref_logps/chosen": -42.10932159423828, + "ref_logps/rejected": -47.498348236083984, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.746578574180603, + "rewards/margins": 0.24512259662151337, + "rewards/rejected": -0.9917011260986328, + "step": 125 + }, + { + "epoch": 0.95, + "grad_norm": 8.142660343260747, + "learning_rate": 3.792134831460674e-07, + "logps/chosen": -49.39699935913086, + "logps/rejected": -60.31464385986328, + "loss": 0.5557, + "losses/dpo": 0.4981670677661896, + "losses/sft": 1.7636176347732544, + "losses/total": 0.4981670677661896, + "ref_logps/chosen": -42.645172119140625, + "ref_logps/rejected": -49.29505157470703, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.675183117389679, + "rewards/margins": 0.4267764091491699, + "rewards/rejected": -1.1019595861434937, + "step": 126 + }, + { + "epoch": 0.96, + "grad_norm": 10.371790207828976, + "learning_rate": 3.7780898876404495e-07, + "logps/chosen": -49.649539947509766, + "logps/rejected": -55.797752380371094, + "loss": 0.6236, + "losses/dpo": 0.6386290788650513, + "losses/sft": 1.9548349380493164, + "losses/total": 0.6386290788650513, + "ref_logps/chosen": -43.093387603759766, + "ref_logps/rejected": -46.55876541137695, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.6556151509284973, + "rewards/margins": 0.2682836949825287, + "rewards/rejected": -0.9238989353179932, + "step": 127 + }, + { + "epoch": 0.97, + "grad_norm": 8.551344033897433, + "learning_rate": 3.7640449438202245e-07, + "logps/chosen": -47.41435623168945, + "logps/rejected": -53.9222526550293, + "loss": 0.6097, + "losses/dpo": 0.5466079711914062, + "losses/sft": 1.6678849458694458, + "losses/total": 0.5466079711914062, + "ref_logps/chosen": -40.969482421875, + "ref_logps/rejected": -44.308204650878906, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6444875001907349, + "rewards/margins": 0.3169165849685669, + "rewards/rejected": -0.9614041447639465, + "step": 128 + }, + { + "epoch": 0.97, + "grad_norm": 8.48986980564796, + "learning_rate": 3.75e-07, + "logps/chosen": -46.43178176879883, + "logps/rejected": -54.83501434326172, + "loss": 0.6116, + "losses/dpo": 0.7334883213043213, + "losses/sft": 2.06964111328125, + "losses/total": 0.7334883213043213, + "ref_logps/chosen": -39.70704650878906, + "ref_logps/rejected": -45.3802490234375, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.6724739074707031, + "rewards/margins": 0.2730027139186859, + "rewards/rejected": -0.9454765915870667, + "step": 129 + }, + { + "epoch": 0.98, + "grad_norm": 7.931029049263691, + "learning_rate": 3.735955056179775e-07, + "logps/chosen": -41.7554817199707, + "logps/rejected": -51.33645248413086, + "loss": 0.6118, + "losses/dpo": 0.6140174865722656, + "losses/sft": 1.9485254287719727, + "losses/total": 0.6140174865722656, + "ref_logps/chosen": -36.31658172607422, + "ref_logps/rejected": -43.00047302246094, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.5438905954360962, + "rewards/margins": 0.28970640897750854, + "rewards/rejected": -0.8335970640182495, + "step": 130 + }, + { + "epoch": 0.99, + "grad_norm": 7.961225244135859, + "learning_rate": 3.72191011235955e-07, + "logps/chosen": -45.26293182373047, + "logps/rejected": -53.40849304199219, + "loss": 0.5975, + "losses/dpo": 0.6238459348678589, + "losses/sft": 1.9288058280944824, + "losses/total": 0.6238459348678589, + "ref_logps/chosen": -39.287940979003906, + "ref_logps/rejected": -43.95112991333008, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.597498893737793, + "rewards/margins": 0.34823763370513916, + "rewards/rejected": -0.9457363486289978, + "step": 131 + }, + { + "epoch": 1.0, + "grad_norm": 8.463248361122965, + "learning_rate": 3.707865168539326e-07, + "logps/chosen": -46.658084869384766, + "logps/rejected": -55.08595275878906, + "loss": 0.6124, + "losses/dpo": 0.5780600309371948, + "losses/sft": 2.4361934661865234, + "losses/total": 0.5780600309371948, + "ref_logps/chosen": -40.17093276977539, + "ref_logps/rejected": -45.625953674316406, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.6487153172492981, + "rewards/margins": 0.29728472232818604, + "rewards/rejected": -0.9460000395774841, + "step": 132 + }, + { + "epoch": 1.0, + "grad_norm": 7.966461556458486, + "learning_rate": 3.693820224719101e-07, + "logps/chosen": -43.571929931640625, + "logps/rejected": -56.445987701416016, + "loss": 0.5996, + "losses/dpo": 0.6271636486053467, + "losses/sft": 1.7036837339401245, + "losses/total": 0.6271636486053467, + "ref_logps/chosen": -37.748313903808594, + "ref_logps/rejected": -46.95930862426758, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5823614001274109, + "rewards/margins": 0.366305947303772, + "rewards/rejected": -0.9486674070358276, + "step": 133 + }, + { + "epoch": 1.01, + "grad_norm": 8.685706334967172, + "learning_rate": 3.6797752808988764e-07, + "logps/chosen": -45.6722412109375, + "logps/rejected": -58.07893371582031, + "loss": 0.5569, + "losses/dpo": 0.44357961416244507, + "losses/sft": 1.9906073808670044, + "losses/total": 0.44357961416244507, + "ref_logps/chosen": -39.666465759277344, + "ref_logps/rejected": -47.67109298706055, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.6005774736404419, + "rewards/margins": 0.44020622968673706, + "rewards/rejected": -1.0407837629318237, + "step": 134 + }, + { + "epoch": 1.02, + "grad_norm": 7.5465440969699475, + "learning_rate": 3.6657303370786514e-07, + "logps/chosen": -44.68495178222656, + "logps/rejected": -59.658729553222656, + "loss": 0.5418, + "losses/dpo": 0.6047704815864563, + "losses/sft": 1.8062564134597778, + "losses/total": 0.6047704815864563, + "ref_logps/chosen": -38.53544616699219, + "ref_logps/rejected": -48.757301330566406, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.6149506568908691, + "rewards/margins": 0.47519204020500183, + "rewards/rejected": -1.0901426076889038, + "step": 135 + }, + { + "epoch": 1.03, + "grad_norm": 8.040538818716184, + "learning_rate": 3.651685393258427e-07, + "logps/chosen": -49.753868103027344, + "logps/rejected": -55.77690124511719, + "loss": 0.5676, + "losses/dpo": 0.6332641243934631, + "losses/sft": 1.9821722507476807, + "losses/total": 0.6332641243934631, + "ref_logps/chosen": -43.2289924621582, + "ref_logps/rejected": -44.969783782958984, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.6524879932403564, + "rewards/margins": 0.4282234311103821, + "rewards/rejected": -1.0807113647460938, + "step": 136 + }, + { + "epoch": 1.03, + "grad_norm": 8.176454529563474, + "learning_rate": 3.637640449438202e-07, + "logps/chosen": -44.740989685058594, + "logps/rejected": -56.03924560546875, + "loss": 0.5426, + "losses/dpo": 0.5114879608154297, + "losses/sft": 1.7858185768127441, + "losses/total": 0.5114879608154297, + "ref_logps/chosen": -39.14986038208008, + "ref_logps/rejected": -45.894187927246094, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5591133236885071, + "rewards/margins": 0.45539283752441406, + "rewards/rejected": -1.014506220817566, + "step": 137 + }, + { + "epoch": 1.04, + "grad_norm": 7.378583429433305, + "learning_rate": 3.6235955056179776e-07, + "logps/chosen": -43.69382095336914, + "logps/rejected": -58.01210021972656, + "loss": 0.5576, + "losses/dpo": 0.5225633978843689, + "losses/sft": 1.5681252479553223, + "losses/total": 0.5225633978843689, + "ref_logps/chosen": -37.7261848449707, + "ref_logps/rejected": -47.4239616394043, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5967639684677124, + "rewards/margins": 0.46204984188079834, + "rewards/rejected": -1.0588138103485107, + "step": 138 + }, + { + "epoch": 1.05, + "grad_norm": 6.804949706358365, + "learning_rate": 3.6095505617977526e-07, + "logps/chosen": -41.86065673828125, + "logps/rejected": -56.17615509033203, + "loss": 0.541, + "losses/dpo": 0.5155816674232483, + "losses/sft": 1.66976797580719, + "losses/total": 0.5155816674232483, + "ref_logps/chosen": -36.93909454345703, + "ref_logps/rejected": -46.33450698852539, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.49215659499168396, + "rewards/margins": 0.492008239030838, + "rewards/rejected": -0.9841648936271667, + "step": 139 + }, + { + "epoch": 1.06, + "grad_norm": 9.08939326139503, + "learning_rate": 3.5955056179775277e-07, + "logps/chosen": -50.215492248535156, + "logps/rejected": -61.04364013671875, + "loss": 0.4877, + "losses/dpo": 0.4505589008331299, + "losses/sft": 2.045948028564453, + "losses/total": 0.4505589008331299, + "ref_logps/chosen": -44.16120147705078, + "ref_logps/rejected": -48.857383728027344, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.6054291725158691, + "rewards/margins": 0.6131964921951294, + "rewards/rejected": -1.218625545501709, + "step": 140 + }, + { + "epoch": 1.06, + "grad_norm": 7.204665546592892, + "learning_rate": 3.581460674157303e-07, + "logps/chosen": -44.662261962890625, + "logps/rejected": -56.869659423828125, + "loss": 0.5186, + "losses/dpo": 0.4171184301376343, + "losses/sft": 1.8761239051818848, + "losses/total": 0.4171184301376343, + "ref_logps/chosen": -38.65668487548828, + "ref_logps/rejected": -45.534423828125, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6005581617355347, + "rewards/margins": 0.5329651832580566, + "rewards/rejected": -1.1335232257843018, + "step": 141 + }, + { + "epoch": 1.07, + "grad_norm": 7.887278895914157, + "learning_rate": 3.5674157303370783e-07, + "logps/chosen": -42.77589416503906, + "logps/rejected": -51.52098083496094, + "loss": 0.5407, + "losses/dpo": 0.35035043954849243, + "losses/sft": 1.5323569774627686, + "losses/total": 0.35035043954849243, + "ref_logps/chosen": -37.22514343261719, + "ref_logps/rejected": -41.38079833984375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5550752282142639, + "rewards/margins": 0.45894336700439453, + "rewards/rejected": -1.0140186548233032, + "step": 142 + }, + { + "epoch": 1.08, + "grad_norm": 7.8842349887311505, + "learning_rate": 3.553370786516854e-07, + "logps/chosen": -48.38291549682617, + "logps/rejected": -60.9052619934082, + "loss": 0.5223, + "losses/dpo": 0.3222103714942932, + "losses/sft": 1.7116018533706665, + "losses/total": 0.3222103714942932, + "ref_logps/chosen": -41.83588409423828, + "ref_logps/rejected": -48.54487228393555, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.6547029614448547, + "rewards/margins": 0.5813360810279846, + "rewards/rejected": -1.2360389232635498, + "step": 143 + }, + { + "epoch": 1.09, + "grad_norm": 8.55836602139343, + "learning_rate": 3.539325842696629e-07, + "logps/chosen": -43.38884353637695, + "logps/rejected": -58.84518051147461, + "loss": 0.5957, + "losses/dpo": 0.458510160446167, + "losses/sft": 1.5300884246826172, + "losses/total": 0.458510160446167, + "ref_logps/chosen": -36.08906555175781, + "ref_logps/rejected": -47.359981536865234, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7299779653549194, + "rewards/margins": 0.41854166984558105, + "rewards/rejected": -1.14851975440979, + "step": 144 + }, + { + "epoch": 1.09, + "grad_norm": 7.848515280846644, + "learning_rate": 3.5252808988764045e-07, + "logps/chosen": -44.214759826660156, + "logps/rejected": -55.23337936401367, + "loss": 0.5469, + "losses/dpo": 0.47486573457717896, + "losses/sft": 1.7108842134475708, + "losses/total": 0.47486573457717896, + "ref_logps/chosen": -38.41852569580078, + "ref_logps/rejected": -44.74144744873047, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.5796229243278503, + "rewards/margins": 0.4695700407028198, + "rewards/rejected": -1.049193024635315, + "step": 145 + }, + { + "epoch": 1.1, + "grad_norm": 7.658781592826533, + "learning_rate": 3.51123595505618e-07, + "logps/chosen": -44.951683044433594, + "logps/rejected": -60.23070526123047, + "loss": 0.5266, + "losses/dpo": 0.5872490406036377, + "losses/sft": 1.8830925226211548, + "losses/total": 0.5872490406036377, + "ref_logps/chosen": -38.69530487060547, + "ref_logps/rejected": -48.41132354736328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6256377696990967, + "rewards/margins": 0.5562998056411743, + "rewards/rejected": -1.1819374561309814, + "step": 146 + }, + { + "epoch": 1.11, + "grad_norm": 8.54544198888784, + "learning_rate": 3.497191011235955e-07, + "logps/chosen": -48.944068908691406, + "logps/rejected": -63.39553451538086, + "loss": 0.533, + "losses/dpo": 0.40039166808128357, + "losses/sft": 1.862857460975647, + "losses/total": 0.40039166808128357, + "ref_logps/chosen": -42.56263732910156, + "ref_logps/rejected": -50.94089126586914, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6381431818008423, + "rewards/margins": 0.6073207259178162, + "rewards/rejected": -1.2454639673233032, + "step": 147 + }, + { + "epoch": 1.12, + "grad_norm": 7.389416730585067, + "learning_rate": 3.48314606741573e-07, + "logps/chosen": -38.91257858276367, + "logps/rejected": -49.71195602416992, + "loss": 0.5426, + "losses/dpo": 0.46995991468429565, + "losses/sft": 1.7020084857940674, + "losses/total": 0.46995991468429565, + "ref_logps/chosen": -33.769771575927734, + "ref_logps/rejected": -40.01531219482422, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.5142804384231567, + "rewards/margins": 0.45538395643234253, + "rewards/rejected": -0.969664454460144, + "step": 148 + }, + { + "epoch": 1.12, + "grad_norm": 7.674169098254074, + "learning_rate": 3.469101123595505e-07, + "logps/chosen": -51.57361602783203, + "logps/rejected": -61.665931701660156, + "loss": 0.4713, + "losses/dpo": 0.4245557487010956, + "losses/sft": 1.5430463552474976, + "losses/total": 0.4245557487010956, + "ref_logps/chosen": -44.85792541503906, + "ref_logps/rejected": -48.205955505371094, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.6715684533119202, + "rewards/margins": 0.6744291186332703, + "rewards/rejected": -1.3459975719451904, + "step": 149 + }, + { + "epoch": 1.13, + "grad_norm": 22.33237134875282, + "learning_rate": 3.4550561797752807e-07, + "logps/chosen": -47.884517669677734, + "logps/rejected": -58.32150650024414, + "loss": 0.5577, + "losses/dpo": 0.5011469721794128, + "losses/sft": 1.6741063594818115, + "losses/total": 0.5011469721794128, + "ref_logps/chosen": -40.72835159301758, + "ref_logps/rejected": -46.43974685668945, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.7156162261962891, + "rewards/margins": 0.47256001830101013, + "rewards/rejected": -1.1881763935089111, + "step": 150 + }, + { + "epoch": 1.14, + "grad_norm": 7.494096272004472, + "learning_rate": 3.441011235955056e-07, + "logps/chosen": -47.11280059814453, + "logps/rejected": -60.22679901123047, + "loss": 0.4938, + "losses/dpo": 0.5828070044517517, + "losses/sft": 1.9693294763565063, + "losses/total": 0.5828070044517517, + "ref_logps/chosen": -41.27467727661133, + "ref_logps/rejected": -47.46904754638672, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.5838119983673096, + "rewards/margins": 0.691962480545044, + "rewards/rejected": -1.2757744789123535, + "step": 151 + }, + { + "epoch": 1.15, + "grad_norm": 7.516820355286503, + "learning_rate": 3.4269662921348313e-07, + "logps/chosen": -40.18495559692383, + "logps/rejected": -54.60972595214844, + "loss": 0.5232, + "losses/dpo": 0.5568748116493225, + "losses/sft": 1.7009055614471436, + "losses/total": 0.5568748116493225, + "ref_logps/chosen": -33.816017150878906, + "ref_logps/rejected": -43.013160705566406, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.6368939280509949, + "rewards/margins": 0.5227622389793396, + "rewards/rejected": -1.159656047821045, + "step": 152 + }, + { + "epoch": 1.15, + "grad_norm": 7.502946402300403, + "learning_rate": 3.4129213483146064e-07, + "logps/chosen": -42.238433837890625, + "logps/rejected": -53.268348693847656, + "loss": 0.543, + "losses/dpo": 0.5467118620872498, + "losses/sft": 1.2100037336349487, + "losses/total": 0.5467118620872498, + "ref_logps/chosen": -35.477867126464844, + "ref_logps/rejected": -41.79436111450195, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.676056981086731, + "rewards/margins": 0.47134220600128174, + "rewards/rejected": -1.1473990678787231, + "step": 153 + }, + { + "epoch": 1.16, + "grad_norm": 7.772041407679626, + "learning_rate": 3.398876404494382e-07, + "logps/chosen": -44.26182174682617, + "logps/rejected": -64.4139633178711, + "loss": 0.4692, + "losses/dpo": 0.4075261354446411, + "losses/sft": 1.9415578842163086, + "losses/total": 0.4075261354446411, + "ref_logps/chosen": -37.64110565185547, + "ref_logps/rejected": -50.549678802490234, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.6620715260505676, + "rewards/margins": 0.7243567109107971, + "rewards/rejected": -1.3864283561706543, + "step": 154 + }, + { + "epoch": 1.17, + "grad_norm": 9.887168758150576, + "learning_rate": 3.3848314606741575e-07, + "logps/chosen": -46.856407165527344, + "logps/rejected": -60.13237380981445, + "loss": 0.5388, + "losses/dpo": 0.4830757975578308, + "losses/sft": 1.654982328414917, + "losses/total": 0.4830757975578308, + "ref_logps/chosen": -39.95355987548828, + "ref_logps/rejected": -47.57228088378906, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.6902844905853271, + "rewards/margins": 0.5657243132591248, + "rewards/rejected": -1.2560089826583862, + "step": 155 + }, + { + "epoch": 1.18, + "grad_norm": 8.0774394495718, + "learning_rate": 3.3707865168539325e-07, + "logps/chosen": -46.11971664428711, + "logps/rejected": -58.2076301574707, + "loss": 0.5571, + "losses/dpo": 0.6668601036071777, + "losses/sft": 2.1451334953308105, + "losses/total": 0.6668601036071777, + "ref_logps/chosen": -38.718963623046875, + "ref_logps/rejected": -45.97993850708008, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.7400756478309631, + "rewards/margins": 0.48269355297088623, + "rewards/rejected": -1.2227692604064941, + "step": 156 + }, + { + "epoch": 1.18, + "grad_norm": 7.6995260059404735, + "learning_rate": 3.356741573033708e-07, + "logps/chosen": -42.61933898925781, + "logps/rejected": -53.914058685302734, + "loss": 0.5563, + "losses/dpo": 0.5214600563049316, + "losses/sft": 1.6869933605194092, + "losses/total": 0.5214600563049316, + "ref_logps/chosen": -35.26099395751953, + "ref_logps/rejected": -41.69981002807617, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7358340620994568, + "rewards/margins": 0.4855908751487732, + "rewards/rejected": -1.2214250564575195, + "step": 157 + }, + { + "epoch": 1.19, + "grad_norm": 10.378363778579734, + "learning_rate": 3.3426966292134826e-07, + "logps/chosen": -46.86024856567383, + "logps/rejected": -61.01960754394531, + "loss": 0.5047, + "losses/dpo": 0.35230398178100586, + "losses/sft": 2.1814329624176025, + "losses/total": 0.35230398178100586, + "ref_logps/chosen": -40.29644775390625, + "ref_logps/rejected": -48.00575256347656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6563804149627686, + "rewards/margins": 0.6450048685073853, + "rewards/rejected": -1.3013852834701538, + "step": 158 + }, + { + "epoch": 1.2, + "grad_norm": 8.023005518353788, + "learning_rate": 3.328651685393258e-07, + "logps/chosen": -48.07395553588867, + "logps/rejected": -60.29581069946289, + "loss": 0.5284, + "losses/dpo": 0.57308030128479, + "losses/sft": 2.2541966438293457, + "losses/total": 0.57308030128479, + "ref_logps/chosen": -39.92966842651367, + "ref_logps/rejected": -46.47784423828125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8144292831420898, + "rewards/margins": 0.5673672556877136, + "rewards/rejected": -1.3817965984344482, + "step": 159 + }, + { + "epoch": 1.21, + "grad_norm": 7.252828799858416, + "learning_rate": 3.314606741573033e-07, + "logps/chosen": -45.51723861694336, + "logps/rejected": -60.294551849365234, + "loss": 0.4636, + "losses/dpo": 0.581541895866394, + "losses/sft": 1.6298561096191406, + "losses/total": 0.581541895866394, + "ref_logps/chosen": -38.64426040649414, + "ref_logps/rejected": -45.62995147705078, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.6872978210449219, + "rewards/margins": 0.7791618704795837, + "rewards/rejected": -1.4664596319198608, + "step": 160 + }, + { + "epoch": 1.22, + "grad_norm": 8.57918326492943, + "learning_rate": 3.300561797752809e-07, + "logps/chosen": -50.444820404052734, + "logps/rejected": -56.87160110473633, + "loss": 0.5883, + "losses/dpo": 0.5911461710929871, + "losses/sft": 2.1341657638549805, + "losses/total": 0.5911461710929871, + "ref_logps/chosen": -41.919921875, + "ref_logps/rejected": -43.998451232910156, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.8524903059005737, + "rewards/margins": 0.43482455611228943, + "rewards/rejected": -1.2873148918151855, + "step": 161 + }, + { + "epoch": 1.22, + "grad_norm": 8.38374098065899, + "learning_rate": 3.2865168539325844e-07, + "logps/chosen": -50.57026672363281, + "logps/rejected": -61.819053649902344, + "loss": 0.5544, + "losses/dpo": 0.5535627603530884, + "losses/sft": 1.9382483959197998, + "losses/total": 0.5535627603530884, + "ref_logps/chosen": -41.4891357421875, + "ref_logps/rejected": -47.33740234375, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.9081130027770996, + "rewards/margins": 0.540052056312561, + "rewards/rejected": -1.4481650590896606, + "step": 162 + }, + { + "epoch": 1.23, + "grad_norm": 8.053746658561026, + "learning_rate": 3.2724719101123594e-07, + "logps/chosen": -49.71845245361328, + "logps/rejected": -55.962120056152344, + "loss": 0.5659, + "losses/dpo": 0.5589174628257751, + "losses/sft": 2.0584986209869385, + "losses/total": 0.5589174628257751, + "ref_logps/chosen": -41.30320739746094, + "ref_logps/rejected": -42.65535354614258, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.841524600982666, + "rewards/margins": 0.4891516864299774, + "rewards/rejected": -1.3306763172149658, + "step": 163 + }, + { + "epoch": 1.24, + "grad_norm": 8.128166160006492, + "learning_rate": 3.258426966292135e-07, + "logps/chosen": -49.184532165527344, + "logps/rejected": -62.28634262084961, + "loss": 0.5594, + "losses/dpo": 0.6036563515663147, + "losses/sft": 1.7208425998687744, + "losses/total": 0.6036563515663147, + "ref_logps/chosen": -40.15517044067383, + "ref_logps/rejected": -47.867679595947266, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9029368162155151, + "rewards/margins": 0.5389291644096375, + "rewards/rejected": -1.441866159439087, + "step": 164 + }, + { + "epoch": 1.25, + "grad_norm": 7.785958027787598, + "learning_rate": 3.24438202247191e-07, + "logps/chosen": -46.6161003112793, + "logps/rejected": -64.03797912597656, + "loss": 0.5012, + "losses/dpo": 0.36227643489837646, + "losses/sft": 1.8801686763763428, + "losses/total": 0.36227643489837646, + "ref_logps/chosen": -39.068603515625, + "ref_logps/rejected": -49.793888092041016, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7547495365142822, + "rewards/margins": 0.6696599125862122, + "rewards/rejected": -1.42440927028656, + "step": 165 + }, + { + "epoch": 1.25, + "grad_norm": 8.005747191405192, + "learning_rate": 3.2303370786516856e-07, + "logps/chosen": -47.46510696411133, + "logps/rejected": -64.76780700683594, + "loss": 0.495, + "losses/dpo": 0.5762285590171814, + "losses/sft": 1.6507606506347656, + "losses/total": 0.5762285590171814, + "ref_logps/chosen": -39.22062683105469, + "ref_logps/rejected": -49.37548065185547, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.824447751045227, + "rewards/margins": 0.7147842645645142, + "rewards/rejected": -1.5392321348190308, + "step": 166 + }, + { + "epoch": 1.26, + "grad_norm": 8.587266771042302, + "learning_rate": 3.21629213483146e-07, + "logps/chosen": -44.891380310058594, + "logps/rejected": -63.128692626953125, + "loss": 0.476, + "losses/dpo": 0.3979690968990326, + "losses/sft": 1.9852135181427002, + "losses/total": 0.3979690968990326, + "ref_logps/chosen": -36.76803970336914, + "ref_logps/rejected": -47.980194091796875, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8123339414596558, + "rewards/margins": 0.7025157809257507, + "rewards/rejected": -1.5148497819900513, + "step": 167 + }, + { + "epoch": 1.27, + "grad_norm": 8.326932342308945, + "learning_rate": 3.2022471910112357e-07, + "logps/chosen": -47.12702941894531, + "logps/rejected": -63.2083625793457, + "loss": 0.4967, + "losses/dpo": 0.46893593668937683, + "losses/sft": 2.4472603797912598, + "losses/total": 0.46893593668937683, + "ref_logps/chosen": -38.752357482910156, + "ref_logps/rejected": -48.1095085144043, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -0.8374671339988708, + "rewards/margins": 0.6724185943603516, + "rewards/rejected": -1.5098857879638672, + "step": 168 + }, + { + "epoch": 1.28, + "grad_norm": 8.055720143262379, + "learning_rate": 3.1882022471910107e-07, + "logps/chosen": -47.89605712890625, + "logps/rejected": -61.83879852294922, + "loss": 0.5001, + "losses/dpo": 0.5356773138046265, + "losses/sft": 2.059915781021118, + "losses/total": 0.5356773138046265, + "ref_logps/chosen": -38.51155090332031, + "ref_logps/rejected": -45.76554870605469, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9384507536888123, + "rewards/margins": 0.6688745021820068, + "rewards/rejected": -1.6073251962661743, + "step": 169 + }, + { + "epoch": 1.28, + "grad_norm": 7.586041013117629, + "learning_rate": 3.1741573033707863e-07, + "logps/chosen": -48.82367706298828, + "logps/rejected": -61.73891830444336, + "loss": 0.482, + "losses/dpo": 0.4217451214790344, + "losses/sft": 1.6581342220306396, + "losses/total": 0.4217451214790344, + "ref_logps/chosen": -40.12367248535156, + "ref_logps/rejected": -45.60664367675781, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.870000422000885, + "rewards/margins": 0.7432273626327515, + "rewards/rejected": -1.6132278442382812, + "step": 170 + }, + { + "epoch": 1.29, + "grad_norm": 8.642028616636065, + "learning_rate": 3.160112359550562e-07, + "logps/chosen": -48.638423919677734, + "logps/rejected": -60.67471694946289, + "loss": 0.5046, + "losses/dpo": 0.5264810919761658, + "losses/sft": 1.7720236778259277, + "losses/total": 0.5264810919761658, + "ref_logps/chosen": -39.852603912353516, + "ref_logps/rejected": -44.74626159667969, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.8785818815231323, + "rewards/margins": 0.7142631411552429, + "rewards/rejected": -1.592844843864441, + "step": 171 + }, + { + "epoch": 1.3, + "grad_norm": 8.335982070287319, + "learning_rate": 3.146067415730337e-07, + "logps/chosen": -52.76824951171875, + "logps/rejected": -63.022315979003906, + "loss": 0.5192, + "losses/dpo": 0.4350988268852234, + "losses/sft": 2.3970541954040527, + "losses/total": 0.4350988268852234, + "ref_logps/chosen": -42.989315032958984, + "ref_logps/rejected": -47.05906677246094, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9778933525085449, + "rewards/margins": 0.6184311509132385, + "rewards/rejected": -1.5963245630264282, + "step": 172 + }, + { + "epoch": 1.31, + "grad_norm": 8.969238567903124, + "learning_rate": 3.1320224719101125e-07, + "logps/chosen": -50.0399169921875, + "logps/rejected": -61.899139404296875, + "loss": 0.5552, + "losses/dpo": 0.6638925075531006, + "losses/sft": 2.114647150039673, + "losses/total": 0.6638925075531006, + "ref_logps/chosen": -40.658668518066406, + "ref_logps/rejected": -46.74747848510742, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.938124418258667, + "rewards/margins": 0.5770419239997864, + "rewards/rejected": -1.515166163444519, + "step": 173 + }, + { + "epoch": 1.31, + "grad_norm": 8.235186141597438, + "learning_rate": 3.1179775280898875e-07, + "logps/chosen": -52.105690002441406, + "logps/rejected": -62.85676574707031, + "loss": 0.5092, + "losses/dpo": 0.7253843545913696, + "losses/sft": 2.424346446990967, + "losses/total": 0.7253843545913696, + "ref_logps/chosen": -43.18442916870117, + "ref_logps/rejected": -47.48811340332031, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.8921262621879578, + "rewards/margins": 0.6447390913963318, + "rewards/rejected": -1.5368653535842896, + "step": 174 + }, + { + "epoch": 1.32, + "grad_norm": 7.582218244857748, + "learning_rate": 3.103932584269663e-07, + "logps/chosen": -50.67258071899414, + "logps/rejected": -66.15370178222656, + "loss": 0.4585, + "losses/dpo": 0.45553505420684814, + "losses/sft": 1.935417890548706, + "losses/total": 0.45553505420684814, + "ref_logps/chosen": -41.14632034301758, + "ref_logps/rejected": -48.69713592529297, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.9526264667510986, + "rewards/margins": 0.7930303812026978, + "rewards/rejected": -1.745656967163086, + "step": 175 + }, + { + "epoch": 1.33, + "grad_norm": 8.60397689359237, + "learning_rate": 3.0898876404494376e-07, + "logps/chosen": -48.78178405761719, + "logps/rejected": -56.637123107910156, + "loss": 0.5424, + "losses/dpo": 0.46894580125808716, + "losses/sft": 1.4491811990737915, + "losses/total": 0.46894580125808716, + "ref_logps/chosen": -39.89250183105469, + "ref_logps/rejected": -41.9540901184082, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.8889281749725342, + "rewards/margins": 0.5793753862380981, + "rewards/rejected": -1.4683035612106323, + "step": 176 + }, + { + "epoch": 1.34, + "grad_norm": 8.889595865380464, + "learning_rate": 3.075842696629213e-07, + "logps/chosen": -54.22248077392578, + "logps/rejected": -62.7822380065918, + "loss": 0.5381, + "losses/dpo": 0.5002489686012268, + "losses/sft": 1.6078195571899414, + "losses/total": 0.5002489686012268, + "ref_logps/chosen": -43.68180847167969, + "ref_logps/rejected": -46.31658172607422, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -1.0540671348571777, + "rewards/margins": 0.592498779296875, + "rewards/rejected": -1.6465659141540527, + "step": 177 + }, + { + "epoch": 1.34, + "grad_norm": 8.532472841711035, + "learning_rate": 3.0617977528089887e-07, + "logps/chosen": -49.95827865600586, + "logps/rejected": -61.806705474853516, + "loss": 0.5407, + "losses/dpo": 0.6499188542366028, + "losses/sft": 2.202641725540161, + "losses/total": 0.6499188542366028, + "ref_logps/chosen": -39.29413986206055, + "ref_logps/rejected": -44.9674072265625, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0664141178131104, + "rewards/margins": 0.6175155639648438, + "rewards/rejected": -1.6839298009872437, + "step": 178 + }, + { + "epoch": 1.35, + "grad_norm": 8.909287859284307, + "learning_rate": 3.047752808988764e-07, + "logps/chosen": -51.441497802734375, + "logps/rejected": -62.11806106567383, + "loss": 0.527, + "losses/dpo": 0.2888008952140808, + "losses/sft": 1.8648221492767334, + "losses/total": 0.2888008952140808, + "ref_logps/chosen": -41.599273681640625, + "ref_logps/rejected": -46.178306579589844, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.9842216372489929, + "rewards/margins": 0.6097543239593506, + "rewards/rejected": -1.5939760208129883, + "step": 179 + }, + { + "epoch": 1.36, + "grad_norm": 8.118331116269069, + "learning_rate": 3.0337078651685393e-07, + "logps/chosen": -50.35320281982422, + "logps/rejected": -68.11038208007812, + "loss": 0.4825, + "losses/dpo": 0.2915097177028656, + "losses/sft": 2.2731690406799316, + "losses/total": 0.2915097177028656, + "ref_logps/chosen": -40.37377166748047, + "ref_logps/rejected": -50.12421798706055, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.997943103313446, + "rewards/margins": 0.8006736636161804, + "rewards/rejected": -1.7986167669296265, + "step": 180 + }, + { + "epoch": 1.37, + "grad_norm": 7.611558494319546, + "learning_rate": 3.0196629213483144e-07, + "logps/chosen": -48.98621368408203, + "logps/rejected": -63.958377838134766, + "loss": 0.4771, + "losses/dpo": 0.8142632246017456, + "losses/sft": 2.000248670578003, + "losses/total": 0.8142632246017456, + "ref_logps/chosen": -39.60884475708008, + "ref_logps/rejected": -46.56087875366211, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9377367496490479, + "rewards/margins": 0.8020133376121521, + "rewards/rejected": -1.7397500276565552, + "step": 181 + }, + { + "epoch": 1.37, + "grad_norm": 7.87112340864733, + "learning_rate": 3.00561797752809e-07, + "logps/chosen": -50.70545959472656, + "logps/rejected": -60.79212951660156, + "loss": 0.481, + "losses/dpo": 0.4779345989227295, + "losses/sft": 2.002454996109009, + "losses/total": 0.4779345989227295, + "ref_logps/chosen": -40.73643112182617, + "ref_logps/rejected": -43.39889144897461, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.9969026446342468, + "rewards/margins": 0.7424213886260986, + "rewards/rejected": -1.7393240928649902, + "step": 182 + }, + { + "epoch": 1.38, + "grad_norm": 8.88113336772084, + "learning_rate": 2.991573033707865e-07, + "logps/chosen": -49.58301544189453, + "logps/rejected": -58.122802734375, + "loss": 0.5481, + "losses/dpo": 0.5509602427482605, + "losses/sft": 2.1736414432525635, + "losses/total": 0.5509602427482605, + "ref_logps/chosen": -39.8292350769043, + "ref_logps/rejected": -42.636268615722656, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.9753779768943787, + "rewards/margins": 0.5732761025428772, + "rewards/rejected": -1.5486540794372559, + "step": 183 + }, + { + "epoch": 1.39, + "grad_norm": 8.086846655989897, + "learning_rate": 2.9775280898876406e-07, + "logps/chosen": -49.57462692260742, + "logps/rejected": -59.01618194580078, + "loss": 0.5191, + "losses/dpo": 0.41641122102737427, + "losses/sft": 1.5094877481460571, + "losses/total": 0.41641122102737427, + "ref_logps/chosen": -40.067962646484375, + "ref_logps/rejected": -43.40235900878906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9506663084030151, + "rewards/margins": 0.6107163429260254, + "rewards/rejected": -1.561382532119751, + "step": 184 + }, + { + "epoch": 1.4, + "grad_norm": 14.80695955125612, + "learning_rate": 2.9634831460674156e-07, + "logps/chosen": -49.670318603515625, + "logps/rejected": -63.068153381347656, + "loss": 0.4798, + "losses/dpo": 0.71369868516922, + "losses/sft": 2.1753716468811035, + "losses/total": 0.71369868516922, + "ref_logps/chosen": -40.42032241821289, + "ref_logps/rejected": -45.94530487060547, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.9249992966651917, + "rewards/margins": 0.7872861623764038, + "rewards/rejected": -1.7122855186462402, + "step": 185 + }, + { + "epoch": 1.4, + "grad_norm": 7.764172479016813, + "learning_rate": 2.9494382022471906e-07, + "logps/chosen": -46.467315673828125, + "logps/rejected": -61.956661224365234, + "loss": 0.4781, + "losses/dpo": 0.6086141467094421, + "losses/sft": 1.7402431964874268, + "losses/total": 0.6086141467094421, + "ref_logps/chosen": -37.99009704589844, + "ref_logps/rejected": -45.549415588378906, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8477218747138977, + "rewards/margins": 0.7930029630661011, + "rewards/rejected": -1.640724778175354, + "step": 186 + }, + { + "epoch": 1.41, + "grad_norm": 8.22953072067457, + "learning_rate": 2.935393258426966e-07, + "logps/chosen": -50.86983871459961, + "logps/rejected": -62.61697769165039, + "loss": 0.5075, + "losses/dpo": 0.4520256519317627, + "losses/sft": 2.4947710037231445, + "losses/total": 0.4520256519317627, + "ref_logps/chosen": -40.587074279785156, + "ref_logps/rejected": -45.32030487060547, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -1.0282765626907349, + "rewards/margins": 0.7013900279998779, + "rewards/rejected": -1.7296665906906128, + "step": 187 + }, + { + "epoch": 1.42, + "grad_norm": 7.9279440455740735, + "learning_rate": 2.921348314606741e-07, + "logps/chosen": -44.96842956542969, + "logps/rejected": -62.975059509277344, + "loss": 0.5187, + "losses/dpo": 0.5396404266357422, + "losses/sft": 2.048635482788086, + "losses/total": 0.5396404266357422, + "ref_logps/chosen": -34.73614501953125, + "ref_logps/rejected": -46.000450134277344, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -1.023228406906128, + "rewards/margins": 0.6742324829101562, + "rewards/rejected": -1.6974608898162842, + "step": 188 + }, + { + "epoch": 1.43, + "grad_norm": 7.307548456693543, + "learning_rate": 2.907303370786517e-07, + "logps/chosen": -45.52065658569336, + "logps/rejected": -59.8885498046875, + "loss": 0.4755, + "losses/dpo": 0.40278539061546326, + "losses/sft": 2.2951760292053223, + "losses/total": 0.40278539061546326, + "ref_logps/chosen": -37.46648406982422, + "ref_logps/rejected": -44.16557312011719, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8054174184799194, + "rewards/margins": 0.7668801546096802, + "rewards/rejected": -1.5722976922988892, + "step": 189 + }, + { + "epoch": 1.43, + "grad_norm": 8.912502235874074, + "learning_rate": 2.893258426966292e-07, + "logps/chosen": -45.76905059814453, + "logps/rejected": -57.87416076660156, + "loss": 0.5655, + "losses/dpo": 0.5564082860946655, + "losses/sft": 1.3282924890518188, + "losses/total": 0.5564082860946655, + "ref_logps/chosen": -36.90949630737305, + "ref_logps/rejected": -43.09407424926758, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8859553933143616, + "rewards/margins": 0.5920534133911133, + "rewards/rejected": -1.47800874710083, + "step": 190 + }, + { + "epoch": 1.44, + "grad_norm": 7.963453038939498, + "learning_rate": 2.8792134831460674e-07, + "logps/chosen": -48.343929290771484, + "logps/rejected": -68.3438720703125, + "loss": 0.4613, + "losses/dpo": 0.4805658161640167, + "losses/sft": 2.2470126152038574, + "losses/total": 0.4805658161640167, + "ref_logps/chosen": -40.115596771240234, + "ref_logps/rejected": -51.82239532470703, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8228334784507751, + "rewards/margins": 0.8293145895004272, + "rewards/rejected": -1.6521480083465576, + "step": 191 + }, + { + "epoch": 1.45, + "grad_norm": 8.622412292262098, + "learning_rate": 2.8651685393258425e-07, + "logps/chosen": -51.2635383605957, + "logps/rejected": -63.78586959838867, + "loss": 0.4851, + "losses/dpo": 0.38889288902282715, + "losses/sft": 2.7701683044433594, + "losses/total": 0.38889288902282715, + "ref_logps/chosen": -42.14338684082031, + "ref_logps/rejected": -46.94654083251953, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.9120146036148071, + "rewards/margins": 0.7719184756278992, + "rewards/rejected": -1.683933138847351, + "step": 192 + }, + { + "epoch": 1.46, + "grad_norm": 7.759600823991896, + "learning_rate": 2.851123595505618e-07, + "logps/chosen": -46.17103958129883, + "logps/rejected": -60.57073974609375, + "loss": 0.5056, + "losses/dpo": 0.6096426248550415, + "losses/sft": 2.101557493209839, + "losses/total": 0.6096426248550415, + "ref_logps/chosen": -37.342018127441406, + "ref_logps/rejected": -44.97611999511719, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.8829020261764526, + "rewards/margins": 0.6765601634979248, + "rewards/rejected": -1.559462308883667, + "step": 193 + }, + { + "epoch": 1.46, + "grad_norm": 8.401972111307101, + "learning_rate": 2.8370786516853936e-07, + "logps/chosen": -49.945350646972656, + "logps/rejected": -64.91703033447266, + "loss": 0.4635, + "losses/dpo": 0.5600602626800537, + "losses/sft": 1.5397871732711792, + "losses/total": 0.5600602626800537, + "ref_logps/chosen": -41.32893371582031, + "ref_logps/rejected": -47.677955627441406, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.8616417050361633, + "rewards/margins": 0.8622665405273438, + "rewards/rejected": -1.7239081859588623, + "step": 194 + }, + { + "epoch": 1.47, + "grad_norm": 8.847895232096153, + "learning_rate": 2.823033707865168e-07, + "logps/chosen": -45.08359146118164, + "logps/rejected": -65.03630828857422, + "loss": 0.5214, + "losses/dpo": 0.5835427045822144, + "losses/sft": 1.4693069458007812, + "losses/total": 0.5835427045822144, + "ref_logps/chosen": -35.359466552734375, + "ref_logps/rejected": -47.392391204833984, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9724127650260925, + "rewards/margins": 0.7919799089431763, + "rewards/rejected": -1.764392614364624, + "step": 195 + }, + { + "epoch": 1.48, + "grad_norm": 8.759081047535012, + "learning_rate": 2.8089887640449437e-07, + "logps/chosen": -45.32444763183594, + "logps/rejected": -63.420326232910156, + "loss": 0.5066, + "losses/dpo": 0.38791435956954956, + "losses/sft": 1.907132625579834, + "losses/total": 0.38791435956954956, + "ref_logps/chosen": -35.994544982910156, + "ref_logps/rejected": -46.32176971435547, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.932990550994873, + "rewards/margins": 0.776865541934967, + "rewards/rejected": -1.7098561525344849, + "step": 196 + }, + { + "epoch": 1.49, + "grad_norm": 8.54168900663033, + "learning_rate": 2.794943820224719e-07, + "logps/chosen": -53.67341613769531, + "logps/rejected": -73.58582305908203, + "loss": 0.4837, + "losses/dpo": 0.5046126842498779, + "losses/sft": 2.1525635719299316, + "losses/total": 0.5046126842498779, + "ref_logps/chosen": -42.92031478881836, + "ref_logps/rejected": -54.27576446533203, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0753098726272583, + "rewards/margins": 0.8556962013244629, + "rewards/rejected": -1.9310060739517212, + "step": 197 + }, + { + "epoch": 1.49, + "grad_norm": 9.360560561178314, + "learning_rate": 2.7808988764044943e-07, + "logps/chosen": -49.29429626464844, + "logps/rejected": -58.852745056152344, + "loss": 0.5219, + "losses/dpo": 0.6207550168037415, + "losses/sft": 1.4964567422866821, + "losses/total": 0.6207550168037415, + "ref_logps/chosen": -39.879859924316406, + "ref_logps/rejected": -42.66207504272461, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.9414433836936951, + "rewards/margins": 0.6776232719421387, + "rewards/rejected": -1.6190667152404785, + "step": 198 + }, + { + "epoch": 1.5, + "grad_norm": 7.766094681277619, + "learning_rate": 2.7668539325842694e-07, + "logps/chosen": -44.67707443237305, + "logps/rejected": -62.46302032470703, + "loss": 0.4513, + "losses/dpo": 0.3586122393608093, + "losses/sft": 1.7712942361831665, + "losses/total": 0.3586122393608093, + "ref_logps/chosen": -35.756446838378906, + "ref_logps/rejected": -45.0746955871582, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.8920624256134033, + "rewards/margins": 0.8467705249786377, + "rewards/rejected": -1.738832950592041, + "step": 199 + }, + { + "epoch": 1.51, + "grad_norm": 8.387391054319354, + "learning_rate": 2.752808988764045e-07, + "logps/chosen": -51.25703430175781, + "logps/rejected": -66.00263977050781, + "loss": 0.4694, + "losses/dpo": 0.4337347447872162, + "losses/sft": 2.2087419033050537, + "losses/total": 0.4337347447872162, + "ref_logps/chosen": -41.61771011352539, + "ref_logps/rejected": -48.16603469848633, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.96393221616745, + "rewards/margins": 0.8197280764579773, + "rewards/rejected": -1.7836604118347168, + "step": 200 + }, + { + "epoch": 1.52, + "grad_norm": 8.762356246957351, + "learning_rate": 2.73876404494382e-07, + "logps/chosen": -48.135372161865234, + "logps/rejected": -59.56908416748047, + "loss": 0.5349, + "losses/dpo": 0.5300882458686829, + "losses/sft": 1.96072256565094, + "losses/total": 0.5300882458686829, + "ref_logps/chosen": -39.096519470214844, + "ref_logps/rejected": -44.62074279785156, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.9038856625556946, + "rewards/margins": 0.5909484028816223, + "rewards/rejected": -1.4948341846466064, + "step": 201 + }, + { + "epoch": 1.52, + "grad_norm": 8.01612481161353, + "learning_rate": 2.7247191011235955e-07, + "logps/chosen": -48.03499984741211, + "logps/rejected": -58.95391845703125, + "loss": 0.4596, + "losses/dpo": 0.5687890648841858, + "losses/sft": 1.5117721557617188, + "losses/total": 0.5687890648841858, + "ref_logps/chosen": -40.076637268066406, + "ref_logps/rejected": -42.60528564453125, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -0.795836865901947, + "rewards/margins": 0.839026153087616, + "rewards/rejected": -1.634863018989563, + "step": 202 + }, + { + "epoch": 1.53, + "grad_norm": 8.170779294474082, + "learning_rate": 2.710674157303371e-07, + "logps/chosen": -53.97732925415039, + "logps/rejected": -66.41366577148438, + "loss": 0.4697, + "losses/dpo": 0.440776526927948, + "losses/sft": 2.09440279006958, + "losses/total": 0.440776526927948, + "ref_logps/chosen": -44.04298782348633, + "ref_logps/rejected": -48.34318161010742, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -0.9934341311454773, + "rewards/margins": 0.8136138319969177, + "rewards/rejected": -1.8070482015609741, + "step": 203 + }, + { + "epoch": 1.54, + "grad_norm": 8.062617165532144, + "learning_rate": 2.6966292134831456e-07, + "logps/chosen": -51.31819152832031, + "logps/rejected": -66.41058349609375, + "loss": 0.4809, + "losses/dpo": 0.5263036489486694, + "losses/sft": 1.6697288751602173, + "losses/total": 0.5263036489486694, + "ref_logps/chosen": -41.211219787597656, + "ref_logps/rejected": -48.88929748535156, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0106972455978394, + "rewards/margins": 0.7414315938949585, + "rewards/rejected": -1.7521288394927979, + "step": 204 + }, + { + "epoch": 1.55, + "grad_norm": 8.232167547461342, + "learning_rate": 2.682584269662921e-07, + "logps/chosen": -50.255958557128906, + "logps/rejected": -64.30512237548828, + "loss": 0.4906, + "losses/dpo": 0.5959848761558533, + "losses/sft": 2.509490966796875, + "losses/total": 0.5959848761558533, + "ref_logps/chosen": -40.21955490112305, + "ref_logps/rejected": -46.348358154296875, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -1.0036402940750122, + "rewards/margins": 0.7920358180999756, + "rewards/rejected": -1.7956762313842773, + "step": 205 + }, + { + "epoch": 1.55, + "grad_norm": 8.658885149622282, + "learning_rate": 2.668539325842696e-07, + "logps/chosen": -48.23283004760742, + "logps/rejected": -63.8531608581543, + "loss": 0.5012, + "losses/dpo": 0.3958526849746704, + "losses/sft": 1.8156052827835083, + "losses/total": 0.3958526849746704, + "ref_logps/chosen": -39.19361877441406, + "ref_logps/rejected": -46.780818939208984, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9039209485054016, + "rewards/margins": 0.8033130168914795, + "rewards/rejected": -1.7072339057922363, + "step": 206 + }, + { + "epoch": 1.56, + "grad_norm": 9.062222930248339, + "learning_rate": 2.654494382022472e-07, + "logps/chosen": -48.18266296386719, + "logps/rejected": -60.7551155090332, + "loss": 0.5163, + "losses/dpo": 0.5344985127449036, + "losses/sft": 1.5162503719329834, + "losses/total": 0.5344985127449036, + "ref_logps/chosen": -38.624839782714844, + "ref_logps/rejected": -43.208770751953125, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.9557825326919556, + "rewards/margins": 0.798852264881134, + "rewards/rejected": -1.7546348571777344, + "step": 207 + }, + { + "epoch": 1.57, + "grad_norm": 8.337997292070591, + "learning_rate": 2.640449438202247e-07, + "logps/chosen": -46.034690856933594, + "logps/rejected": -58.21810531616211, + "loss": 0.5159, + "losses/dpo": 0.5175392627716064, + "losses/sft": 1.6583523750305176, + "losses/total": 0.5175392627716064, + "ref_logps/chosen": -37.22336959838867, + "ref_logps/rejected": -42.30153274536133, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8811318278312683, + "rewards/margins": 0.7105254530906677, + "rewards/rejected": -1.5916571617126465, + "step": 208 + }, + { + "epoch": 1.58, + "grad_norm": 8.605777081243177, + "learning_rate": 2.6264044943820224e-07, + "logps/chosen": -49.07252502441406, + "logps/rejected": -62.991764068603516, + "loss": 0.5234, + "losses/dpo": 0.3841923177242279, + "losses/sft": 2.0660533905029297, + "losses/total": 0.3841923177242279, + "ref_logps/chosen": -38.91202163696289, + "ref_logps/rejected": -45.201744079589844, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.0160505771636963, + "rewards/margins": 0.7629517316818237, + "rewards/rejected": -1.7790021896362305, + "step": 209 + }, + { + "epoch": 1.58, + "grad_norm": 8.594902163030543, + "learning_rate": 2.612359550561798e-07, + "logps/chosen": -48.94342803955078, + "logps/rejected": -67.45938110351562, + "loss": 0.4628, + "losses/dpo": 0.5517236590385437, + "losses/sft": 1.831754446029663, + "losses/total": 0.5517236590385437, + "ref_logps/chosen": -39.35862731933594, + "ref_logps/rejected": -48.98806381225586, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.9584797620773315, + "rewards/margins": 0.8886520862579346, + "rewards/rejected": -1.8471317291259766, + "step": 210 + }, + { + "epoch": 1.59, + "grad_norm": 9.393987371477554, + "learning_rate": 2.598314606741573e-07, + "logps/chosen": -51.1754035949707, + "logps/rejected": -62.90049362182617, + "loss": 0.538, + "losses/dpo": 0.4337689280509949, + "losses/sft": 1.871553897857666, + "losses/total": 0.4337689280509949, + "ref_logps/chosen": -40.097930908203125, + "ref_logps/rejected": -44.89445877075195, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.1077474355697632, + "rewards/margins": 0.6928560137748718, + "rewards/rejected": -1.8006032705307007, + "step": 211 + }, + { + "epoch": 1.6, + "grad_norm": 8.268094387246107, + "learning_rate": 2.5842696629213486e-07, + "logps/chosen": -46.02891159057617, + "logps/rejected": -58.77110290527344, + "loss": 0.5183, + "losses/dpo": 0.3652556836605072, + "losses/sft": 2.1132707595825195, + "losses/total": 0.3652556836605072, + "ref_logps/chosen": -37.71674346923828, + "ref_logps/rejected": -43.298362731933594, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.8312174677848816, + "rewards/margins": 0.7160569429397583, + "rewards/rejected": -1.5472744703292847, + "step": 212 + }, + { + "epoch": 1.61, + "grad_norm": 8.156887652455024, + "learning_rate": 2.5702247191011236e-07, + "logps/chosen": -53.53913879394531, + "logps/rejected": -65.96989440917969, + "loss": 0.454, + "losses/dpo": 0.6029412746429443, + "losses/sft": 2.5100276470184326, + "losses/total": 0.6029412746429443, + "ref_logps/chosen": -43.845726013183594, + "ref_logps/rejected": -47.28349304199219, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.9693412780761719, + "rewards/margins": 0.899299681186676, + "rewards/rejected": -1.8686408996582031, + "step": 213 + }, + { + "epoch": 1.62, + "grad_norm": 9.181481809417177, + "learning_rate": 2.5561797752808987e-07, + "logps/chosen": -51.8013801574707, + "logps/rejected": -63.46139907836914, + "loss": 0.5059, + "losses/dpo": 0.14845682680606842, + "losses/sft": 1.4499884843826294, + "losses/total": 0.14845682680606842, + "ref_logps/chosen": -42.47813415527344, + "ref_logps/rejected": -46.913028717041016, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.9323242902755737, + "rewards/margins": 0.7225131988525391, + "rewards/rejected": -1.6548374891281128, + "step": 214 + }, + { + "epoch": 1.62, + "grad_norm": 15.612026068166484, + "learning_rate": 2.5421348314606737e-07, + "logps/chosen": -53.003204345703125, + "logps/rejected": -66.69940948486328, + "loss": 0.4433, + "losses/dpo": 0.37596985697746277, + "losses/sft": 1.6896804571151733, + "losses/total": 0.37596985697746277, + "ref_logps/chosen": -43.648799896240234, + "ref_logps/rejected": -48.52040481567383, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -0.9354407787322998, + "rewards/margins": 0.8824598789215088, + "rewards/rejected": -1.8179006576538086, + "step": 215 + }, + { + "epoch": 1.63, + "grad_norm": 9.017380971745943, + "learning_rate": 2.5280898876404493e-07, + "logps/chosen": -53.18143081665039, + "logps/rejected": -68.83674621582031, + "loss": 0.5005, + "losses/dpo": 0.9546025395393372, + "losses/sft": 2.2383298873901367, + "losses/total": 0.9546025395393372, + "ref_logps/chosen": -42.14701843261719, + "ref_logps/rejected": -50.13629913330078, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.103441834449768, + "rewards/margins": 0.7666029334068298, + "rewards/rejected": -1.8700445890426636, + "step": 216 + }, + { + "epoch": 1.64, + "grad_norm": 7.002144270086572, + "learning_rate": 2.5140449438202243e-07, + "logps/chosen": -39.812992095947266, + "logps/rejected": -58.04911804199219, + "loss": 0.4312, + "losses/dpo": 0.3237588107585907, + "losses/sft": 1.9024207592010498, + "losses/total": 0.3237588107585907, + "ref_logps/chosen": -31.948955535888672, + "ref_logps/rejected": -40.56779479980469, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.7864038348197937, + "rewards/margins": 0.9617283940315247, + "rewards/rejected": -1.7481321096420288, + "step": 217 + }, + { + "epoch": 1.65, + "grad_norm": 8.018556807115827, + "learning_rate": 2.5e-07, + "logps/chosen": -47.59326934814453, + "logps/rejected": -64.22843170166016, + "loss": 0.4616, + "losses/dpo": 0.3011893630027771, + "losses/sft": 1.4501286745071411, + "losses/total": 0.3011893630027771, + "ref_logps/chosen": -37.48702621459961, + "ref_logps/rejected": -45.30727767944336, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -1.0106242895126343, + "rewards/margins": 0.8814913630485535, + "rewards/rejected": -1.892115592956543, + "step": 218 + }, + { + "epoch": 1.65, + "grad_norm": 9.421297346982232, + "learning_rate": 2.485955056179775e-07, + "logps/chosen": -52.299922943115234, + "logps/rejected": -62.2135009765625, + "loss": 0.5647, + "losses/dpo": 0.6175757050514221, + "losses/sft": 2.058591365814209, + "losses/total": 0.6175757050514221, + "ref_logps/chosen": -41.575767517089844, + "ref_logps/rejected": -45.39363479614258, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0724154710769653, + "rewards/margins": 0.6095717549324036, + "rewards/rejected": -1.6819872856140137, + "step": 219 + }, + { + "epoch": 1.66, + "grad_norm": 10.845286708552022, + "learning_rate": 2.4719101123595505e-07, + "logps/chosen": -49.824981689453125, + "logps/rejected": -61.322139739990234, + "loss": 0.4823, + "losses/dpo": 0.5226192474365234, + "losses/sft": 1.8831043243408203, + "losses/total": 0.5226192474365234, + "ref_logps/chosen": -39.458709716796875, + "ref_logps/rejected": -43.33076477050781, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.0366266965866089, + "rewards/margins": 0.7625109553337097, + "rewards/rejected": -1.7991377115249634, + "step": 220 + }, + { + "epoch": 1.67, + "grad_norm": 7.736788176006532, + "learning_rate": 2.4578651685393255e-07, + "logps/chosen": -48.30406951904297, + "logps/rejected": -62.53293228149414, + "loss": 0.4473, + "losses/dpo": 0.24700236320495605, + "losses/sft": 1.5504862070083618, + "losses/total": 0.24700236320495605, + "ref_logps/chosen": -38.438228607177734, + "ref_logps/rejected": -43.876705169677734, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.9865838289260864, + "rewards/margins": 0.8790390491485596, + "rewards/rejected": -1.8656229972839355, + "step": 221 + }, + { + "epoch": 1.68, + "grad_norm": 7.9918716943613894, + "learning_rate": 2.443820224719101e-07, + "logps/chosen": -46.76765441894531, + "logps/rejected": -63.072418212890625, + "loss": 0.4597, + "losses/dpo": 0.5004298686981201, + "losses/sft": 2.039869785308838, + "losses/total": 0.5004298686981201, + "ref_logps/chosen": -37.515079498291016, + "ref_logps/rejected": -44.76315689086914, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9252572655677795, + "rewards/margins": 0.9056685566902161, + "rewards/rejected": -1.8309259414672852, + "step": 222 + }, + { + "epoch": 1.68, + "grad_norm": 8.812574527536418, + "learning_rate": 2.429775280898876e-07, + "logps/chosen": -48.68953323364258, + "logps/rejected": -70.00579071044922, + "loss": 0.4842, + "losses/dpo": 0.5064282417297363, + "losses/sft": 1.860622763633728, + "losses/total": 0.5064282417297363, + "ref_logps/chosen": -37.93241500854492, + "ref_logps/rejected": -49.59477996826172, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -1.0757118463516235, + "rewards/margins": 0.9653894305229187, + "rewards/rejected": -2.0411009788513184, + "step": 223 + }, + { + "epoch": 1.69, + "grad_norm": 8.262706110631338, + "learning_rate": 2.4157303370786517e-07, + "logps/chosen": -51.853031158447266, + "logps/rejected": -66.55961608886719, + "loss": 0.4643, + "losses/dpo": 0.32682162523269653, + "losses/sft": 2.5111522674560547, + "losses/total": 0.32682162523269653, + "ref_logps/chosen": -42.025306701660156, + "ref_logps/rejected": -47.904380798339844, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.9827719330787659, + "rewards/margins": 0.8827516436576843, + "rewards/rejected": -1.8655235767364502, + "step": 224 + }, + { + "epoch": 1.7, + "grad_norm": 9.91340992789526, + "learning_rate": 2.401685393258427e-07, + "logps/chosen": -51.620609283447266, + "logps/rejected": -61.01588821411133, + "loss": 0.5382, + "losses/dpo": 0.47428151965141296, + "losses/sft": 1.9888023138046265, + "losses/total": 0.47428151965141296, + "ref_logps/chosen": -40.93183135986328, + "ref_logps/rejected": -43.67429733276367, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -1.0688778162002563, + "rewards/margins": 0.6652814745903015, + "rewards/rejected": -1.7341593503952026, + "step": 225 + }, + { + "epoch": 1.71, + "grad_norm": 8.353660910725022, + "learning_rate": 2.3876404494382023e-07, + "logps/chosen": -49.402103424072266, + "logps/rejected": -61.825992584228516, + "loss": 0.4796, + "losses/dpo": 0.4575900137424469, + "losses/sft": 2.0813965797424316, + "losses/total": 0.4575900137424469, + "ref_logps/chosen": -39.205528259277344, + "ref_logps/rejected": -43.332237243652344, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -1.019657850265503, + "rewards/margins": 0.8297175765037537, + "rewards/rejected": -1.8493753671646118, + "step": 226 + }, + { + "epoch": 1.71, + "grad_norm": 8.489694346068541, + "learning_rate": 2.3735955056179774e-07, + "logps/chosen": -49.869972229003906, + "logps/rejected": -61.926387786865234, + "loss": 0.5007, + "losses/dpo": 0.5196319222450256, + "losses/sft": 2.3197567462921143, + "losses/total": 0.5196319222450256, + "ref_logps/chosen": -40.345924377441406, + "ref_logps/rejected": -43.80282211303711, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.9524051547050476, + "rewards/margins": 0.8599514365196228, + "rewards/rejected": -1.8123565912246704, + "step": 227 + }, + { + "epoch": 1.72, + "grad_norm": 8.455711622023179, + "learning_rate": 2.3595505617977527e-07, + "logps/chosen": -48.83377456665039, + "logps/rejected": -62.39826202392578, + "loss": 0.4853, + "losses/dpo": 0.33744513988494873, + "losses/sft": 2.174797296524048, + "losses/total": 0.33744513988494873, + "ref_logps/chosen": -38.7586555480957, + "ref_logps/rejected": -44.35865783691406, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0075117349624634, + "rewards/margins": 0.7964487075805664, + "rewards/rejected": -1.8039604425430298, + "step": 228 + }, + { + "epoch": 1.73, + "grad_norm": 8.501384419530382, + "learning_rate": 2.345505617977528e-07, + "logps/chosen": -46.079429626464844, + "logps/rejected": -62.38417053222656, + "loss": 0.4593, + "losses/dpo": 0.45904022455215454, + "losses/sft": 2.14939546585083, + "losses/total": 0.45904022455215454, + "ref_logps/chosen": -37.05137634277344, + "ref_logps/rejected": -44.551025390625, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.9028058052062988, + "rewards/margins": 0.8805083632469177, + "rewards/rejected": -1.7833141088485718, + "step": 229 + }, + { + "epoch": 1.74, + "grad_norm": 8.42568627673927, + "learning_rate": 2.331460674157303e-07, + "logps/chosen": -53.63593292236328, + "logps/rejected": -67.56592559814453, + "loss": 0.4141, + "losses/dpo": 0.35481470823287964, + "losses/sft": 1.9238269329071045, + "losses/total": 0.35481470823287964, + "ref_logps/chosen": -43.60770797729492, + "ref_logps/rejected": -47.71710205078125, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.0028222799301147, + "rewards/margins": 0.9820606112480164, + "rewards/rejected": -1.9848829507827759, + "step": 230 + }, + { + "epoch": 1.74, + "grad_norm": 8.985097587921354, + "learning_rate": 2.3174157303370786e-07, + "logps/chosen": -49.890769958496094, + "logps/rejected": -64.36235046386719, + "loss": 0.5005, + "losses/dpo": 0.4699353277683258, + "losses/sft": 1.5774719715118408, + "losses/total": 0.4699353277683258, + "ref_logps/chosen": -39.884613037109375, + "ref_logps/rejected": -46.45123291015625, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0006158351898193, + "rewards/margins": 0.7904958724975586, + "rewards/rejected": -1.7911114692687988, + "step": 231 + }, + { + "epoch": 1.75, + "grad_norm": 8.37260934759885, + "learning_rate": 2.303370786516854e-07, + "logps/chosen": -47.88700485229492, + "logps/rejected": -65.87992095947266, + "loss": 0.4563, + "losses/dpo": 0.4289396107196808, + "losses/sft": 1.9490468502044678, + "losses/total": 0.4289396107196808, + "ref_logps/chosen": -38.816673278808594, + "ref_logps/rejected": -48.137718200683594, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.9070336222648621, + "rewards/margins": 0.8671874403953552, + "rewards/rejected": -1.7742209434509277, + "step": 232 + }, + { + "epoch": 1.76, + "grad_norm": 8.439180796005859, + "learning_rate": 2.2893258426966292e-07, + "logps/chosen": -47.75567626953125, + "logps/rejected": -59.802364349365234, + "loss": 0.5339, + "losses/dpo": 0.3888584077358246, + "losses/sft": 1.453101396560669, + "losses/total": 0.3888584077358246, + "ref_logps/chosen": -38.37287521362305, + "ref_logps/rejected": -43.31857681274414, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.9382801055908203, + "rewards/margins": 0.7100984454154968, + "rewards/rejected": -1.6483784914016724, + "step": 233 + }, + { + "epoch": 1.77, + "grad_norm": 8.820079708795298, + "learning_rate": 2.2752808988764045e-07, + "logps/chosen": -53.505958557128906, + "logps/rejected": -66.17211151123047, + "loss": 0.4792, + "losses/dpo": 0.5977008938789368, + "losses/sft": 1.5070393085479736, + "losses/total": 0.5977008938789368, + "ref_logps/chosen": -42.90943145751953, + "ref_logps/rejected": -47.009498596191406, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.0596526861190796, + "rewards/margins": 0.8566086292266846, + "rewards/rejected": -1.9162614345550537, + "step": 234 + }, + { + "epoch": 1.77, + "grad_norm": 8.954294680805216, + "learning_rate": 2.2612359550561795e-07, + "logps/chosen": -51.98886489868164, + "logps/rejected": -65.2154769897461, + "loss": 0.4863, + "losses/dpo": 0.4061537981033325, + "losses/sft": 1.7505851984024048, + "losses/total": 0.4061537981033325, + "ref_logps/chosen": -41.45209503173828, + "ref_logps/rejected": -46.05805969238281, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -1.0536775588989258, + "rewards/margins": 0.8620648980140686, + "rewards/rejected": -1.9157423973083496, + "step": 235 + }, + { + "epoch": 1.78, + "grad_norm": 8.276096992832155, + "learning_rate": 2.2471910112359549e-07, + "logps/chosen": -46.52098846435547, + "logps/rejected": -60.28398895263672, + "loss": 0.4849, + "losses/dpo": 0.4493502676486969, + "losses/sft": 1.8399590253829956, + "losses/total": 0.4493502676486969, + "ref_logps/chosen": -37.41412353515625, + "ref_logps/rejected": -42.95138931274414, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.9106867909431458, + "rewards/margins": 0.8225734233856201, + "rewards/rejected": -1.733260154724121, + "step": 236 + }, + { + "epoch": 1.79, + "grad_norm": 9.148667637325849, + "learning_rate": 2.2331460674157302e-07, + "logps/chosen": -49.735443115234375, + "logps/rejected": -68.21788787841797, + "loss": 0.49, + "losses/dpo": 0.2873924672603607, + "losses/sft": 1.5266362428665161, + "losses/total": 0.2873924672603607, + "ref_logps/chosen": -39.63804244995117, + "ref_logps/rejected": -49.29356384277344, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0097399950027466, + "rewards/margins": 0.8826919198036194, + "rewards/rejected": -1.8924317359924316, + "step": 237 + }, + { + "epoch": 1.8, + "grad_norm": 8.143852894216051, + "learning_rate": 2.2191011235955055e-07, + "logps/chosen": -48.24225616455078, + "logps/rejected": -63.4071044921875, + "loss": 0.4505, + "losses/dpo": 0.4800674617290497, + "losses/sft": 1.840663194656372, + "losses/total": 0.4800674617290497, + "ref_logps/chosen": -38.12638854980469, + "ref_logps/rejected": -44.40414047241211, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -1.0115869045257568, + "rewards/margins": 0.8887090682983398, + "rewards/rejected": -1.9002958536148071, + "step": 238 + }, + { + "epoch": 1.8, + "grad_norm": 8.978669612809192, + "learning_rate": 2.205056179775281e-07, + "logps/chosen": -49.006935119628906, + "logps/rejected": -64.8754653930664, + "loss": 0.4701, + "losses/dpo": 0.3446974754333496, + "losses/sft": 1.800228476524353, + "losses/total": 0.3446974754333496, + "ref_logps/chosen": -39.40437698364258, + "ref_logps/rejected": -46.17237091064453, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9602562189102173, + "rewards/margins": 0.9100530743598938, + "rewards/rejected": -1.8703094720840454, + "step": 239 + }, + { + "epoch": 1.81, + "grad_norm": 9.39799976416451, + "learning_rate": 2.191011235955056e-07, + "logps/chosen": -49.0966682434082, + "logps/rejected": -62.466331481933594, + "loss": 0.4979, + "losses/dpo": 0.7114862203598022, + "losses/sft": 1.6450343132019043, + "losses/total": 0.7114862203598022, + "ref_logps/chosen": -38.396942138671875, + "ref_logps/rejected": -43.12867736816406, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -1.0699726343154907, + "rewards/margins": 0.8637927770614624, + "rewards/rejected": -1.9337654113769531, + "step": 240 + }, + { + "epoch": 1.82, + "grad_norm": 7.9075518471198585, + "learning_rate": 2.1769662921348314e-07, + "logps/chosen": -49.44485092163086, + "logps/rejected": -64.67866516113281, + "loss": 0.4424, + "losses/dpo": 0.3071708679199219, + "losses/sft": 2.3602986335754395, + "losses/total": 0.3071708679199219, + "ref_logps/chosen": -39.44845199584961, + "ref_logps/rejected": -45.48222351074219, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.999640703201294, + "rewards/margins": 0.9200041890144348, + "rewards/rejected": -1.919644832611084, + "step": 241 + }, + { + "epoch": 1.83, + "grad_norm": 8.785359064590306, + "learning_rate": 2.1629213483146067e-07, + "logps/chosen": -53.305233001708984, + "logps/rejected": -65.16683959960938, + "loss": 0.4948, + "losses/dpo": 0.5694843530654907, + "losses/sft": 1.6856664419174194, + "losses/total": 0.5694843530654907, + "ref_logps/chosen": -42.595603942871094, + "ref_logps/rejected": -46.201744079589844, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.0709630250930786, + "rewards/margins": 0.8255467414855957, + "rewards/rejected": -1.8965098857879639, + "step": 242 + }, + { + "epoch": 1.83, + "grad_norm": 9.446502314051532, + "learning_rate": 2.148876404494382e-07, + "logps/chosen": -48.44091033935547, + "logps/rejected": -60.52621078491211, + "loss": 0.526, + "losses/dpo": 0.45424705743789673, + "losses/sft": 2.151371479034424, + "losses/total": 0.45424705743789673, + "ref_logps/chosen": -38.4775390625, + "ref_logps/rejected": -43.26105499267578, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9963367581367493, + "rewards/margins": 0.730178952217102, + "rewards/rejected": -1.726515531539917, + "step": 243 + }, + { + "epoch": 1.84, + "grad_norm": 8.81696350872476, + "learning_rate": 2.134831460674157e-07, + "logps/chosen": -50.6672477722168, + "logps/rejected": -66.28202819824219, + "loss": 0.4731, + "losses/dpo": 0.3640851080417633, + "losses/sft": 1.5808416604995728, + "losses/total": 0.3640851080417633, + "ref_logps/chosen": -40.499908447265625, + "ref_logps/rejected": -47.59101486206055, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0167338848114014, + "rewards/margins": 0.852367103099823, + "rewards/rejected": -1.8691009283065796, + "step": 244 + }, + { + "epoch": 1.85, + "grad_norm": 9.28428850947705, + "learning_rate": 2.1207865168539323e-07, + "logps/chosen": -54.07783889770508, + "logps/rejected": -65.76669311523438, + "loss": 0.5163, + "losses/dpo": 0.8544118404388428, + "losses/sft": 1.9635812044143677, + "losses/total": 0.8544118404388428, + "ref_logps/chosen": -42.8679084777832, + "ref_logps/rejected": -46.79513931274414, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -1.120992660522461, + "rewards/margins": 0.7761632204055786, + "rewards/rejected": -1.897156000137329, + "step": 245 + }, + { + "epoch": 1.86, + "grad_norm": 9.752420565118218, + "learning_rate": 2.1067415730337076e-07, + "logps/chosen": -49.50055694580078, + "logps/rejected": -65.74305725097656, + "loss": 0.4852, + "losses/dpo": 0.23906151950359344, + "losses/sft": 1.417936086654663, + "losses/total": 0.23906151950359344, + "ref_logps/chosen": -39.56950378417969, + "ref_logps/rejected": -47.11359405517578, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9931050539016724, + "rewards/margins": 0.8698407411575317, + "rewards/rejected": -1.8629456758499146, + "step": 246 + }, + { + "epoch": 1.86, + "grad_norm": 9.475549639179285, + "learning_rate": 2.0926966292134832e-07, + "logps/chosen": -49.32406997680664, + "logps/rejected": -63.67836380004883, + "loss": 0.5131, + "losses/dpo": 0.4199092984199524, + "losses/sft": 1.6027624607086182, + "losses/total": 0.4199092984199524, + "ref_logps/chosen": -39.44010925292969, + "ref_logps/rejected": -46.58472442626953, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.988396942615509, + "rewards/margins": 0.7209669947624207, + "rewards/rejected": -1.7093638181686401, + "step": 247 + }, + { + "epoch": 1.87, + "grad_norm": 8.691211910246372, + "learning_rate": 2.0786516853932585e-07, + "logps/chosen": -50.76154327392578, + "logps/rejected": -63.2697868347168, + "loss": 0.4931, + "losses/dpo": 0.48489272594451904, + "losses/sft": 2.4353795051574707, + "losses/total": 0.48489272594451904, + "ref_logps/chosen": -40.483741760253906, + "ref_logps/rejected": -45.29812240600586, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0277801752090454, + "rewards/margins": 0.7693858742713928, + "rewards/rejected": -1.7971662282943726, + "step": 248 + }, + { + "epoch": 1.88, + "grad_norm": 9.872927469509172, + "learning_rate": 2.0646067415730336e-07, + "logps/chosen": -49.138465881347656, + "logps/rejected": -66.16885375976562, + "loss": 0.5055, + "losses/dpo": 0.5177363753318787, + "losses/sft": 1.834108829498291, + "losses/total": 0.5177363753318787, + "ref_logps/chosen": -38.63552474975586, + "ref_logps/rejected": -48.135929107666016, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.050294280052185, + "rewards/margins": 0.7529983520507812, + "rewards/rejected": -1.8032926321029663, + "step": 249 + }, + { + "epoch": 1.89, + "grad_norm": 9.265714326509956, + "learning_rate": 2.0505617977528089e-07, + "logps/chosen": -48.308753967285156, + "logps/rejected": -66.52729034423828, + "loss": 0.486, + "losses/dpo": 0.6348212957382202, + "losses/sft": 1.772031307220459, + "losses/total": 0.6348212957382202, + "ref_logps/chosen": -38.7848014831543, + "ref_logps/rejected": -48.895755767822266, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.9523951411247253, + "rewards/margins": 0.8107584714889526, + "rewards/rejected": -1.7631536722183228, + "step": 250 + }, + { + "epoch": 1.89, + "grad_norm": 9.119904977454695, + "learning_rate": 2.0365168539325842e-07, + "logps/chosen": -50.29534912109375, + "logps/rejected": -64.1716537475586, + "loss": 0.4977, + "losses/dpo": 0.522094190120697, + "losses/sft": 1.8622956275939941, + "losses/total": 0.522094190120697, + "ref_logps/chosen": -39.614479064941406, + "ref_logps/rejected": -45.53257751464844, + "rewards/accuracies": 0.796875, + "rewards/chosen": -1.0680873394012451, + "rewards/margins": 0.7958202958106995, + "rewards/rejected": -1.8639075756072998, + "step": 251 + }, + { + "epoch": 1.9, + "grad_norm": 8.300519651321538, + "learning_rate": 2.0224719101123595e-07, + "logps/chosen": -48.15717315673828, + "logps/rejected": -66.43309783935547, + "loss": 0.4749, + "losses/dpo": 0.7441291809082031, + "losses/sft": 2.440709352493286, + "losses/total": 0.7441291809082031, + "ref_logps/chosen": -38.77737808227539, + "ref_logps/rejected": -47.56039047241211, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.9379786252975464, + "rewards/margins": 0.9492916464805603, + "rewards/rejected": -1.887270212173462, + "step": 252 + }, + { + "epoch": 1.91, + "grad_norm": 9.317842509294723, + "learning_rate": 2.0084269662921348e-07, + "logps/chosen": -44.09011459350586, + "logps/rejected": -59.16832733154297, + "loss": 0.5178, + "losses/dpo": 0.8791393041610718, + "losses/sft": 2.401695489883423, + "losses/total": 0.8791393041610718, + "ref_logps/chosen": -35.33125305175781, + "ref_logps/rejected": -43.098087310791016, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.8758863210678101, + "rewards/margins": 0.7311373353004456, + "rewards/rejected": -1.60702383518219, + "step": 253 + }, + { + "epoch": 1.92, + "grad_norm": 8.201438226460152, + "learning_rate": 1.9943820224719098e-07, + "logps/chosen": -51.51540756225586, + "logps/rejected": -67.69857788085938, + "loss": 0.4104, + "losses/dpo": 0.48046156764030457, + "losses/sft": 1.7079527378082275, + "losses/total": 0.48046156764030457, + "ref_logps/chosen": -42.20224380493164, + "ref_logps/rejected": -47.85743713378906, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -0.9313161373138428, + "rewards/margins": 1.0527985095977783, + "rewards/rejected": -1.9841147661209106, + "step": 254 + }, + { + "epoch": 1.92, + "grad_norm": 9.182124453243851, + "learning_rate": 1.9803370786516854e-07, + "logps/chosen": -52.86988067626953, + "logps/rejected": -64.82228088378906, + "loss": 0.4958, + "losses/dpo": 0.4478102922439575, + "losses/sft": 1.8136005401611328, + "losses/total": 0.4478102922439575, + "ref_logps/chosen": -43.32461929321289, + "ref_logps/rejected": -48.31917190551758, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.9545266032218933, + "rewards/margins": 0.6957840323448181, + "rewards/rejected": -1.650310754776001, + "step": 255 + }, + { + "epoch": 1.93, + "grad_norm": 8.664903887618609, + "learning_rate": 1.9662921348314607e-07, + "logps/chosen": -47.40139389038086, + "logps/rejected": -61.24211502075195, + "loss": 0.5069, + "losses/dpo": 0.45637544989585876, + "losses/sft": 2.080510139465332, + "losses/total": 0.45637544989585876, + "ref_logps/chosen": -37.96201705932617, + "ref_logps/rejected": -44.53689956665039, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.9439379572868347, + "rewards/margins": 0.7265833616256714, + "rewards/rejected": -1.6705212593078613, + "step": 256 + }, + { + "epoch": 1.94, + "grad_norm": 8.357609816704013, + "learning_rate": 1.952247191011236e-07, + "logps/chosen": -51.97878646850586, + "logps/rejected": -66.72062683105469, + "loss": 0.4312, + "losses/dpo": 0.3072975277900696, + "losses/sft": 1.8392665386199951, + "losses/total": 0.3072975277900696, + "ref_logps/chosen": -42.230186462402344, + "ref_logps/rejected": -47.218257904052734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9748601913452148, + "rewards/margins": 0.9753769040107727, + "rewards/rejected": -1.9502369165420532, + "step": 257 + }, + { + "epoch": 1.95, + "grad_norm": 8.434371561430744, + "learning_rate": 1.938202247191011e-07, + "logps/chosen": -50.701534271240234, + "logps/rejected": -65.6552734375, + "loss": 0.4402, + "losses/dpo": 0.4483451545238495, + "losses/sft": 1.6883811950683594, + "losses/total": 0.4483451545238495, + "ref_logps/chosen": -40.96206283569336, + "ref_logps/rejected": -46.52254104614258, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.9739474654197693, + "rewards/margins": 0.9393259286880493, + "rewards/rejected": -1.9132733345031738, + "step": 258 + }, + { + "epoch": 1.95, + "grad_norm": 8.326106947840566, + "learning_rate": 1.9241573033707863e-07, + "logps/chosen": -49.05642318725586, + "logps/rejected": -62.65263366699219, + "loss": 0.4559, + "losses/dpo": 0.43069154024124146, + "losses/sft": 2.6451079845428467, + "losses/total": 0.43069154024124146, + "ref_logps/chosen": -39.344356536865234, + "ref_logps/rejected": -43.80147933959961, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9712071418762207, + "rewards/margins": 0.9139088988304138, + "rewards/rejected": -1.8851161003112793, + "step": 259 + }, + { + "epoch": 1.96, + "grad_norm": 10.317747735678177, + "learning_rate": 1.9101123595505617e-07, + "logps/chosen": -47.29045867919922, + "logps/rejected": -60.76853942871094, + "loss": 0.5233, + "losses/dpo": 0.6233609914779663, + "losses/sft": 2.166818618774414, + "losses/total": 0.6233609914779663, + "ref_logps/chosen": -37.4037971496582, + "ref_logps/rejected": -43.672698974609375, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.9886665344238281, + "rewards/margins": 0.7209180593490601, + "rewards/rejected": -1.7095845937728882, + "step": 260 + }, + { + "epoch": 1.97, + "grad_norm": 8.570482678193486, + "learning_rate": 1.896067415730337e-07, + "logps/chosen": -47.826866149902344, + "logps/rejected": -64.69200897216797, + "loss": 0.4646, + "losses/dpo": 0.39222848415374756, + "losses/sft": 1.7622792720794678, + "losses/total": 0.39222848415374756, + "ref_logps/chosen": -37.465206146240234, + "ref_logps/rejected": -45.77225112915039, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0361659526824951, + "rewards/margins": 0.8558104634284973, + "rewards/rejected": -1.8919763565063477, + "step": 261 + }, + { + "epoch": 1.98, + "grad_norm": 8.082044540577316, + "learning_rate": 1.8820224719101123e-07, + "logps/chosen": -44.3437385559082, + "logps/rejected": -65.46928405761719, + "loss": 0.4247, + "losses/dpo": 0.25466495752334595, + "losses/sft": 1.6573872566223145, + "losses/total": 0.25466495752334595, + "ref_logps/chosen": -35.26055908203125, + "ref_logps/rejected": -46.34375, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.908318042755127, + "rewards/margins": 1.0042363405227661, + "rewards/rejected": -1.9125542640686035, + "step": 262 + }, + { + "epoch": 1.98, + "grad_norm": 7.886213605680278, + "learning_rate": 1.8679775280898876e-07, + "logps/chosen": -44.65882873535156, + "logps/rejected": -65.91732025146484, + "loss": 0.4233, + "losses/dpo": 0.37609466910362244, + "losses/sft": 1.7309682369232178, + "losses/total": 0.37609466910362244, + "ref_logps/chosen": -34.98197555541992, + "ref_logps/rejected": -45.339569091796875, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9676854610443115, + "rewards/margins": 1.0900897979736328, + "rewards/rejected": -2.0577754974365234, + "step": 263 + }, + { + "epoch": 1.99, + "grad_norm": 9.420349702400047, + "learning_rate": 1.853932584269663e-07, + "logps/chosen": -55.511512756347656, + "logps/rejected": -67.9210205078125, + "loss": 0.4846, + "losses/dpo": 0.35744136571884155, + "losses/sft": 1.9689966440200806, + "losses/total": 0.35744136571884155, + "ref_logps/chosen": -44.93099594116211, + "ref_logps/rejected": -48.529052734375, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.058051347732544, + "rewards/margins": 0.8811461925506592, + "rewards/rejected": -1.9391975402832031, + "step": 264 + }, + { + "epoch": 2.0, + "grad_norm": 8.90872581509833, + "learning_rate": 1.8398876404494382e-07, + "logps/chosen": -50.006412506103516, + "logps/rejected": -65.82926940917969, + "loss": 0.4658, + "losses/dpo": 0.42064571380615234, + "losses/sft": 1.965097427368164, + "losses/total": 0.42064571380615234, + "ref_logps/chosen": -39.1809196472168, + "ref_logps/rejected": -45.722938537597656, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -1.0825499296188354, + "rewards/margins": 0.9280825853347778, + "rewards/rejected": -2.0106325149536133, + "step": 265 + }, + { + "epoch": 2.01, + "grad_norm": 7.965650717330079, + "learning_rate": 1.8258426966292135e-07, + "logps/chosen": -50.00637435913086, + "logps/rejected": -65.08876037597656, + "loss": 0.4116, + "losses/dpo": 0.36081230640411377, + "losses/sft": 2.0271382331848145, + "losses/total": 0.36081230640411377, + "ref_logps/chosen": -39.65681076049805, + "ref_logps/rejected": -44.160343170166016, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.0349565744400024, + "rewards/margins": 1.0578850507736206, + "rewards/rejected": -2.092841625213623, + "step": 266 + }, + { + "epoch": 2.02, + "grad_norm": 8.213010773638015, + "learning_rate": 1.8117977528089888e-07, + "logps/chosen": -49.70448303222656, + "logps/rejected": -64.45352172851562, + "loss": 0.4328, + "losses/dpo": 0.4751141667366028, + "losses/sft": 2.163590431213379, + "losses/total": 0.4751141667366028, + "ref_logps/chosen": -39.66984558105469, + "ref_logps/rejected": -45.02206802368164, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -1.0034637451171875, + "rewards/margins": 0.9396811723709106, + "rewards/rejected": -1.943144679069519, + "step": 267 + }, + { + "epoch": 2.02, + "grad_norm": 8.627785952714778, + "learning_rate": 1.7977528089887638e-07, + "logps/chosen": -47.43844985961914, + "logps/rejected": -61.57966995239258, + "loss": 0.4366, + "losses/dpo": 0.4777096211910248, + "losses/sft": 2.011448860168457, + "losses/total": 0.4777096211910248, + "ref_logps/chosen": -37.887752532958984, + "ref_logps/rejected": -42.577537536621094, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -0.9550699591636658, + "rewards/margins": 0.945143461227417, + "rewards/rejected": -1.9002132415771484, + "step": 268 + }, + { + "epoch": 2.03, + "grad_norm": 9.54969996951565, + "learning_rate": 1.7837078651685391e-07, + "logps/chosen": -50.50410842895508, + "logps/rejected": -64.97512817382812, + "loss": 0.4367, + "losses/dpo": 0.4186415672302246, + "losses/sft": 2.4884321689605713, + "losses/total": 0.4186415672302246, + "ref_logps/chosen": -39.366878509521484, + "ref_logps/rejected": -44.09857940673828, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -1.1137233972549438, + "rewards/margins": 0.973931610584259, + "rewards/rejected": -2.0876548290252686, + "step": 269 + }, + { + "epoch": 2.04, + "grad_norm": 7.541865642493473, + "learning_rate": 1.7696629213483144e-07, + "logps/chosen": -53.07981491088867, + "logps/rejected": -69.67108917236328, + "loss": 0.3686, + "losses/dpo": 0.23445191979408264, + "losses/sft": 1.9627153873443604, + "losses/total": 0.23445191979408264, + "ref_logps/chosen": -41.86531448364258, + "ref_logps/rejected": -46.77862548828125, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.121450424194336, + "rewards/margins": 1.1677953004837036, + "rewards/rejected": -2.289245843887329, + "step": 270 + }, + { + "epoch": 2.05, + "grad_norm": 7.78557132067994, + "learning_rate": 1.75561797752809e-07, + "logps/chosen": -44.59587478637695, + "logps/rejected": -61.11756896972656, + "loss": 0.4304, + "losses/dpo": 0.40728461742401123, + "losses/sft": 2.1670854091644287, + "losses/total": 0.40728461742401123, + "ref_logps/chosen": -34.50210189819336, + "ref_logps/rejected": -41.35382843017578, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.009376883506775, + "rewards/margins": 0.9669971466064453, + "rewards/rejected": -1.9763740301132202, + "step": 271 + }, + { + "epoch": 2.05, + "grad_norm": 8.691528110147415, + "learning_rate": 1.741573033707865e-07, + "logps/chosen": -47.1044807434082, + "logps/rejected": -62.402366638183594, + "loss": 0.4439, + "losses/dpo": 0.6048084497451782, + "losses/sft": 2.738722801208496, + "losses/total": 0.6048084497451782, + "ref_logps/chosen": -36.921566009521484, + "ref_logps/rejected": -41.66175842285156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0182914733886719, + "rewards/margins": 1.0557701587677002, + "rewards/rejected": -2.074061632156372, + "step": 272 + }, + { + "epoch": 2.06, + "grad_norm": 7.3620175100007135, + "learning_rate": 1.7275280898876404e-07, + "logps/chosen": -51.797447204589844, + "logps/rejected": -69.3549575805664, + "loss": 0.3635, + "losses/dpo": 0.4010230600833893, + "losses/sft": 1.782325029373169, + "losses/total": 0.4010230600833893, + "ref_logps/chosen": -41.61540222167969, + "ref_logps/rejected": -47.2431526184082, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -1.0182045698165894, + "rewards/margins": 1.192975640296936, + "rewards/rejected": -2.2111802101135254, + "step": 273 + }, + { + "epoch": 2.07, + "grad_norm": 8.804306158193548, + "learning_rate": 1.7134831460674157e-07, + "logps/chosen": -51.407501220703125, + "logps/rejected": -64.26744079589844, + "loss": 0.4578, + "losses/dpo": 0.6319560408592224, + "losses/sft": 2.214840888977051, + "losses/total": 0.6319560408592224, + "ref_logps/chosen": -40.07988739013672, + "ref_logps/rejected": -43.7442741394043, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -1.1327617168426514, + "rewards/margins": 0.919555127620697, + "rewards/rejected": -2.052316665649414, + "step": 274 + }, + { + "epoch": 2.08, + "grad_norm": 9.887135650412674, + "learning_rate": 1.699438202247191e-07, + "logps/chosen": -48.88153839111328, + "logps/rejected": -64.42852783203125, + "loss": 0.4198, + "losses/dpo": 0.36265987157821655, + "losses/sft": 2.2624428272247314, + "losses/total": 0.36265987157821655, + "ref_logps/chosen": -38.071693420410156, + "ref_logps/rejected": -42.96941375732422, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.080984115600586, + "rewards/margins": 1.064927339553833, + "rewards/rejected": -2.145911455154419, + "step": 275 + }, + { + "epoch": 2.08, + "grad_norm": 7.789984027153804, + "learning_rate": 1.6853932584269663e-07, + "logps/chosen": -53.47975158691406, + "logps/rejected": -69.208251953125, + "loss": 0.4077, + "losses/dpo": 0.3306717872619629, + "losses/sft": 1.8213623762130737, + "losses/total": 0.3306717872619629, + "ref_logps/chosen": -42.26997375488281, + "ref_logps/rejected": -47.255218505859375, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.1209776401519775, + "rewards/margins": 1.0743255615234375, + "rewards/rejected": -2.195303440093994, + "step": 276 + }, + { + "epoch": 2.09, + "grad_norm": 8.577961125042698, + "learning_rate": 1.6713483146067413e-07, + "logps/chosen": -48.9437255859375, + "logps/rejected": -66.896484375, + "loss": 0.4237, + "losses/dpo": 0.375847727060318, + "losses/sft": 1.7302836179733276, + "losses/total": 0.375847727060318, + "ref_logps/chosen": -38.10498046875, + "ref_logps/rejected": -45.20066452026367, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.0838744640350342, + "rewards/margins": 1.0857088565826416, + "rewards/rejected": -2.1695830821990967, + "step": 277 + }, + { + "epoch": 2.1, + "grad_norm": 7.88842347856655, + "learning_rate": 1.6573033707865166e-07, + "logps/chosen": -50.38557815551758, + "logps/rejected": -71.69691467285156, + "loss": 0.3884, + "losses/dpo": 0.20783157646656036, + "losses/sft": 1.65842604637146, + "losses/total": 0.20783157646656036, + "ref_logps/chosen": -38.84345245361328, + "ref_logps/rejected": -47.66883850097656, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -1.1542127132415771, + "rewards/margins": 1.2485952377319336, + "rewards/rejected": -2.40280818939209, + "step": 278 + }, + { + "epoch": 2.11, + "grad_norm": 8.27071868454665, + "learning_rate": 1.6432584269662922e-07, + "logps/chosen": -50.050907135009766, + "logps/rejected": -71.91120147705078, + "loss": 0.3873, + "losses/dpo": 0.36935174465179443, + "losses/sft": 2.4310526847839355, + "losses/total": 0.36935174465179443, + "ref_logps/chosen": -38.60224914550781, + "ref_logps/rejected": -48.42378234863281, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.1448655128479004, + "rewards/margins": 1.2038761377334595, + "rewards/rejected": -2.3487415313720703, + "step": 279 + }, + { + "epoch": 2.11, + "grad_norm": 8.202604558490952, + "learning_rate": 1.6292134831460675e-07, + "logps/chosen": -49.78974533081055, + "logps/rejected": -63.648345947265625, + "loss": 0.4287, + "losses/dpo": 0.4768953025341034, + "losses/sft": 1.486172080039978, + "losses/total": 0.4768953025341034, + "ref_logps/chosen": -38.987091064453125, + "ref_logps/rejected": -42.706016540527344, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.0802651643753052, + "rewards/margins": 1.0139687061309814, + "rewards/rejected": -2.094233751296997, + "step": 280 + }, + { + "epoch": 2.12, + "grad_norm": 7.6837084768811295, + "learning_rate": 1.6151685393258428e-07, + "logps/chosen": -46.654842376708984, + "logps/rejected": -65.57063293457031, + "loss": 0.4002, + "losses/dpo": 0.25231242179870605, + "losses/sft": 1.6833255290985107, + "losses/total": 0.25231242179870605, + "ref_logps/chosen": -35.983055114746094, + "ref_logps/rejected": -43.31926345825195, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -1.0671789646148682, + "rewards/margins": 1.1579577922821045, + "rewards/rejected": -2.2251367568969727, + "step": 281 + }, + { + "epoch": 2.13, + "grad_norm": 7.367492054152066, + "learning_rate": 1.6011235955056178e-07, + "logps/chosen": -47.970069885253906, + "logps/rejected": -67.12972259521484, + "loss": 0.3817, + "losses/dpo": 0.2698941230773926, + "losses/sft": 2.0059823989868164, + "losses/total": 0.2698941230773926, + "ref_logps/chosen": -37.96726608276367, + "ref_logps/rejected": -44.837013244628906, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -1.0002803802490234, + "rewards/margins": 1.2289901971817017, + "rewards/rejected": -2.2292706966400146, + "step": 282 + }, + { + "epoch": 2.14, + "grad_norm": 8.198398756823499, + "learning_rate": 1.5870786516853931e-07, + "logps/chosen": -52.00371551513672, + "logps/rejected": -71.66854095458984, + "loss": 0.373, + "losses/dpo": 0.37371307611465454, + "losses/sft": 2.166947841644287, + "losses/total": 0.37371307611465454, + "ref_logps/chosen": -40.8586540222168, + "ref_logps/rejected": -47.34501266479492, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1145060062408447, + "rewards/margins": 1.3178460597991943, + "rewards/rejected": -2.432352304458618, + "step": 283 + }, + { + "epoch": 2.14, + "grad_norm": 7.211631798978324, + "learning_rate": 1.5730337078651685e-07, + "logps/chosen": -45.30519104003906, + "logps/rejected": -67.6802749633789, + "loss": 0.3529, + "losses/dpo": 0.6495727896690369, + "losses/sft": 2.1896119117736816, + "losses/total": 0.6495727896690369, + "ref_logps/chosen": -35.870975494384766, + "ref_logps/rejected": -45.01594924926758, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9434216022491455, + "rewards/margins": 1.3230111598968506, + "rewards/rejected": -2.266432523727417, + "step": 284 + }, + { + "epoch": 2.15, + "grad_norm": 8.73649186021766, + "learning_rate": 1.5589887640449438e-07, + "logps/chosen": -51.24886703491211, + "logps/rejected": -73.5231704711914, + "loss": 0.3706, + "losses/dpo": 0.2774621248245239, + "losses/sft": 2.14704966545105, + "losses/total": 0.2774621248245239, + "ref_logps/chosen": -39.21043395996094, + "ref_logps/rejected": -48.85007858276367, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.2038426399230957, + "rewards/margins": 1.2634668350219727, + "rewards/rejected": -2.4673094749450684, + "step": 285 + }, + { + "epoch": 2.16, + "grad_norm": 9.965422332405613, + "learning_rate": 1.5449438202247188e-07, + "logps/chosen": -56.63559341430664, + "logps/rejected": -69.31437683105469, + "loss": 0.4581, + "losses/dpo": 1.019913673400879, + "losses/sft": 2.6583502292633057, + "losses/total": 1.019913673400879, + "ref_logps/chosen": -42.96681594848633, + "ref_logps/rejected": -44.904361724853516, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.3668776750564575, + "rewards/margins": 1.0741242170333862, + "rewards/rejected": -2.4410018920898438, + "step": 286 + }, + { + "epoch": 2.17, + "grad_norm": 9.04029000539786, + "learning_rate": 1.5308988764044944e-07, + "logps/chosen": -50.918128967285156, + "logps/rejected": -67.66094970703125, + "loss": 0.4491, + "losses/dpo": 0.536713182926178, + "losses/sft": 2.0050907135009766, + "losses/total": 0.536713182926178, + "ref_logps/chosen": -37.77289581298828, + "ref_logps/rejected": -43.736061096191406, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3145227432250977, + "rewards/margins": 1.0779664516448975, + "rewards/rejected": -2.392489194869995, + "step": 287 + }, + { + "epoch": 2.17, + "grad_norm": 7.9909005031619476, + "learning_rate": 1.5168539325842697e-07, + "logps/chosen": -48.0228271484375, + "logps/rejected": -72.348876953125, + "loss": 0.3569, + "losses/dpo": 0.22953173518180847, + "losses/sft": 1.752846360206604, + "losses/total": 0.22953173518180847, + "ref_logps/chosen": -36.91614532470703, + "ref_logps/rejected": -47.88288497924805, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.1106677055358887, + "rewards/margins": 1.3359307050704956, + "rewards/rejected": -2.4465982913970947, + "step": 288 + }, + { + "epoch": 2.18, + "grad_norm": 9.189297221788385, + "learning_rate": 1.502808988764045e-07, + "logps/chosen": -55.16273880004883, + "logps/rejected": -74.13533782958984, + "loss": 0.4081, + "losses/dpo": 0.39626190066337585, + "losses/sft": 2.1484994888305664, + "losses/total": 0.39626190066337585, + "ref_logps/chosen": -42.12677001953125, + "ref_logps/rejected": -48.699642181396484, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -1.3035968542099, + "rewards/margins": 1.2399725914001465, + "rewards/rejected": -2.543569326400757, + "step": 289 + }, + { + "epoch": 2.19, + "grad_norm": 8.223258235820202, + "learning_rate": 1.4887640449438203e-07, + "logps/chosen": -46.8680534362793, + "logps/rejected": -64.84513854980469, + "loss": 0.3998, + "losses/dpo": 0.38319000601768494, + "losses/sft": 2.0748698711395264, + "losses/total": 0.38319000601768494, + "ref_logps/chosen": -35.66654968261719, + "ref_logps/rejected": -42.36066818237305, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.1201505661010742, + "rewards/margins": 1.1282968521118164, + "rewards/rejected": -2.2484474182128906, + "step": 290 + }, + { + "epoch": 2.2, + "grad_norm": 9.322598904199673, + "learning_rate": 1.4747191011235953e-07, + "logps/chosen": -51.94579315185547, + "logps/rejected": -76.23491668701172, + "loss": 0.4365, + "losses/dpo": 0.6639813184738159, + "losses/sft": 3.0463194847106934, + "losses/total": 0.6639813184738159, + "ref_logps/chosen": -37.92943572998047, + "ref_logps/rejected": -49.77875518798828, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -1.4016355276107788, + "rewards/margins": 1.243980884552002, + "rewards/rejected": -2.6456165313720703, + "step": 291 + }, + { + "epoch": 2.2, + "grad_norm": 9.28502050286711, + "learning_rate": 1.4606741573033706e-07, + "logps/chosen": -51.56089782714844, + "logps/rejected": -67.3809814453125, + "loss": 0.4295, + "losses/dpo": 0.2485855668783188, + "losses/sft": 2.5399341583251953, + "losses/total": 0.2485855668783188, + "ref_logps/chosen": -38.323081970214844, + "ref_logps/rejected": -43.60871505737305, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.3237823247909546, + "rewards/margins": 1.0534443855285645, + "rewards/rejected": -2.3772268295288086, + "step": 292 + }, + { + "epoch": 2.21, + "grad_norm": 10.352671653799021, + "learning_rate": 1.446629213483146e-07, + "logps/chosen": -60.71104431152344, + "logps/rejected": -76.54723358154297, + "loss": 0.4205, + "losses/dpo": 0.5032411813735962, + "losses/sft": 2.2452447414398193, + "losses/total": 0.5032411813735962, + "ref_logps/chosen": -45.97325897216797, + "ref_logps/rejected": -49.70647430419922, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -1.4737780094146729, + "rewards/margins": 1.2102973461151123, + "rewards/rejected": -2.684075117111206, + "step": 293 + }, + { + "epoch": 2.22, + "grad_norm": 9.489700991538179, + "learning_rate": 1.4325842696629212e-07, + "logps/chosen": -51.33311462402344, + "logps/rejected": -72.56240844726562, + "loss": 0.4041, + "losses/dpo": 0.39366570115089417, + "losses/sft": 1.5643844604492188, + "losses/total": 0.39366570115089417, + "ref_logps/chosen": -38.85292053222656, + "ref_logps/rejected": -47.41047668457031, + "rewards/accuracies": 0.796875, + "rewards/chosen": -1.2480189800262451, + "rewards/margins": 1.2671747207641602, + "rewards/rejected": -2.5151939392089844, + "step": 294 + }, + { + "epoch": 2.23, + "grad_norm": 8.202282597323313, + "learning_rate": 1.4185393258426968e-07, + "logps/chosen": -51.06114959716797, + "logps/rejected": -74.44361114501953, + "loss": 0.3473, + "losses/dpo": 0.40783169865608215, + "losses/sft": 2.3142831325531006, + "losses/total": 0.40783169865608215, + "ref_logps/chosen": -39.6861457824707, + "ref_logps/rejected": -49.402000427246094, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.1375010013580322, + "rewards/margins": 1.3666609525680542, + "rewards/rejected": -2.504162073135376, + "step": 295 + }, + { + "epoch": 2.23, + "grad_norm": 8.616438775614668, + "learning_rate": 1.4044943820224718e-07, + "logps/chosen": -50.71443557739258, + "logps/rejected": -67.99952697753906, + "loss": 0.4191, + "losses/dpo": 0.3447108566761017, + "losses/sft": 1.6047589778900146, + "losses/total": 0.3447108566761017, + "ref_logps/chosen": -38.159549713134766, + "ref_logps/rejected": -44.2279052734375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2554888725280762, + "rewards/margins": 1.12167227268219, + "rewards/rejected": -2.3771612644195557, + "step": 296 + }, + { + "epoch": 2.24, + "grad_norm": 8.42295456953331, + "learning_rate": 1.3904494382022472e-07, + "logps/chosen": -50.11585235595703, + "logps/rejected": -69.99391174316406, + "loss": 0.4154, + "losses/dpo": 0.6180249452590942, + "losses/sft": 1.9767247438430786, + "losses/total": 0.6180249452590942, + "ref_logps/chosen": -37.855167388916016, + "ref_logps/rejected": -45.18976974487305, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.2260689735412598, + "rewards/margins": 1.2543449401855469, + "rewards/rejected": -2.4804139137268066, + "step": 297 + }, + { + "epoch": 2.25, + "grad_norm": 7.916024207524859, + "learning_rate": 1.3764044943820225e-07, + "logps/chosen": -48.01890563964844, + "logps/rejected": -71.55242919921875, + "loss": 0.3286, + "losses/dpo": 0.39366215467453003, + "losses/sft": 1.6183239221572876, + "losses/total": 0.39366215467453003, + "ref_logps/chosen": -36.65507888793945, + "ref_logps/rejected": -45.11772537231445, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1363829374313354, + "rewards/margins": 1.5070867538452148, + "rewards/rejected": -2.64346981048584, + "step": 298 + }, + { + "epoch": 2.26, + "grad_norm": 8.381587007100304, + "learning_rate": 1.3623595505617978e-07, + "logps/chosen": -48.97580337524414, + "logps/rejected": -71.05843353271484, + "loss": 0.369, + "losses/dpo": 0.4669285714626312, + "losses/sft": 2.4736876487731934, + "losses/total": 0.4669285714626312, + "ref_logps/chosen": -36.59314727783203, + "ref_logps/rejected": -46.38197708129883, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -1.2382655143737793, + "rewards/margins": 1.229379415512085, + "rewards/rejected": -2.4676451683044434, + "step": 299 + }, + { + "epoch": 2.26, + "grad_norm": 7.816178695034966, + "learning_rate": 1.3483146067415728e-07, + "logps/chosen": -48.929466247558594, + "logps/rejected": -70.30461883544922, + "loss": 0.3683, + "losses/dpo": 0.30282700061798096, + "losses/sft": 2.068610906600952, + "losses/total": 0.30282700061798096, + "ref_logps/chosen": -36.281883239746094, + "ref_logps/rejected": -44.1247444152832, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.2647581100463867, + "rewards/margins": 1.3532286882400513, + "rewards/rejected": -2.6179869174957275, + "step": 300 + }, + { + "epoch": 2.27, + "grad_norm": 9.42088429106351, + "learning_rate": 1.334269662921348e-07, + "logps/chosen": -53.99761199951172, + "logps/rejected": -68.16130828857422, + "loss": 0.4154, + "losses/dpo": 0.7334519028663635, + "losses/sft": 2.3190560340881348, + "losses/total": 0.7334519028663635, + "ref_logps/chosen": -41.94767379760742, + "ref_logps/rejected": -43.898292541503906, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -1.2049940824508667, + "rewards/margins": 1.2213077545166016, + "rewards/rejected": -2.4263014793395996, + "step": 301 + }, + { + "epoch": 2.28, + "grad_norm": 8.474867745213691, + "learning_rate": 1.3202247191011234e-07, + "logps/chosen": -56.917606353759766, + "logps/rejected": -78.60893249511719, + "loss": 0.3643, + "losses/dpo": 0.30533695220947266, + "losses/sft": 2.2758193016052246, + "losses/total": 0.30533695220947266, + "ref_logps/chosen": -43.154685974121094, + "ref_logps/rejected": -50.671146392822266, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.3762919902801514, + "rewards/margins": 1.4174861907958984, + "rewards/rejected": -2.793778419494629, + "step": 302 + }, + { + "epoch": 2.29, + "grad_norm": 9.098388851491974, + "learning_rate": 1.306179775280899e-07, + "logps/chosen": -52.541847229003906, + "logps/rejected": -75.27273559570312, + "loss": 0.4022, + "losses/dpo": 0.7690958380699158, + "losses/sft": 2.406214714050293, + "losses/total": 0.7690958380699158, + "ref_logps/chosen": -39.33592224121094, + "ref_logps/rejected": -50.239105224609375, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.3205927610397339, + "rewards/margins": 1.1827703714370728, + "rewards/rejected": -2.5033628940582275, + "step": 303 + }, + { + "epoch": 2.29, + "grad_norm": 7.915112908663982, + "learning_rate": 1.2921348314606743e-07, + "logps/chosen": -52.54154968261719, + "logps/rejected": -79.33431243896484, + "loss": 0.3, + "losses/dpo": 0.22580446302890778, + "losses/sft": 2.1299729347229004, + "losses/total": 0.22580446302890778, + "ref_logps/chosen": -39.62370300292969, + "ref_logps/rejected": -51.30101013183594, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.2917848825454712, + "rewards/margins": 1.511545181274414, + "rewards/rejected": -2.8033299446105957, + "step": 304 + }, + { + "epoch": 2.3, + "grad_norm": 8.60697942251823, + "learning_rate": 1.2780898876404493e-07, + "logps/chosen": -54.95063781738281, + "logps/rejected": -76.17906188964844, + "loss": 0.3787, + "losses/dpo": 0.48840370774269104, + "losses/sft": 2.068924903869629, + "losses/total": 0.48840370774269104, + "ref_logps/chosen": -40.245113372802734, + "ref_logps/rejected": -48.299339294433594, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.4705531597137451, + "rewards/margins": 1.317419409751892, + "rewards/rejected": -2.7879724502563477, + "step": 305 + }, + { + "epoch": 2.31, + "grad_norm": 8.916108799954989, + "learning_rate": 1.2640449438202246e-07, + "logps/chosen": -54.08748245239258, + "logps/rejected": -72.3311767578125, + "loss": 0.384, + "losses/dpo": 0.27609461545944214, + "losses/sft": 2.02748703956604, + "losses/total": 0.27609461545944214, + "ref_logps/chosen": -40.30693817138672, + "ref_logps/rejected": -46.038856506347656, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.3780547380447388, + "rewards/margins": 1.2511768341064453, + "rewards/rejected": -2.6292316913604736, + "step": 306 + }, + { + "epoch": 2.32, + "grad_norm": 9.08799197478477, + "learning_rate": 1.25e-07, + "logps/chosen": -54.119285583496094, + "logps/rejected": -65.3755111694336, + "loss": 0.4074, + "losses/dpo": 0.314796507358551, + "losses/sft": 2.275911808013916, + "losses/total": 0.314796507358551, + "ref_logps/chosen": -40.99217224121094, + "ref_logps/rejected": -41.43158721923828, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.312711477279663, + "rewards/margins": 1.081681251525879, + "rewards/rejected": -2.394392967224121, + "step": 307 + }, + { + "epoch": 2.32, + "grad_norm": 8.103580469459054, + "learning_rate": 1.2359550561797752e-07, + "logps/chosen": -51.23326873779297, + "logps/rejected": -75.66114807128906, + "loss": 0.3713, + "losses/dpo": 0.387349396944046, + "losses/sft": 2.3062949180603027, + "losses/total": 0.387349396944046, + "ref_logps/chosen": -36.80121994018555, + "ref_logps/rejected": -47.23471450805664, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -1.4432051181793213, + "rewards/margins": 1.399438738822937, + "rewards/rejected": -2.842643976211548, + "step": 308 + }, + { + "epoch": 2.33, + "grad_norm": 8.263469946945373, + "learning_rate": 1.2219101123595506e-07, + "logps/chosen": -51.04032897949219, + "logps/rejected": -73.06275177001953, + "loss": 0.3571, + "losses/dpo": 0.4852275252342224, + "losses/sft": 2.0593721866607666, + "losses/total": 0.4852275252342224, + "ref_logps/chosen": -37.46986770629883, + "ref_logps/rejected": -45.448211669921875, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.3570460081100464, + "rewards/margins": 1.4044082164764404, + "rewards/rejected": -2.7614541053771973, + "step": 309 + }, + { + "epoch": 2.34, + "grad_norm": 9.638610818670363, + "learning_rate": 1.2078651685393259e-07, + "logps/chosen": -61.78599166870117, + "logps/rejected": -79.4295883178711, + "loss": 0.3814, + "losses/dpo": 0.3531866669654846, + "losses/sft": 2.5623600482940674, + "losses/total": 0.3531866669654846, + "ref_logps/chosen": -47.63096618652344, + "ref_logps/rejected": -52.177154541015625, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.4155021905899048, + "rewards/margins": 1.3097403049468994, + "rewards/rejected": -2.7252423763275146, + "step": 310 + }, + { + "epoch": 2.35, + "grad_norm": 8.100555853256143, + "learning_rate": 1.1938202247191012e-07, + "logps/chosen": -51.98362350463867, + "logps/rejected": -76.51303100585938, + "loss": 0.3451, + "losses/dpo": 0.25824400782585144, + "losses/sft": 1.8786492347717285, + "losses/total": 0.25824400782585144, + "ref_logps/chosen": -38.72646713256836, + "ref_logps/rejected": -48.84027862548828, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.3257157802581787, + "rewards/margins": 1.4415602684020996, + "rewards/rejected": -2.767275810241699, + "step": 311 + }, + { + "epoch": 2.35, + "grad_norm": 8.895756271806501, + "learning_rate": 1.1797752808988763e-07, + "logps/chosen": -54.543087005615234, + "logps/rejected": -75.70824432373047, + "loss": 0.3937, + "losses/dpo": 0.44249895215034485, + "losses/sft": 2.0751869678497314, + "losses/total": 0.44249895215034485, + "ref_logps/chosen": -40.311073303222656, + "ref_logps/rejected": -47.72747802734375, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -1.423201322555542, + "rewards/margins": 1.3748749494552612, + "rewards/rejected": -2.7980761528015137, + "step": 312 + }, + { + "epoch": 2.36, + "grad_norm": 9.181986369222448, + "learning_rate": 1.1657303370786515e-07, + "logps/chosen": -53.058631896972656, + "logps/rejected": -76.43999481201172, + "loss": 0.3426, + "losses/dpo": 0.37313902378082275, + "losses/sft": 1.9281624555587769, + "losses/total": 0.37313902378082275, + "ref_logps/chosen": -38.72821044921875, + "ref_logps/rejected": -47.35258483886719, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -1.433042049407959, + "rewards/margins": 1.4756982326507568, + "rewards/rejected": -2.908740282058716, + "step": 313 + }, + { + "epoch": 2.37, + "grad_norm": 10.667806756150233, + "learning_rate": 1.151685393258427e-07, + "logps/chosen": -55.91019058227539, + "logps/rejected": -73.06080627441406, + "loss": 0.4539, + "losses/dpo": 0.27354708313941956, + "losses/sft": 2.2720413208007812, + "losses/total": 0.27354708313941956, + "ref_logps/chosen": -40.70777893066406, + "ref_logps/rejected": -46.47440719604492, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -1.520241141319275, + "rewards/margins": 1.138399600982666, + "rewards/rejected": -2.6586403846740723, + "step": 314 + }, + { + "epoch": 2.38, + "grad_norm": 9.237157441443594, + "learning_rate": 1.1376404494382023e-07, + "logps/chosen": -52.99339294433594, + "logps/rejected": -77.64442443847656, + "loss": 0.413, + "losses/dpo": 0.5332150459289551, + "losses/sft": 2.2471044063568115, + "losses/total": 0.5332150459289551, + "ref_logps/chosen": -38.779422760009766, + "ref_logps/rejected": -49.12788772583008, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -1.4213968515396118, + "rewards/margins": 1.4302568435668945, + "rewards/rejected": -2.851653814315796, + "step": 315 + }, + { + "epoch": 2.38, + "grad_norm": 10.26818606773468, + "learning_rate": 1.1235955056179774e-07, + "logps/chosen": -54.45465087890625, + "logps/rejected": -77.27416229248047, + "loss": 0.3916, + "losses/dpo": 0.3256514072418213, + "losses/sft": 2.224207878112793, + "losses/total": 0.3256514072418213, + "ref_logps/chosen": -39.11296463012695, + "ref_logps/rejected": -47.37888717651367, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.5341691970825195, + "rewards/margins": 1.4553582668304443, + "rewards/rejected": -2.9895272254943848, + "step": 316 + }, + { + "epoch": 2.39, + "grad_norm": 8.908115110424331, + "learning_rate": 1.1095505617977527e-07, + "logps/chosen": -52.21783447265625, + "logps/rejected": -74.33990478515625, + "loss": 0.3491, + "losses/dpo": 0.4351283013820648, + "losses/sft": 2.3193869590759277, + "losses/total": 0.4351283013820648, + "ref_logps/chosen": -38.05769348144531, + "ref_logps/rejected": -46.181705474853516, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.4160147905349731, + "rewards/margins": 1.3998043537139893, + "rewards/rejected": -2.815819263458252, + "step": 317 + }, + { + "epoch": 2.4, + "grad_norm": 9.20983406154459, + "learning_rate": 1.095505617977528e-07, + "logps/chosen": -51.61767578125, + "logps/rejected": -72.1242446899414, + "loss": 0.4144, + "losses/dpo": 0.21413980424404144, + "losses/sft": 1.885907530784607, + "losses/total": 0.21413980424404144, + "ref_logps/chosen": -37.064613342285156, + "ref_logps/rejected": -44.01332473754883, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -1.4553061723709106, + "rewards/margins": 1.355785608291626, + "rewards/rejected": -2.811091899871826, + "step": 318 + }, + { + "epoch": 2.41, + "grad_norm": 9.683467779206334, + "learning_rate": 1.0814606741573033e-07, + "logps/chosen": -52.75745391845703, + "logps/rejected": -68.34822082519531, + "loss": 0.4051, + "losses/dpo": 0.5242694020271301, + "losses/sft": 1.8490060567855835, + "losses/total": 0.5242694020271301, + "ref_logps/chosen": -38.73335266113281, + "ref_logps/rejected": -42.334041595458984, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.4024099111557007, + "rewards/margins": 1.1990087032318115, + "rewards/rejected": -2.6014187335968018, + "step": 319 + }, + { + "epoch": 2.42, + "grad_norm": 10.094879725137059, + "learning_rate": 1.0674157303370785e-07, + "logps/chosen": -55.7940788269043, + "logps/rejected": -71.7145767211914, + "loss": 0.4083, + "losses/dpo": 0.33178359270095825, + "losses/sft": 2.2600364685058594, + "losses/total": 0.33178359270095825, + "ref_logps/chosen": -41.903411865234375, + "ref_logps/rejected": -45.71622848510742, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.389066457748413, + "rewards/margins": 1.2107690572738647, + "rewards/rejected": -2.5998356342315674, + "step": 320 + }, + { + "epoch": 2.42, + "grad_norm": 8.737981073393495, + "learning_rate": 1.0533707865168538e-07, + "logps/chosen": -52.43000793457031, + "logps/rejected": -70.14034271240234, + "loss": 0.4104, + "losses/dpo": 0.4298698902130127, + "losses/sft": 1.832787036895752, + "losses/total": 0.4298698902130127, + "ref_logps/chosen": -38.05841064453125, + "ref_logps/rejected": -43.76458740234375, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.4371598958969116, + "rewards/margins": 1.2004159688949585, + "rewards/rejected": -2.63757586479187, + "step": 321 + }, + { + "epoch": 2.43, + "grad_norm": 8.489155654631809, + "learning_rate": 1.0393258426966293e-07, + "logps/chosen": -54.25529479980469, + "logps/rejected": -76.93711853027344, + "loss": 0.3281, + "losses/dpo": 0.2001137137413025, + "losses/sft": 1.605088710784912, + "losses/total": 0.2001137137413025, + "ref_logps/chosen": -41.82042694091797, + "ref_logps/rejected": -49.489280700683594, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.2434866428375244, + "rewards/margins": 1.5012969970703125, + "rewards/rejected": -2.744783401489258, + "step": 322 + }, + { + "epoch": 2.44, + "grad_norm": 8.976932361180113, + "learning_rate": 1.0252808988764044e-07, + "logps/chosen": -51.483985900878906, + "logps/rejected": -75.30467224121094, + "loss": 0.3253, + "losses/dpo": 0.28753212094306946, + "losses/sft": 2.173304557800293, + "losses/total": 0.28753212094306946, + "ref_logps/chosen": -38.227813720703125, + "ref_logps/rejected": -47.102657318115234, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.3256170749664307, + "rewards/margins": 1.4945844411849976, + "rewards/rejected": -2.8202013969421387, + "step": 323 + }, + { + "epoch": 2.45, + "grad_norm": 7.593474701084862, + "learning_rate": 1.0112359550561797e-07, + "logps/chosen": -47.74129867553711, + "logps/rejected": -69.7999267578125, + "loss": 0.3669, + "losses/dpo": 0.43520650267601013, + "losses/sft": 1.8445793390274048, + "losses/total": 0.43520650267601013, + "ref_logps/chosen": -35.14937210083008, + "ref_logps/rejected": -43.280242919921875, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.259192943572998, + "rewards/margins": 1.3927757740020752, + "rewards/rejected": -2.6519687175750732, + "step": 324 + }, + { + "epoch": 2.45, + "grad_norm": 9.113140126947922, + "learning_rate": 9.971910112359549e-08, + "logps/chosen": -51.48912811279297, + "logps/rejected": -75.93363189697266, + "loss": 0.3625, + "losses/dpo": 0.271272748708725, + "losses/sft": 2.1029720306396484, + "losses/total": 0.271272748708725, + "ref_logps/chosen": -38.47105026245117, + "ref_logps/rejected": -48.987518310546875, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.3018079996109009, + "rewards/margins": 1.3928041458129883, + "rewards/rejected": -2.6946120262145996, + "step": 325 + }, + { + "epoch": 2.46, + "grad_norm": 9.203607647069793, + "learning_rate": 9.831460674157303e-08, + "logps/chosen": -56.46796417236328, + "logps/rejected": -72.37566375732422, + "loss": 0.3758, + "losses/dpo": 0.27879202365875244, + "losses/sft": 1.8894522190093994, + "losses/total": 0.27879202365875244, + "ref_logps/chosen": -43.5599365234375, + "ref_logps/rejected": -46.51646423339844, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.2908027172088623, + "rewards/margins": 1.295116901397705, + "rewards/rejected": -2.5859196186065674, + "step": 326 + }, + { + "epoch": 2.47, + "grad_norm": 8.9943305703751, + "learning_rate": 9.691011235955055e-08, + "logps/chosen": -55.962684631347656, + "logps/rejected": -77.36865997314453, + "loss": 0.383, + "losses/dpo": 0.2792072296142578, + "losses/sft": 1.8898770809173584, + "losses/total": 0.2792072296142578, + "ref_logps/chosen": -41.8868522644043, + "ref_logps/rejected": -48.986183166503906, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.4075829982757568, + "rewards/margins": 1.430665373802185, + "rewards/rejected": -2.8382484912872314, + "step": 327 + }, + { + "epoch": 2.48, + "grad_norm": 9.502289617698672, + "learning_rate": 9.550561797752808e-08, + "logps/chosen": -50.20685577392578, + "logps/rejected": -66.51139831542969, + "loss": 0.441, + "losses/dpo": 0.2940795123577118, + "losses/sft": 2.298060894012451, + "losses/total": 0.2940795123577118, + "ref_logps/chosen": -37.659244537353516, + "ref_logps/rejected": -42.670841217041016, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -1.254760980606079, + "rewards/margins": 1.129294514656067, + "rewards/rejected": -2.3840553760528564, + "step": 328 + }, + { + "epoch": 2.48, + "grad_norm": 9.657263439264016, + "learning_rate": 9.410112359550561e-08, + "logps/chosen": -54.32592010498047, + "logps/rejected": -69.8943862915039, + "loss": 0.4163, + "losses/dpo": 0.6257603764533997, + "losses/sft": 2.595241069793701, + "losses/total": 0.6257603764533997, + "ref_logps/chosen": -40.53725814819336, + "ref_logps/rejected": -43.76425552368164, + "rewards/accuracies": 0.796875, + "rewards/chosen": -1.37886643409729, + "rewards/margins": 1.2341458797454834, + "rewards/rejected": -2.6130123138427734, + "step": 329 + }, + { + "epoch": 2.49, + "grad_norm": 9.274122078417514, + "learning_rate": 9.269662921348314e-08, + "logps/chosen": -54.48114013671875, + "logps/rejected": -74.62823486328125, + "loss": 0.3801, + "losses/dpo": 0.46192625164985657, + "losses/sft": 2.048821449279785, + "losses/total": 0.46192625164985657, + "ref_logps/chosen": -40.628318786621094, + "ref_logps/rejected": -47.394065856933594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.385282278060913, + "rewards/margins": 1.3381340503692627, + "rewards/rejected": -2.7234160900115967, + "step": 330 + }, + { + "epoch": 2.5, + "grad_norm": 11.530277636718996, + "learning_rate": 9.129213483146067e-08, + "logps/chosen": -51.81992721557617, + "logps/rejected": -78.34001922607422, + "loss": 0.3378, + "losses/dpo": 0.36353716254234314, + "losses/sft": 2.3431830406188965, + "losses/total": 0.36353716254234314, + "ref_logps/chosen": -38.79859161376953, + "ref_logps/rejected": -50.49066925048828, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.302133560180664, + "rewards/margins": 1.4828013181686401, + "rewards/rejected": -2.7849345207214355, + "step": 331 + }, + { + "epoch": 2.51, + "grad_norm": 9.663796087127917, + "learning_rate": 8.988764044943819e-08, + "logps/chosen": -56.09492492675781, + "logps/rejected": -77.04325866699219, + "loss": 0.3814, + "losses/dpo": 0.25816428661346436, + "losses/sft": 2.8091163635253906, + "losses/total": 0.25816428661346436, + "ref_logps/chosen": -40.704925537109375, + "ref_logps/rejected": -47.98851013183594, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -1.5390002727508545, + "rewards/margins": 1.3664746284484863, + "rewards/rejected": -2.90547513961792, + "step": 332 + }, + { + "epoch": 2.51, + "grad_norm": 9.817894775320998, + "learning_rate": 8.848314606741572e-08, + "logps/chosen": -53.74916076660156, + "logps/rejected": -71.23921966552734, + "loss": 0.4129, + "losses/dpo": 0.5282669067382812, + "losses/sft": 2.027956962585449, + "losses/total": 0.5282669067382812, + "ref_logps/chosen": -40.569034576416016, + "ref_logps/rejected": -45.03144073486328, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.3180131912231445, + "rewards/margins": 1.302764654159546, + "rewards/rejected": -2.6207778453826904, + "step": 333 + }, + { + "epoch": 2.52, + "grad_norm": 9.915518749588111, + "learning_rate": 8.707865168539325e-08, + "logps/chosen": -53.48023986816406, + "logps/rejected": -72.4287109375, + "loss": 0.4231, + "losses/dpo": 0.4929217994213104, + "losses/sft": 2.577164888381958, + "losses/total": 0.4929217994213104, + "ref_logps/chosen": -39.79558563232422, + "ref_logps/rejected": -45.82670593261719, + "rewards/accuracies": 0.796875, + "rewards/chosen": -1.3684654235839844, + "rewards/margins": 1.291735291481018, + "rewards/rejected": -2.660200595855713, + "step": 334 + }, + { + "epoch": 2.53, + "grad_norm": 9.335165389726255, + "learning_rate": 8.567415730337078e-08, + "logps/chosen": -52.41722106933594, + "logps/rejected": -71.85494995117188, + "loss": 0.3694, + "losses/dpo": 0.317619651556015, + "losses/sft": 2.0792832374572754, + "losses/total": 0.317619651556015, + "ref_logps/chosen": -39.57048416137695, + "ref_logps/rejected": -46.20240783691406, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.2846734523773193, + "rewards/margins": 1.280580997467041, + "rewards/rejected": -2.5652544498443604, + "step": 335 + }, + { + "epoch": 2.54, + "grad_norm": 9.185058383312379, + "learning_rate": 8.426966292134831e-08, + "logps/chosen": -56.19029235839844, + "logps/rejected": -80.49638366699219, + "loss": 0.3411, + "losses/dpo": 0.2321043312549591, + "losses/sft": 1.5318742990493774, + "losses/total": 0.2321043312549591, + "ref_logps/chosen": -41.53580856323242, + "ref_logps/rejected": -50.73744201660156, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.4654479026794434, + "rewards/margins": 1.5104467868804932, + "rewards/rejected": -2.9758946895599365, + "step": 336 + }, + { + "epoch": 2.54, + "grad_norm": 8.541745993770446, + "learning_rate": 8.286516853932583e-08, + "logps/chosen": -49.73480987548828, + "logps/rejected": -73.88976287841797, + "loss": 0.3512, + "losses/dpo": 0.2616669237613678, + "losses/sft": 1.7109529972076416, + "losses/total": 0.2616669237613678, + "ref_logps/chosen": -37.30692672729492, + "ref_logps/rejected": -46.78838348388672, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.2427881956100464, + "rewards/margins": 1.4673501253128052, + "rewards/rejected": -2.7101383209228516, + "step": 337 + }, + { + "epoch": 2.55, + "grad_norm": 10.42078485613911, + "learning_rate": 8.146067415730337e-08, + "logps/chosen": -52.26924514770508, + "logps/rejected": -67.19551086425781, + "loss": 0.4575, + "losses/dpo": 0.4895854890346527, + "losses/sft": 2.276334762573242, + "losses/total": 0.4895854890346527, + "ref_logps/chosen": -39.17838668823242, + "ref_logps/rejected": -43.06511688232422, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -1.3090859651565552, + "rewards/margins": 1.1039537191390991, + "rewards/rejected": -2.4130399227142334, + "step": 338 + }, + { + "epoch": 2.56, + "grad_norm": 10.673884752576804, + "learning_rate": 8.005617977528089e-08, + "logps/chosen": -54.1285514831543, + "logps/rejected": -66.32559967041016, + "loss": 0.4616, + "losses/dpo": 0.3069703280925751, + "losses/sft": 1.7043497562408447, + "losses/total": 0.3069703280925751, + "ref_logps/chosen": -39.07845687866211, + "ref_logps/rejected": -40.795433044433594, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -1.5050091743469238, + "rewards/margins": 1.0480072498321533, + "rewards/rejected": -2.5530166625976562, + "step": 339 + }, + { + "epoch": 2.57, + "grad_norm": 7.734430324176608, + "learning_rate": 7.865168539325842e-08, + "logps/chosen": -51.84765625, + "logps/rejected": -73.76638793945312, + "loss": 0.343, + "losses/dpo": 0.21601220965385437, + "losses/sft": 1.731180191040039, + "losses/total": 0.21601220965385437, + "ref_logps/chosen": -38.94496154785156, + "ref_logps/rejected": -47.23394012451172, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -1.2902700901031494, + "rewards/margins": 1.3629752397537231, + "rewards/rejected": -2.653244972229004, + "step": 340 + }, + { + "epoch": 2.57, + "grad_norm": 9.718523923824968, + "learning_rate": 7.724719101123594e-08, + "logps/chosen": -54.3284797668457, + "logps/rejected": -74.42861938476562, + "loss": 0.4342, + "losses/dpo": 0.5424623489379883, + "losses/sft": 2.519442081451416, + "losses/total": 0.5424623489379883, + "ref_logps/chosen": -40.32830047607422, + "ref_logps/rejected": -47.90203094482422, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4000180959701538, + "rewards/margins": 1.2526406049728394, + "rewards/rejected": -2.652658700942993, + "step": 341 + }, + { + "epoch": 2.58, + "grad_norm": 8.766478003038216, + "learning_rate": 7.584269662921348e-08, + "logps/chosen": -56.134185791015625, + "logps/rejected": -74.1839370727539, + "loss": 0.3627, + "losses/dpo": 0.2985873520374298, + "losses/sft": 2.3777570724487305, + "losses/total": 0.2985873520374298, + "ref_logps/chosen": -40.96846008300781, + "ref_logps/rejected": -45.8743896484375, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.5165728330612183, + "rewards/margins": 1.3143821954727173, + "rewards/rejected": -2.8309547901153564, + "step": 342 + }, + { + "epoch": 2.59, + "grad_norm": 10.011243908821191, + "learning_rate": 7.443820224719101e-08, + "logps/chosen": -51.62477493286133, + "logps/rejected": -70.33786010742188, + "loss": 0.4342, + "losses/dpo": 0.337339848279953, + "losses/sft": 2.341553211212158, + "losses/total": 0.337339848279953, + "ref_logps/chosen": -37.967830657958984, + "ref_logps/rejected": -45.28611755371094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.365694522857666, + "rewards/margins": 1.1394801139831543, + "rewards/rejected": -2.5051746368408203, + "step": 343 + }, + { + "epoch": 2.6, + "grad_norm": 9.59771843166304, + "learning_rate": 7.303370786516853e-08, + "logps/chosen": -51.54905319213867, + "logps/rejected": -71.37974548339844, + "loss": 0.4085, + "losses/dpo": 0.39244934916496277, + "losses/sft": 1.864844799041748, + "losses/total": 0.39244934916496277, + "ref_logps/chosen": -38.66598129272461, + "ref_logps/rejected": -45.7724609375, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.2883074283599854, + "rewards/margins": 1.272420883178711, + "rewards/rejected": -2.5607285499572754, + "step": 344 + }, + { + "epoch": 2.6, + "grad_norm": 8.700784807594031, + "learning_rate": 7.162921348314606e-08, + "logps/chosen": -56.775753021240234, + "logps/rejected": -77.216796875, + "loss": 0.3416, + "losses/dpo": 0.22181375324726105, + "losses/sft": 2.4348971843719482, + "losses/total": 0.22181375324726105, + "ref_logps/chosen": -42.833614349365234, + "ref_logps/rejected": -49.19655227661133, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -1.3942136764526367, + "rewards/margins": 1.4078103303909302, + "rewards/rejected": -2.8020238876342773, + "step": 345 + }, + { + "epoch": 2.61, + "grad_norm": 9.30159664736646, + "learning_rate": 7.022471910112359e-08, + "logps/chosen": -48.35330581665039, + "logps/rejected": -68.90961456298828, + "loss": 0.4087, + "losses/dpo": 0.43862560391426086, + "losses/sft": 1.861382007598877, + "losses/total": 0.43862560391426086, + "ref_logps/chosen": -35.64689254760742, + "ref_logps/rejected": -43.799800872802734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.270641803741455, + "rewards/margins": 1.240339756011963, + "rewards/rejected": -2.510981559753418, + "step": 346 + }, + { + "epoch": 2.62, + "grad_norm": 9.466845106547478, + "learning_rate": 6.882022471910112e-08, + "logps/chosen": -52.57171630859375, + "logps/rejected": -66.74671173095703, + "loss": 0.3995, + "losses/dpo": 0.5215581655502319, + "losses/sft": 2.0002975463867188, + "losses/total": 0.5215581655502319, + "ref_logps/chosen": -39.363014221191406, + "ref_logps/rejected": -41.616329193115234, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.3208706378936768, + "rewards/margins": 1.1921679973602295, + "rewards/rejected": -2.5130386352539062, + "step": 347 + }, + { + "epoch": 2.63, + "grad_norm": 8.897974125441753, + "learning_rate": 6.741573033707864e-08, + "logps/chosen": -54.96879577636719, + "logps/rejected": -71.28080749511719, + "loss": 0.4172, + "losses/dpo": 0.6290773749351501, + "losses/sft": 2.65497088432312, + "losses/total": 0.6290773749351501, + "ref_logps/chosen": -41.37034225463867, + "ref_logps/rejected": -45.2210578918457, + "rewards/accuracies": 0.796875, + "rewards/chosen": -1.3598453998565674, + "rewards/margins": 1.2461297512054443, + "rewards/rejected": -2.605975389480591, + "step": 348 + }, + { + "epoch": 2.63, + "grad_norm": 8.450179202589874, + "learning_rate": 6.601123595505617e-08, + "logps/chosen": -56.42803192138672, + "logps/rejected": -78.08199310302734, + "loss": 0.3279, + "losses/dpo": 0.2672095000743866, + "losses/sft": 1.7004587650299072, + "losses/total": 0.2672095000743866, + "ref_logps/chosen": -43.05862808227539, + "ref_logps/rejected": -49.85722732543945, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -1.3369402885437012, + "rewards/margins": 1.4855366945266724, + "rewards/rejected": -2.822477102279663, + "step": 349 + }, + { + "epoch": 2.64, + "grad_norm": 9.946490252779716, + "learning_rate": 6.460674157303371e-08, + "logps/chosen": -52.30256652832031, + "logps/rejected": -67.33949279785156, + "loss": 0.4179, + "losses/dpo": 0.23541654646396637, + "losses/sft": 1.6304823160171509, + "losses/total": 0.23541654646396637, + "ref_logps/chosen": -39.795745849609375, + "ref_logps/rejected": -43.05610275268555, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.2506815195083618, + "rewards/margins": 1.177657961845398, + "rewards/rejected": -2.4283392429351807, + "step": 350 + }, + { + "epoch": 2.65, + "grad_norm": 8.420756732795637, + "learning_rate": 6.320224719101123e-08, + "logps/chosen": -50.62635040283203, + "logps/rejected": -71.55708312988281, + "loss": 0.3839, + "losses/dpo": 0.3438160717487335, + "losses/sft": 1.8534799814224243, + "losses/total": 0.3438160717487335, + "ref_logps/chosen": -39.325897216796875, + "ref_logps/rejected": -46.86465835571289, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -1.1300455331802368, + "rewards/margins": 1.3391977548599243, + "rewards/rejected": -2.469243049621582, + "step": 351 + }, + { + "epoch": 2.66, + "grad_norm": 9.697968850403186, + "learning_rate": 6.179775280898876e-08, + "logps/chosen": -54.621578216552734, + "logps/rejected": -70.27922058105469, + "loss": 0.4183, + "losses/dpo": 0.35120806097984314, + "losses/sft": 2.030266284942627, + "losses/total": 0.35120806097984314, + "ref_logps/chosen": -41.76460266113281, + "ref_logps/rejected": -45.33925247192383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2856972217559814, + "rewards/margins": 1.2083001136779785, + "rewards/rejected": -2.49399733543396, + "step": 352 + }, + { + "epoch": 2.66, + "grad_norm": 9.120026907057964, + "learning_rate": 6.039325842696629e-08, + "logps/chosen": -52.02517318725586, + "logps/rejected": -74.53661346435547, + "loss": 0.409, + "losses/dpo": 0.3873208463191986, + "losses/sft": 1.7444610595703125, + "losses/total": 0.3873208463191986, + "ref_logps/chosen": -38.21184539794922, + "ref_logps/rejected": -49.15116882324219, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.3813323974609375, + "rewards/margins": 1.157212495803833, + "rewards/rejected": -2.5385448932647705, + "step": 353 + }, + { + "epoch": 2.67, + "grad_norm": 9.69567118291811, + "learning_rate": 5.898876404494382e-08, + "logps/chosen": -52.73221969604492, + "logps/rejected": -70.13288879394531, + "loss": 0.4226, + "losses/dpo": 0.3050675392150879, + "losses/sft": 1.7437413930892944, + "losses/total": 0.3050675392150879, + "ref_logps/chosen": -40.23515701293945, + "ref_logps/rejected": -45.879547119140625, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -1.2497066259384155, + "rewards/margins": 1.1756272315979004, + "rewards/rejected": -2.4253337383270264, + "step": 354 + }, + { + "epoch": 2.68, + "grad_norm": 8.584053557094956, + "learning_rate": 5.758426966292135e-08, + "logps/chosen": -57.64381408691406, + "logps/rejected": -72.0084457397461, + "loss": 0.3835, + "losses/dpo": 0.40820345282554626, + "losses/sft": 2.4096083641052246, + "losses/total": 0.40820345282554626, + "ref_logps/chosen": -43.7611198425293, + "ref_logps/rejected": -46.36372375488281, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.3882694244384766, + "rewards/margins": 1.176202654838562, + "rewards/rejected": -2.564471960067749, + "step": 355 + }, + { + "epoch": 2.69, + "grad_norm": 8.221246825721817, + "learning_rate": 5.617977528089887e-08, + "logps/chosen": -46.82723617553711, + "logps/rejected": -68.99041748046875, + "loss": 0.3539, + "losses/dpo": 0.29153013229370117, + "losses/sft": 1.427022099494934, + "losses/total": 0.29153013229370117, + "ref_logps/chosen": -35.502262115478516, + "ref_logps/rejected": -43.46721649169922, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.132498025894165, + "rewards/margins": 1.419821858406067, + "rewards/rejected": -2.5523197650909424, + "step": 356 + }, + { + "epoch": 2.69, + "grad_norm": 9.482797590566683, + "learning_rate": 5.47752808988764e-08, + "logps/chosen": -51.7399787902832, + "logps/rejected": -71.89605712890625, + "loss": 0.3979, + "losses/dpo": 0.21694956719875336, + "losses/sft": 1.9680830240249634, + "losses/total": 0.21694956719875336, + "ref_logps/chosen": -38.23194122314453, + "ref_logps/rejected": -45.57975769042969, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.350803256034851, + "rewards/margins": 1.2808265686035156, + "rewards/rejected": -2.631629705429077, + "step": 357 + }, + { + "epoch": 2.7, + "grad_norm": 8.17039044420366, + "learning_rate": 5.3370786516853926e-08, + "logps/chosen": -52.407249450683594, + "logps/rejected": -71.58457946777344, + "loss": 0.3517, + "losses/dpo": 0.45670050382614136, + "losses/sft": 2.3598852157592773, + "losses/total": 0.45670050382614136, + "ref_logps/chosen": -40.96891784667969, + "ref_logps/rejected": -45.695167541503906, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -1.143832802772522, + "rewards/margins": 1.4451087713241577, + "rewards/rejected": -2.5889415740966797, + "step": 358 + }, + { + "epoch": 2.71, + "grad_norm": 7.803291186480779, + "learning_rate": 5.196629213483146e-08, + "logps/chosen": -47.12774658203125, + "logps/rejected": -70.28164672851562, + "loss": 0.331, + "losses/dpo": 0.3020516037940979, + "losses/sft": 1.725950837135315, + "losses/total": 0.3020516037940979, + "ref_logps/chosen": -36.658851623535156, + "ref_logps/rejected": -44.756195068359375, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.0468891859054565, + "rewards/margins": 1.5056557655334473, + "rewards/rejected": -2.5525450706481934, + "step": 359 + }, + { + "epoch": 2.72, + "grad_norm": 8.941952443820963, + "learning_rate": 5.056179775280899e-08, + "logps/chosen": -51.66205596923828, + "logps/rejected": -69.529296875, + "loss": 0.3878, + "losses/dpo": 0.4756266474723816, + "losses/sft": 1.6768076419830322, + "losses/total": 0.4756266474723816, + "ref_logps/chosen": -39.167083740234375, + "ref_logps/rejected": -45.206932067871094, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.2494975328445435, + "rewards/margins": 1.1827386617660522, + "rewards/rejected": -2.4322359561920166, + "step": 360 + }, + { + "epoch": 2.72, + "grad_norm": 9.404438765920613, + "learning_rate": 4.915730337078652e-08, + "logps/chosen": -54.10792922973633, + "logps/rejected": -70.66584014892578, + "loss": 0.385, + "losses/dpo": 0.25079599022865295, + "losses/sft": 2.628451108932495, + "losses/total": 0.25079599022865295, + "ref_logps/chosen": -41.44969177246094, + "ref_logps/rejected": -44.49009704589844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2658233642578125, + "rewards/margins": 1.3517518043518066, + "rewards/rejected": -2.6175754070281982, + "step": 361 + }, + { + "epoch": 2.73, + "grad_norm": 8.339761715598929, + "learning_rate": 4.775280898876404e-08, + "logps/chosen": -52.00060272216797, + "logps/rejected": -70.13935852050781, + "loss": 0.3657, + "losses/dpo": 0.410220742225647, + "losses/sft": 2.064330577850342, + "losses/total": 0.410220742225647, + "ref_logps/chosen": -38.8874626159668, + "ref_logps/rejected": -44.29195785522461, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.311313509941101, + "rewards/margins": 1.27342689037323, + "rewards/rejected": -2.584740400314331, + "step": 362 + }, + { + "epoch": 2.74, + "grad_norm": 8.695799719443274, + "learning_rate": 4.634831460674157e-08, + "logps/chosen": -54.19892120361328, + "logps/rejected": -70.33839416503906, + "loss": 0.3834, + "losses/dpo": 0.4246940612792969, + "losses/sft": 1.6766891479492188, + "losses/total": 0.4246940612792969, + "ref_logps/chosen": -40.98695755004883, + "ref_logps/rejected": -44.57014846801758, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -1.3211965560913086, + "rewards/margins": 1.255626916885376, + "rewards/rejected": -2.5768234729766846, + "step": 363 + }, + { + "epoch": 2.75, + "grad_norm": 7.953708445192091, + "learning_rate": 4.4943820224719096e-08, + "logps/chosen": -51.214752197265625, + "logps/rejected": -75.3336181640625, + "loss": 0.3295, + "losses/dpo": 0.17288488149642944, + "losses/sft": 2.220893383026123, + "losses/total": 0.17288488149642944, + "ref_logps/chosen": -38.46108627319336, + "ref_logps/rejected": -48.44053268432617, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -1.2753666639328003, + "rewards/margins": 1.4139418601989746, + "rewards/rejected": -2.6893081665039062, + "step": 364 + }, + { + "epoch": 2.75, + "grad_norm": 8.821196988330435, + "learning_rate": 4.3539325842696626e-08, + "logps/chosen": -56.51776123046875, + "logps/rejected": -75.40132904052734, + "loss": 0.35, + "losses/dpo": 0.169864684343338, + "losses/sft": 2.520303964614868, + "losses/total": 0.169864684343338, + "ref_logps/chosen": -43.79001235961914, + "ref_logps/rejected": -48.247989654541016, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -1.2727751731872559, + "rewards/margins": 1.4425586462020874, + "rewards/rejected": -2.715333938598633, + "step": 365 + }, + { + "epoch": 2.76, + "grad_norm": 9.689618011799487, + "learning_rate": 4.213483146067416e-08, + "logps/chosen": -57.19207000732422, + "logps/rejected": -72.71266174316406, + "loss": 0.407, + "losses/dpo": 0.27488580346107483, + "losses/sft": 1.8573498725891113, + "losses/total": 0.27488580346107483, + "ref_logps/chosen": -43.24185562133789, + "ref_logps/rejected": -46.41039276123047, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.3950214385986328, + "rewards/margins": 1.2352051734924316, + "rewards/rejected": -2.6302266120910645, + "step": 366 + }, + { + "epoch": 2.77, + "grad_norm": 9.338062839876327, + "learning_rate": 4.073033707865169e-08, + "logps/chosen": -50.966712951660156, + "logps/rejected": -68.7747802734375, + "loss": 0.4169, + "losses/dpo": 0.3666359782218933, + "losses/sft": 2.0023789405822754, + "losses/total": 0.3666359782218933, + "ref_logps/chosen": -38.069068908691406, + "ref_logps/rejected": -44.10002899169922, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -1.2897647619247437, + "rewards/margins": 1.1777102947235107, + "rewards/rejected": -2.467475175857544, + "step": 367 + }, + { + "epoch": 2.78, + "grad_norm": 7.949833391744097, + "learning_rate": 3.932584269662921e-08, + "logps/chosen": -47.72471237182617, + "logps/rejected": -70.84202575683594, + "loss": 0.3803, + "losses/dpo": 0.3086835443973541, + "losses/sft": 1.9907643795013428, + "losses/total": 0.3086835443973541, + "ref_logps/chosen": -35.20145034790039, + "ref_logps/rejected": -44.49205780029297, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.2523258924484253, + "rewards/margins": 1.3826706409454346, + "rewards/rejected": -2.6349964141845703, + "step": 368 + }, + { + "epoch": 2.78, + "grad_norm": 9.235066572868517, + "learning_rate": 3.792134831460674e-08, + "logps/chosen": -52.30189895629883, + "logps/rejected": -70.8028335571289, + "loss": 0.3852, + "losses/dpo": 0.2891031503677368, + "losses/sft": 1.8405730724334717, + "losses/total": 0.2891031503677368, + "ref_logps/chosen": -39.477725982666016, + "ref_logps/rejected": -45.31201934814453, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.2824174165725708, + "rewards/margins": 1.266663908958435, + "rewards/rejected": -2.549081563949585, + "step": 369 + }, + { + "epoch": 2.79, + "grad_norm": 9.190353581688715, + "learning_rate": 3.6516853932584266e-08, + "logps/chosen": -49.18308639526367, + "logps/rejected": -67.69566345214844, + "loss": 0.4, + "losses/dpo": 0.3436277508735657, + "losses/sft": 2.0218303203582764, + "losses/total": 0.3436277508735657, + "ref_logps/chosen": -36.68975830078125, + "ref_logps/rejected": -43.71223449707031, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.2493327856063843, + "rewards/margins": 1.1490094661712646, + "rewards/rejected": -2.3983423709869385, + "step": 370 + }, + { + "epoch": 2.8, + "grad_norm": 8.148479571122722, + "learning_rate": 3.5112359550561796e-08, + "logps/chosen": -51.570220947265625, + "logps/rejected": -71.7831039428711, + "loss": 0.3307, + "losses/dpo": 0.5651198625564575, + "losses/sft": 2.007855176925659, + "losses/total": 0.5651198625564575, + "ref_logps/chosen": -40.369407653808594, + "ref_logps/rejected": -46.21443176269531, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1200807094573975, + "rewards/margins": 1.4367868900299072, + "rewards/rejected": -2.5568673610687256, + "step": 371 + }, + { + "epoch": 2.81, + "grad_norm": 9.663237502055296, + "learning_rate": 3.370786516853932e-08, + "logps/chosen": -55.40485382080078, + "logps/rejected": -73.81289672851562, + "loss": 0.4029, + "losses/dpo": 0.7689430713653564, + "losses/sft": 1.9435756206512451, + "losses/total": 0.7689430713653564, + "ref_logps/chosen": -42.52079772949219, + "ref_logps/rejected": -47.747249603271484, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2884055376052856, + "rewards/margins": 1.3181602954864502, + "rewards/rejected": -2.6065659523010254, + "step": 372 + }, + { + "epoch": 2.82, + "grad_norm": 8.41997319865235, + "learning_rate": 3.230337078651686e-08, + "logps/chosen": -55.254249572753906, + "logps/rejected": -81.1642074584961, + "loss": 0.3146, + "losses/dpo": 0.3366415500640869, + "losses/sft": 1.8378387689590454, + "losses/total": 0.3366415500640869, + "ref_logps/chosen": -41.7273063659668, + "ref_logps/rejected": -51.86448669433594, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -1.3526947498321533, + "rewards/margins": 1.577277421951294, + "rewards/rejected": -2.929971933364868, + "step": 373 + }, + { + "epoch": 2.82, + "grad_norm": 9.08048419718233, + "learning_rate": 3.089887640449438e-08, + "logps/chosen": -52.86750793457031, + "logps/rejected": -77.41828918457031, + "loss": 0.3294, + "losses/dpo": 0.23508216440677643, + "losses/sft": 1.5872150659561157, + "losses/total": 0.23508216440677643, + "ref_logps/chosen": -39.141883850097656, + "ref_logps/rejected": -49.06714630126953, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3725626468658447, + "rewards/margins": 1.4625511169433594, + "rewards/rejected": -2.835113763809204, + "step": 374 + }, + { + "epoch": 2.83, + "grad_norm": 9.149313086592246, + "learning_rate": 2.949438202247191e-08, + "logps/chosen": -50.02390670776367, + "logps/rejected": -75.56754302978516, + "loss": 0.362, + "losses/dpo": 0.6652272939682007, + "losses/sft": 2.926239252090454, + "losses/total": 0.6652272939682007, + "ref_logps/chosen": -37.26597595214844, + "ref_logps/rejected": -48.15172576904297, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2757928371429443, + "rewards/margins": 1.4657888412475586, + "rewards/rejected": -2.741581916809082, + "step": 375 + }, + { + "epoch": 2.84, + "grad_norm": 9.114150173411574, + "learning_rate": 2.8089887640449436e-08, + "logps/chosen": -55.91659164428711, + "logps/rejected": -74.04378509521484, + "loss": 0.3668, + "losses/dpo": 0.2893008589744568, + "losses/sft": 2.1376471519470215, + "losses/total": 0.2893008589744568, + "ref_logps/chosen": -42.005611419677734, + "ref_logps/rejected": -46.799495697021484, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.3910987377166748, + "rewards/margins": 1.333329677581787, + "rewards/rejected": -2.724428415298462, + "step": 376 + }, + { + "epoch": 2.85, + "grad_norm": 10.02587161317734, + "learning_rate": 2.6685393258426963e-08, + "logps/chosen": -53.21587371826172, + "logps/rejected": -71.60870361328125, + "loss": 0.4256, + "losses/dpo": 0.7751315236091614, + "losses/sft": 2.11029314994812, + "losses/total": 0.7751315236091614, + "ref_logps/chosen": -40.663108825683594, + "ref_logps/rejected": -46.480648040771484, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.2552767992019653, + "rewards/margins": 1.25752854347229, + "rewards/rejected": -2.512805700302124, + "step": 377 + }, + { + "epoch": 2.85, + "grad_norm": 8.810937586248796, + "learning_rate": 2.5280898876404493e-08, + "logps/chosen": -52.51762008666992, + "logps/rejected": -77.57078552246094, + "loss": 0.3381, + "losses/dpo": 0.28002500534057617, + "losses/sft": 1.5633399486541748, + "losses/total": 0.28002500534057617, + "ref_logps/chosen": -39.36627960205078, + "ref_logps/rejected": -50.626708984375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.315134048461914, + "rewards/margins": 1.3792742490768433, + "rewards/rejected": -2.6944081783294678, + "step": 378 + }, + { + "epoch": 2.86, + "grad_norm": 9.293485858540686, + "learning_rate": 2.387640449438202e-08, + "logps/chosen": -51.378074645996094, + "logps/rejected": -64.74043273925781, + "loss": 0.4262, + "losses/dpo": 0.7660055756568909, + "losses/sft": 2.2007508277893066, + "losses/total": 0.7660055756568909, + "ref_logps/chosen": -39.29324722290039, + "ref_logps/rejected": -41.33624267578125, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -1.2084828615188599, + "rewards/margins": 1.131935954093933, + "rewards/rejected": -2.340418815612793, + "step": 379 + }, + { + "epoch": 2.87, + "grad_norm": 8.825439696536032, + "learning_rate": 2.2471910112359548e-08, + "logps/chosen": -55.97784423828125, + "logps/rejected": -78.24275207519531, + "loss": 0.345, + "losses/dpo": 0.33630573749542236, + "losses/sft": 2.7268307209014893, + "losses/total": 0.33630573749542236, + "ref_logps/chosen": -41.14834976196289, + "ref_logps/rejected": -49.048004150390625, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -1.4829493761062622, + "rewards/margins": 1.4365259408950806, + "rewards/rejected": -2.9194750785827637, + "step": 380 + }, + { + "epoch": 2.88, + "grad_norm": 8.683259186904719, + "learning_rate": 2.106741573033708e-08, + "logps/chosen": -52.257469177246094, + "logps/rejected": -67.984130859375, + "loss": 0.4087, + "losses/dpo": 0.3859608471393585, + "losses/sft": 1.8246614933013916, + "losses/total": 0.3859608471393585, + "ref_logps/chosen": -38.84019470214844, + "ref_logps/rejected": -43.60353088378906, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.341727375984192, + "rewards/margins": 1.0963327884674072, + "rewards/rejected": -2.4380600452423096, + "step": 381 + }, + { + "epoch": 2.88, + "grad_norm": 8.270857768884154, + "learning_rate": 1.9662921348314606e-08, + "logps/chosen": -54.033363342285156, + "logps/rejected": -77.31697082519531, + "loss": 0.3287, + "losses/dpo": 0.19281096756458282, + "losses/sft": 1.8291985988616943, + "losses/total": 0.19281096756458282, + "ref_logps/chosen": -39.56074523925781, + "ref_logps/rejected": -47.55027770996094, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.4472615718841553, + "rewards/margins": 1.5294086933135986, + "rewards/rejected": -2.9766697883605957, + "step": 382 + }, + { + "epoch": 2.89, + "grad_norm": 9.68267538629506, + "learning_rate": 1.8258426966292133e-08, + "logps/chosen": -53.385498046875, + "logps/rejected": -68.63336181640625, + "loss": 0.391, + "losses/dpo": 0.5480431318283081, + "losses/sft": 2.3586955070495605, + "losses/total": 0.5480431318283081, + "ref_logps/chosen": -40.53166198730469, + "ref_logps/rejected": -44.218746185302734, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.2853829860687256, + "rewards/margins": 1.156078815460205, + "rewards/rejected": -2.4414615631103516, + "step": 383 + }, + { + "epoch": 2.9, + "grad_norm": 8.588925065399472, + "learning_rate": 1.685393258426966e-08, + "logps/chosen": -53.107017517089844, + "logps/rejected": -73.8092041015625, + "loss": 0.3472, + "losses/dpo": 0.5091351866722107, + "losses/sft": 2.4279067516326904, + "losses/total": 0.5091351866722107, + "ref_logps/chosen": -39.874427795410156, + "ref_logps/rejected": -47.138092041015625, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.3232589960098267, + "rewards/margins": 1.34385085105896, + "rewards/rejected": -2.667109966278076, + "step": 384 + }, + { + "epoch": 2.91, + "grad_norm": 9.061222152581863, + "learning_rate": 1.544943820224719e-08, + "logps/chosen": -55.36473846435547, + "logps/rejected": -71.19318389892578, + "loss": 0.3925, + "losses/dpo": 0.7466526627540588, + "losses/sft": 2.359135627746582, + "losses/total": 0.7466526627540588, + "ref_logps/chosen": -41.025726318359375, + "ref_logps/rejected": -44.06968688964844, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.4339020252227783, + "rewards/margins": 1.2784475088119507, + "rewards/rejected": -2.7123494148254395, + "step": 385 + }, + { + "epoch": 2.91, + "grad_norm": 9.909969914239525, + "learning_rate": 1.4044943820224718e-08, + "logps/chosen": -51.63406753540039, + "logps/rejected": -77.14066314697266, + "loss": 0.3813, + "losses/dpo": 0.6047529578208923, + "losses/sft": 1.7509853839874268, + "losses/total": 0.6047529578208923, + "ref_logps/chosen": -37.65089416503906, + "ref_logps/rejected": -48.93791198730469, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -1.398316740989685, + "rewards/margins": 1.4219584465026855, + "rewards/rejected": -2.82027530670166, + "step": 386 + }, + { + "epoch": 2.92, + "grad_norm": 8.501319268805696, + "learning_rate": 1.2640449438202247e-08, + "logps/chosen": -53.17372512817383, + "logps/rejected": -68.52401733398438, + "loss": 0.3433, + "losses/dpo": 0.2757856249809265, + "losses/sft": 2.1045045852661133, + "losses/total": 0.2757856249809265, + "ref_logps/chosen": -40.52703857421875, + "ref_logps/rejected": -42.728126525878906, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.264668345451355, + "rewards/margins": 1.3149209022521973, + "rewards/rejected": -2.579589366912842, + "step": 387 + }, + { + "epoch": 2.93, + "grad_norm": 8.718869385679747, + "learning_rate": 1.1235955056179774e-08, + "logps/chosen": -54.90761184692383, + "logps/rejected": -70.40953826904297, + "loss": 0.3789, + "losses/dpo": 0.5766834020614624, + "losses/sft": 2.222163200378418, + "losses/total": 0.5766834020614624, + "ref_logps/chosen": -41.6505012512207, + "ref_logps/rejected": -44.63560485839844, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -1.3257105350494385, + "rewards/margins": 1.2516822814941406, + "rewards/rejected": -2.577392816543579, + "step": 388 + }, + { + "epoch": 2.94, + "grad_norm": 9.426380018310086, + "learning_rate": 9.831460674157303e-09, + "logps/chosen": -54.11854934692383, + "logps/rejected": -71.68621826171875, + "loss": 0.386, + "losses/dpo": 0.20794588327407837, + "losses/sft": 2.1166539192199707, + "losses/total": 0.20794588327407837, + "ref_logps/chosen": -40.966426849365234, + "ref_logps/rejected": -45.38508605957031, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.3152116537094116, + "rewards/margins": 1.3149020671844482, + "rewards/rejected": -2.6301136016845703, + "step": 389 + }, + { + "epoch": 2.94, + "grad_norm": 10.46768940793886, + "learning_rate": 8.42696629213483e-09, + "logps/chosen": -54.164424896240234, + "logps/rejected": -71.98051452636719, + "loss": 0.4893, + "losses/dpo": 0.39178702235221863, + "losses/sft": 2.2996134757995605, + "losses/total": 0.39178702235221863, + "ref_logps/chosen": -40.31678009033203, + "ref_logps/rejected": -46.53054428100586, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.384764313697815, + "rewards/margins": 1.1602333784103394, + "rewards/rejected": -2.5449976921081543, + "step": 390 + }, + { + "epoch": 2.95, + "grad_norm": 8.941416739373434, + "learning_rate": 7.022471910112359e-09, + "logps/chosen": -53.463233947753906, + "logps/rejected": -71.62788391113281, + "loss": 0.3318, + "losses/dpo": 0.2409718632698059, + "losses/sft": 1.6294231414794922, + "losses/total": 0.2409718632698059, + "ref_logps/chosen": -41.0477294921875, + "ref_logps/rejected": -44.89318084716797, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -1.2415508031845093, + "rewards/margins": 1.4319190979003906, + "rewards/rejected": -2.6734697818756104, + "step": 391 + }, + { + "epoch": 2.96, + "grad_norm": 8.98210941116973, + "learning_rate": 5.617977528089887e-09, + "logps/chosen": -54.616600036621094, + "logps/rejected": -73.2689208984375, + "loss": 0.3832, + "losses/dpo": 0.43616408109664917, + "losses/sft": 2.2494640350341797, + "losses/total": 0.43616408109664917, + "ref_logps/chosen": -40.2120361328125, + "ref_logps/rejected": -46.42675018310547, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -1.4404562711715698, + "rewards/margins": 1.2437611818313599, + "rewards/rejected": -2.684217691421509, + "step": 392 + }, + { + "epoch": 2.97, + "grad_norm": 9.151862468783703, + "learning_rate": 4.213483146067415e-09, + "logps/chosen": -51.817893981933594, + "logps/rejected": -69.46862030029297, + "loss": 0.3906, + "losses/dpo": 0.2829042077064514, + "losses/sft": 2.443455696105957, + "losses/total": 0.2829042077064514, + "ref_logps/chosen": -38.848846435546875, + "ref_logps/rejected": -43.35674285888672, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.2969045639038086, + "rewards/margins": 1.3142831325531006, + "rewards/rejected": -2.61118745803833, + "step": 393 + }, + { + "epoch": 2.97, + "grad_norm": 10.678776530633808, + "learning_rate": 2.8089887640449435e-09, + "logps/chosen": -54.69252014160156, + "logps/rejected": -73.899658203125, + "loss": 0.4591, + "losses/dpo": 0.4431733191013336, + "losses/sft": 2.1250791549682617, + "losses/total": 0.4431733191013336, + "ref_logps/chosen": -39.718658447265625, + "ref_logps/rejected": -48.00882339477539, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4973857402801514, + "rewards/margins": 1.0916969776153564, + "rewards/rejected": -2.589082717895508, + "step": 394 + }, + { + "epoch": 2.98, + "grad_norm": 8.192368817594446, + "learning_rate": 1.4044943820224717e-09, + "logps/chosen": -50.57025909423828, + "logps/rejected": -69.49359130859375, + "loss": 0.3625, + "losses/dpo": 0.550754964351654, + "losses/sft": 2.1057140827178955, + "losses/total": 0.550754964351654, + "ref_logps/chosen": -38.39902877807617, + "ref_logps/rejected": -43.90056228637695, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2171236276626587, + "rewards/margins": 1.342179298400879, + "rewards/rejected": -2.559302806854248, + "step": 395 + }, + { + "epoch": 2.99, + "grad_norm": 8.91219728509694, + "learning_rate": 0.0, + "logps/chosen": -56.36674499511719, + "logps/rejected": -75.95218658447266, + "loss": 0.3695, + "losses/dpo": 0.45165300369262695, + "losses/sft": 1.7463542222976685, + "losses/total": 0.45165300369262695, + "ref_logps/chosen": -42.22056198120117, + "ref_logps/rejected": -48.152099609375, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.4146177768707275, + "rewards/margins": 1.3653908967971802, + "rewards/rejected": -2.7800087928771973, + "step": 396 + }, + { + "epoch": 2.99, + "step": 396, + "total_flos": 0.0, + "train_loss": 0.5140665640132596, + "train_runtime": 34070.7646, + "train_samples_per_second": 1.493, + "train_steps_per_second": 0.012 + } + ], + "logging_steps": 1.0, + "max_steps": 396, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 70, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}