diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4566 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100, + "global_step": 2907, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.7182130584192438e-09, + "logits/chosen": -2.809938669204712, + "logits/rejected": -2.8543002605438232, + "logps/chosen": -108.84485626220703, + "logps/rejected": -104.8216552734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -3.0993778705596924, + "logits/rejected": -3.087177276611328, + "logps/chosen": -240.08148193359375, + "logps/rejected": -212.52203369140625, + "loss": 0.6929, + "rewards/accuracies": 0.5555555820465088, + "rewards/chosen": 0.004180057905614376, + "rewards/margins": 0.010151715017855167, + "rewards/rejected": -0.005971657112240791, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.9975037574768066, + "logits/rejected": -3.020071268081665, + "logps/chosen": -277.745361328125, + "logps/rejected": -251.31045532226562, + "loss": 0.6891, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0006220510113053024, + "rewards/margins": 0.009054403752088547, + "rewards/rejected": -0.009676454588770866, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -3.0415549278259277, + "logits/rejected": -3.058453321456909, + "logps/chosen": -269.6055603027344, + "logps/rejected": -234.25210571289062, + "loss": 0.6757, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.023628996685147285, + "rewards/margins": 0.044883329421281815, + "rewards/rejected": -0.02125433087348938, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -3.0699856281280518, + "logits/rejected": -3.061784505844116, + "logps/chosen": -313.98760986328125, + "logps/rejected": -281.23638916015625, + "loss": 0.6528, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.06499715149402618, + "rewards/margins": 0.10613612085580826, + "rewards/rejected": -0.041138969361782074, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -2.979653835296631, + "logits/rejected": -2.9869930744171143, + "logps/chosen": -340.1573486328125, + "logps/rejected": -217.7376708984375, + "loss": 0.6327, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.06802070885896683, + "rewards/margins": 0.1756124347448349, + "rewards/rejected": -0.10759172588586807, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -2.9975168704986572, + "logits/rejected": -2.9845681190490723, + "logps/chosen": -255.650146484375, + "logps/rejected": -238.10183715820312, + "loss": 0.6021, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.003270727349445224, + "rewards/margins": 0.21992146968841553, + "rewards/rejected": -0.2231922149658203, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -3.0794243812561035, + "logits/rejected": -3.0562033653259277, + "logps/chosen": -333.88580322265625, + "logps/rejected": -250.5960693359375, + "loss": 0.5817, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1207752674818039, + "rewards/margins": 0.4231022000312805, + "rewards/rejected": -0.30232688784599304, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -3.0469672679901123, + "logits/rejected": -3.0105535984039307, + "logps/chosen": -262.3305969238281, + "logps/rejected": -227.277587890625, + "loss": 0.5512, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.13038447499275208, + "rewards/margins": 0.5708122253417969, + "rewards/rejected": -0.4404277801513672, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -3.0962467193603516, + "logits/rejected": -3.0591776371002197, + "logps/chosen": -260.2414855957031, + "logps/rejected": -228.8573760986328, + "loss": 0.5069, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.16786682605743408, + "rewards/margins": 0.885595440864563, + "rewards/rejected": -0.7177285552024841, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -2.9466209411621094, + "logits/rejected": -2.9162044525146484, + "logps/chosen": -279.3992614746094, + "logps/rejected": -209.82199096679688, + "loss": 0.5397, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.1838654726743698, + "rewards/margins": 0.7952971458435059, + "rewards/rejected": -0.6114317774772644, + "step": 100 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -3.00329327583313, + "eval_logits/rejected": -2.9742367267608643, + "eval_logps/chosen": -295.24578857421875, + "eval_logps/rejected": -251.75856018066406, + "eval_loss": 0.5210586190223694, + "eval_rewards/accuracies": 0.7579365372657776, + "eval_rewards/chosen": 0.12868967652320862, + "eval_rewards/margins": 0.813798725605011, + "eval_rewards/rejected": -0.6851091384887695, + "eval_runtime": 83.4523, + "eval_samples_per_second": 23.966, + "eval_steps_per_second": 0.755, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -2.9794344902038574, + "logits/rejected": -2.931662082672119, + "logps/chosen": -259.1096496582031, + "logps/rejected": -254.18624877929688, + "loss": 0.5465, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.052236903458833694, + "rewards/margins": 0.7087007761001587, + "rewards/rejected": -0.7609376311302185, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -3.0205585956573486, + "logits/rejected": -2.995251417160034, + "logps/chosen": -328.3795166015625, + "logps/rejected": -251.7208709716797, + "loss": 0.529, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04647420346736908, + "rewards/margins": 0.65358567237854, + "rewards/rejected": -0.7000598311424255, + "step": 120 + }, + { + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -3.0943045616149902, + "logits/rejected": -3.045710563659668, + "logps/chosen": -290.30181884765625, + "logps/rejected": -255.75521850585938, + "loss": 0.5239, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.15326093137264252, + "rewards/margins": 0.905988335609436, + "rewards/rejected": -0.7527275085449219, + "step": 130 + }, + { + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -3.0118863582611084, + "logits/rejected": -2.9938008785247803, + "logps/chosen": -272.8241271972656, + "logps/rejected": -227.0712890625, + "loss": 0.4998, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.17975559830665588, + "rewards/margins": 0.9572590589523315, + "rewards/rejected": -0.7775036096572876, + "step": 140 + }, + { + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -3.014378786087036, + "logits/rejected": -3.011509895324707, + "logps/chosen": -276.01239013671875, + "logps/rejected": -243.67691040039062, + "loss": 0.5232, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.018592316657304764, + "rewards/margins": 0.9336894750595093, + "rewards/rejected": -0.9150971174240112, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -3.080382823944092, + "logits/rejected": -3.083763837814331, + "logps/chosen": -319.6217346191406, + "logps/rejected": -261.18292236328125, + "loss": 0.4862, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.07249774783849716, + "rewards/margins": 1.1103062629699707, + "rewards/rejected": -1.1828041076660156, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -2.9972214698791504, + "logits/rejected": -2.9871556758880615, + "logps/chosen": -280.0781555175781, + "logps/rejected": -237.6610107421875, + "loss": 0.5081, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.24302425980567932, + "rewards/margins": 0.6678057312965393, + "rewards/rejected": -0.9108299016952515, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -3.060957193374634, + "logits/rejected": -3.0258703231811523, + "logps/chosen": -223.1851348876953, + "logps/rejected": -215.2241668701172, + "loss": 0.4735, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.10333947092294693, + "rewards/margins": 0.957025408744812, + "rewards/rejected": -1.0603649616241455, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -3.0781607627868652, + "logits/rejected": -3.0256905555725098, + "logps/chosen": -273.40521240234375, + "logps/rejected": -211.7317657470703, + "loss": 0.513, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.06486662477254868, + "rewards/margins": 1.1147286891937256, + "rewards/rejected": -1.049862027168274, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -3.0979599952697754, + "logits/rejected": -3.034374713897705, + "logps/chosen": -228.7999725341797, + "logps/rejected": -172.1134796142578, + "loss": 0.4919, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.19804462790489197, + "rewards/margins": 0.8455147743225098, + "rewards/rejected": -1.0435593128204346, + "step": 200 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -3.089780569076538, + "eval_logits/rejected": -3.068847179412842, + "eval_logps/chosen": -296.25518798828125, + "eval_logps/rejected": -256.5061340332031, + "eval_loss": 0.48726680874824524, + "eval_rewards/accuracies": 0.7896825671195984, + "eval_rewards/chosen": 0.027751244604587555, + "eval_rewards/margins": 1.1876167058944702, + "eval_rewards/rejected": -1.159865379333496, + "eval_runtime": 85.3677, + "eval_samples_per_second": 23.428, + "eval_steps_per_second": 0.738, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -3.0770697593688965, + "logits/rejected": -3.0069308280944824, + "logps/chosen": -305.48211669921875, + "logps/rejected": -228.2117919921875, + "loss": 0.4486, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06476946175098419, + "rewards/margins": 1.3383097648620605, + "rewards/rejected": -1.4030791521072388, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -2.9890925884246826, + "logits/rejected": -2.9934885501861572, + "logps/chosen": -230.14501953125, + "logps/rejected": -242.2181396484375, + "loss": 0.5072, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.388418048620224, + "rewards/margins": 1.1304852962493896, + "rewards/rejected": -1.5189034938812256, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -3.037731647491455, + "logits/rejected": -3.0365240573883057, + "logps/chosen": -246.9208984375, + "logps/rejected": -229.67221069335938, + "loss": 0.5144, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21415448188781738, + "rewards/margins": 1.0256799459457397, + "rewards/rejected": -1.2398344278335571, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -3.164783000946045, + "logits/rejected": -3.105828285217285, + "logps/chosen": -323.3183288574219, + "logps/rejected": -223.78634643554688, + "loss": 0.4627, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.21443960070610046, + "rewards/margins": 1.0258036851882935, + "rewards/rejected": -1.2402431964874268, + "step": 240 + }, + { + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -3.1041345596313477, + "logits/rejected": -3.085723876953125, + "logps/chosen": -266.01995849609375, + "logps/rejected": -250.32656860351562, + "loss": 0.4745, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4707525372505188, + "rewards/margins": 1.0131726264953613, + "rewards/rejected": -1.4839251041412354, + "step": 250 + }, + { + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -3.0145013332366943, + "logits/rejected": -3.0256457328796387, + "logps/chosen": -280.07232666015625, + "logps/rejected": -225.8538818359375, + "loss": 0.5126, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.49882835149765015, + "rewards/margins": 1.2246119976043701, + "rewards/rejected": -1.723440408706665, + "step": 260 + }, + { + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -3.1204419136047363, + "logits/rejected": -3.060319423675537, + "logps/chosen": -303.67218017578125, + "logps/rejected": -246.41006469726562, + "loss": 0.5243, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3401849865913391, + "rewards/margins": 0.9501386880874634, + "rewards/rejected": -1.2903234958648682, + "step": 270 + }, + { + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -3.066493272781372, + "logits/rejected": -3.0354180335998535, + "logps/chosen": -282.0846862792969, + "logps/rejected": -242.1811981201172, + "loss": 0.5183, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42353829741477966, + "rewards/margins": 1.0371509790420532, + "rewards/rejected": -1.4606893062591553, + "step": 280 + }, + { + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -3.1275315284729004, + "logits/rejected": -3.0769925117492676, + "logps/chosen": -279.0789794921875, + "logps/rejected": -230.20285034179688, + "loss": 0.5104, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.23357586562633514, + "rewards/margins": 0.9790124893188477, + "rewards/rejected": -1.2125883102416992, + "step": 290 + }, + { + "epoch": 0.31, + "learning_rate": 4.982798165137615e-07, + "logits/chosen": -3.0245261192321777, + "logits/rejected": -2.995250701904297, + "logps/chosen": -237.4323272705078, + "logps/rejected": -246.7605743408203, + "loss": 0.4802, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5877944231033325, + "rewards/margins": 0.8804357647895813, + "rewards/rejected": -1.4682300090789795, + "step": 300 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -3.0827579498291016, + "eval_logits/rejected": -3.0494439601898193, + "eval_logps/chosen": -298.7669372558594, + "eval_logps/rejected": -258.1646423339844, + "eval_loss": 0.5026515126228333, + "eval_rewards/accuracies": 0.7539682388305664, + "eval_rewards/chosen": -0.22342802584171295, + "eval_rewards/margins": 1.102290391921997, + "eval_rewards/rejected": -1.3257185220718384, + "eval_runtime": 84.4222, + "eval_samples_per_second": 23.69, + "eval_steps_per_second": 0.746, + "step": 300 + }, + { + "epoch": 0.32, + "learning_rate": 4.963685015290519e-07, + "logits/chosen": -3.1253762245178223, + "logits/rejected": -3.056403636932373, + "logps/chosen": -326.9117431640625, + "logps/rejected": -284.8948059082031, + "loss": 0.5358, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14515358209609985, + "rewards/margins": 1.0375834703445435, + "rewards/rejected": -1.182737112045288, + "step": 310 + }, + { + "epoch": 0.33, + "learning_rate": 4.944571865443424e-07, + "logits/chosen": -3.059190273284912, + "logits/rejected": -3.0378201007843018, + "logps/chosen": -255.5692596435547, + "logps/rejected": -201.07577514648438, + "loss": 0.4657, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1327451914548874, + "rewards/margins": 1.4812240600585938, + "rewards/rejected": -1.6139692068099976, + "step": 320 + }, + { + "epoch": 0.34, + "learning_rate": 4.92545871559633e-07, + "logits/chosen": -3.066437005996704, + "logits/rejected": -3.0722594261169434, + "logps/chosen": -332.57073974609375, + "logps/rejected": -253.54833984375, + "loss": 0.506, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.022808248177170753, + "rewards/margins": 1.1694247722625732, + "rewards/rejected": -1.1466165781021118, + "step": 330 + }, + { + "epoch": 0.35, + "learning_rate": 4.906345565749235e-07, + "logits/chosen": -2.9793455600738525, + "logits/rejected": -2.957947254180908, + "logps/chosen": -246.28836059570312, + "logps/rejected": -238.5087890625, + "loss": 0.5348, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2783312201499939, + "rewards/margins": 0.9609475135803223, + "rewards/rejected": -1.2392786741256714, + "step": 340 + }, + { + "epoch": 0.36, + "learning_rate": 4.88723241590214e-07, + "logits/chosen": -2.998023271560669, + "logits/rejected": -2.9771180152893066, + "logps/chosen": -307.2586975097656, + "logps/rejected": -251.5006866455078, + "loss": 0.4786, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11792813241481781, + "rewards/margins": 1.2310340404510498, + "rewards/rejected": -1.3489621877670288, + "step": 350 + }, + { + "epoch": 0.37, + "learning_rate": 4.868119266055046e-07, + "logits/chosen": -3.030704975128174, + "logits/rejected": -3.0520386695861816, + "logps/chosen": -295.72540283203125, + "logps/rejected": -283.2640075683594, + "loss": 0.4889, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5819287896156311, + "rewards/margins": 1.3330873250961304, + "rewards/rejected": -1.9150161743164062, + "step": 360 + }, + { + "epoch": 0.38, + "learning_rate": 4.849006116207951e-07, + "logits/chosen": -3.0758090019226074, + "logits/rejected": -3.0432868003845215, + "logps/chosen": -293.5560607910156, + "logps/rejected": -271.2206115722656, + "loss": 0.5067, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.4366377294063568, + "rewards/margins": 1.4355862140655518, + "rewards/rejected": -1.8722240924835205, + "step": 370 + }, + { + "epoch": 0.39, + "learning_rate": 4.829892966360856e-07, + "logits/chosen": -3.0619266033172607, + "logits/rejected": -3.067351818084717, + "logps/chosen": -315.32940673828125, + "logps/rejected": -284.5444030761719, + "loss": 0.533, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.23806679248809814, + "rewards/margins": 1.37300705909729, + "rewards/rejected": -1.6110738515853882, + "step": 380 + }, + { + "epoch": 0.4, + "learning_rate": 4.810779816513762e-07, + "logits/chosen": -2.9711368083953857, + "logits/rejected": -2.97477388381958, + "logps/chosen": -250.7084197998047, + "logps/rejected": -238.7926025390625, + "loss": 0.521, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6350131034851074, + "rewards/margins": 0.9497348666191101, + "rewards/rejected": -1.5847480297088623, + "step": 390 + }, + { + "epoch": 0.41, + "learning_rate": 4.791666666666667e-07, + "logits/chosen": -2.890068531036377, + "logits/rejected": -2.8569467067718506, + "logps/chosen": -278.987060546875, + "logps/rejected": -212.63363647460938, + "loss": 0.5134, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.20403578877449036, + "rewards/margins": 1.445931315422058, + "rewards/rejected": -1.6499669551849365, + "step": 400 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.9178526401519775, + "eval_logits/rejected": -2.884305238723755, + "eval_logps/chosen": -299.4102478027344, + "eval_logps/rejected": -261.616943359375, + "eval_loss": 0.5097789764404297, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": -0.28775671124458313, + "eval_rewards/margins": 1.3831894397735596, + "eval_rewards/rejected": -1.6709461212158203, + "eval_runtime": 85.2881, + "eval_samples_per_second": 23.45, + "eval_steps_per_second": 0.739, + "step": 400 + }, + { + "epoch": 0.42, + "learning_rate": 4.772553516819572e-07, + "logits/chosen": -2.901624917984009, + "logits/rejected": -2.869872808456421, + "logps/chosen": -292.89959716796875, + "logps/rejected": -284.45684814453125, + "loss": 0.5223, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2602219879627228, + "rewards/margins": 1.1454756259918213, + "rewards/rejected": -1.4056975841522217, + "step": 410 + }, + { + "epoch": 0.43, + "learning_rate": 4.753440366972477e-07, + "logits/chosen": -3.0204930305480957, + "logits/rejected": -2.975163221359253, + "logps/chosen": -234.13818359375, + "logps/rejected": -242.6905059814453, + "loss": 0.5389, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4058782458305359, + "rewards/margins": 0.8633907437324524, + "rewards/rejected": -1.2692689895629883, + "step": 420 + }, + { + "epoch": 0.44, + "learning_rate": 4.7343272171253825e-07, + "logits/chosen": -3.014406681060791, + "logits/rejected": -2.977393627166748, + "logps/chosen": -251.76904296875, + "logps/rejected": -243.6968231201172, + "loss": 0.5389, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.34599775075912476, + "rewards/margins": 1.2039783000946045, + "rewards/rejected": -1.549976110458374, + "step": 430 + }, + { + "epoch": 0.45, + "learning_rate": 4.715214067278288e-07, + "logits/chosen": -2.9510791301727295, + "logits/rejected": -2.8885598182678223, + "logps/chosen": -286.24798583984375, + "logps/rejected": -241.7139892578125, + "loss": 0.4669, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.2214752435684204, + "rewards/margins": 1.800271987915039, + "rewards/rejected": -2.021747350692749, + "step": 440 + }, + { + "epoch": 0.46, + "learning_rate": 4.696100917431192e-07, + "logits/chosen": -3.0806918144226074, + "logits/rejected": -3.0243117809295654, + "logps/chosen": -324.5965881347656, + "logps/rejected": -282.03594970703125, + "loss": 0.5064, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.27235084772109985, + "rewards/margins": 0.9739311337471008, + "rewards/rejected": -1.2462819814682007, + "step": 450 + }, + { + "epoch": 0.47, + "learning_rate": 4.6769877675840974e-07, + "logits/chosen": -2.9995086193084717, + "logits/rejected": -2.9964065551757812, + "logps/chosen": -262.6032409667969, + "logps/rejected": -252.2246551513672, + "loss": 0.4866, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.4137532711029053, + "rewards/margins": 1.2080386877059937, + "rewards/rejected": -1.621792197227478, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 4.6578746177370027e-07, + "logits/chosen": -2.9163661003112793, + "logits/rejected": -2.9061973094940186, + "logps/chosen": -201.60374450683594, + "logps/rejected": -205.3343048095703, + "loss": 0.4674, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.28440338373184204, + "rewards/margins": 1.4812262058258057, + "rewards/rejected": -1.765629768371582, + "step": 470 + }, + { + "epoch": 0.5, + "learning_rate": 4.638761467889908e-07, + "logits/chosen": -2.9122395515441895, + "logits/rejected": -2.8503425121307373, + "logps/chosen": -287.53436279296875, + "logps/rejected": -242.94888305664062, + "loss": 0.5033, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.3852013349533081, + "rewards/margins": 1.852294921875, + "rewards/rejected": -2.2374961376190186, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 4.6196483180428133e-07, + "logits/chosen": -2.9731476306915283, + "logits/rejected": -2.965475559234619, + "logps/chosen": -285.62945556640625, + "logps/rejected": -235.3878631591797, + "loss": 0.5059, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.01807239092886448, + "rewards/margins": 1.6012237071990967, + "rewards/rejected": -1.6192958354949951, + "step": 490 + }, + { + "epoch": 0.52, + "learning_rate": 4.600535168195718e-07, + "logits/chosen": -2.9572091102600098, + "logits/rejected": -2.9516422748565674, + "logps/chosen": -218.7193603515625, + "logps/rejected": -225.1012420654297, + "loss": 0.4534, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3155478835105896, + "rewards/margins": 0.9508973956108093, + "rewards/rejected": -1.2664451599121094, + "step": 500 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -3.0181987285614014, + "eval_logits/rejected": -2.980436325073242, + "eval_logps/chosen": -298.340576171875, + "eval_logps/rejected": -261.2433166503906, + "eval_loss": 0.49045199155807495, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": -0.18078860640525818, + "eval_rewards/margins": 1.452793002128601, + "eval_rewards/rejected": -1.6335817575454712, + "eval_runtime": 86.1119, + "eval_samples_per_second": 23.226, + "eval_steps_per_second": 0.732, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 4.5814220183486234e-07, + "logits/chosen": -2.905066967010498, + "logits/rejected": -2.841881275177002, + "logps/chosen": -303.79681396484375, + "logps/rejected": -282.34051513671875, + "loss": 0.5138, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.47309979796409607, + "rewards/margins": 1.0634024143218994, + "rewards/rejected": -1.5365021228790283, + "step": 510 + }, + { + "epoch": 0.54, + "learning_rate": 4.562308868501529e-07, + "logits/chosen": -2.857588291168213, + "logits/rejected": -2.8893682956695557, + "logps/chosen": -264.73577880859375, + "logps/rejected": -275.3564758300781, + "loss": 0.5008, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2937226891517639, + "rewards/margins": 1.5019558668136597, + "rewards/rejected": -1.7956784963607788, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 4.543195718654434e-07, + "logits/chosen": -2.914440393447876, + "logits/rejected": -2.902010440826416, + "logps/chosen": -230.23721313476562, + "logps/rejected": -215.05258178710938, + "loss": 0.5034, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4968380928039551, + "rewards/margins": 0.6932927966117859, + "rewards/rejected": -1.1901309490203857, + "step": 530 + }, + { + "epoch": 0.56, + "learning_rate": 4.5240825688073394e-07, + "logits/chosen": -3.00209379196167, + "logits/rejected": -2.9657745361328125, + "logps/chosen": -291.4919128417969, + "logps/rejected": -249.2689208984375, + "loss": 0.5174, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.5610731244087219, + "rewards/margins": 1.5831964015960693, + "rewards/rejected": -2.1442694664001465, + "step": 540 + }, + { + "epoch": 0.57, + "learning_rate": 4.504969418960244e-07, + "logits/chosen": -2.939059257507324, + "logits/rejected": -2.934699535369873, + "logps/chosen": -267.82537841796875, + "logps/rejected": -286.34893798828125, + "loss": 0.5132, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7965409159660339, + "rewards/margins": 1.2346630096435547, + "rewards/rejected": -2.0312037467956543, + "step": 550 + }, + { + "epoch": 0.58, + "learning_rate": 4.4858562691131495e-07, + "logits/chosen": -2.9643239974975586, + "logits/rejected": -2.970371723175049, + "logps/chosen": -303.12457275390625, + "logps/rejected": -302.5494689941406, + "loss": 0.5128, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6967397928237915, + "rewards/margins": 1.166548490524292, + "rewards/rejected": -1.863288164138794, + "step": 560 + }, + { + "epoch": 0.59, + "learning_rate": 4.466743119266055e-07, + "logits/chosen": -2.896851062774658, + "logits/rejected": -2.875133752822876, + "logps/chosen": -287.5796813964844, + "logps/rejected": -261.62530517578125, + "loss": 0.4649, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7515207529067993, + "rewards/margins": 1.2609531879425049, + "rewards/rejected": -2.0124735832214355, + "step": 570 + }, + { + "epoch": 0.6, + "learning_rate": 4.44762996941896e-07, + "logits/chosen": -2.923142910003662, + "logits/rejected": -2.85493540763855, + "logps/chosen": -303.4167175292969, + "logps/rejected": -275.49822998046875, + "loss": 0.4969, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.43573087453842163, + "rewards/margins": 1.485872507095337, + "rewards/rejected": -1.9216034412384033, + "step": 580 + }, + { + "epoch": 0.61, + "learning_rate": 4.4285168195718655e-07, + "logits/chosen": -2.8740463256835938, + "logits/rejected": -2.8674261569976807, + "logps/chosen": -239.81216430664062, + "logps/rejected": -256.44781494140625, + "loss": 0.4749, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5639557838439941, + "rewards/margins": 1.198778748512268, + "rewards/rejected": -1.7627344131469727, + "step": 590 + }, + { + "epoch": 0.62, + "learning_rate": 4.40940366972477e-07, + "logits/chosen": -2.9853415489196777, + "logits/rejected": -2.9935359954833984, + "logps/chosen": -242.20938110351562, + "logps/rejected": -233.65548706054688, + "loss": 0.4976, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4875938892364502, + "rewards/margins": 1.0177834033966064, + "rewards/rejected": -1.5053772926330566, + "step": 600 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.972991466522217, + "eval_logits/rejected": -2.9266481399536133, + "eval_logps/chosen": -298.8059387207031, + "eval_logps/rejected": -260.2930603027344, + "eval_loss": 0.4871741831302643, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -0.22732341289520264, + "eval_rewards/margins": 1.3112375736236572, + "eval_rewards/rejected": -1.5385609865188599, + "eval_runtime": 85.4126, + "eval_samples_per_second": 23.416, + "eval_steps_per_second": 0.738, + "step": 600 + }, + { + "epoch": 0.63, + "learning_rate": 4.3902905198776756e-07, + "logits/chosen": -2.962296962738037, + "logits/rejected": -2.944753408432007, + "logps/chosen": -282.0075378417969, + "logps/rejected": -245.8345184326172, + "loss": 0.4831, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3591349720954895, + "rewards/margins": 1.316572904586792, + "rewards/rejected": -1.6757080554962158, + "step": 610 + }, + { + "epoch": 0.64, + "learning_rate": 4.371177370030581e-07, + "logits/chosen": -2.9345905780792236, + "logits/rejected": -2.911076545715332, + "logps/chosen": -283.3570251464844, + "logps/rejected": -251.5830841064453, + "loss": 0.4764, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.44033902883529663, + "rewards/margins": 1.620408296585083, + "rewards/rejected": -2.0607473850250244, + "step": 620 + }, + { + "epoch": 0.65, + "learning_rate": 4.352064220183486e-07, + "logits/chosen": -2.9641098976135254, + "logits/rejected": -2.9336962699890137, + "logps/chosen": -224.51687622070312, + "logps/rejected": -220.58651733398438, + "loss": 0.5132, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.6185753345489502, + "rewards/margins": 1.2681429386138916, + "rewards/rejected": -1.8867183923721313, + "step": 630 + }, + { + "epoch": 0.66, + "learning_rate": 4.3329510703363915e-07, + "logits/chosen": -3.050631046295166, + "logits/rejected": -2.9990761280059814, + "logps/chosen": -263.21337890625, + "logps/rejected": -234.34375, + "loss": 0.5008, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5043941140174866, + "rewards/margins": 1.566725254058838, + "rewards/rejected": -2.071119546890259, + "step": 640 + }, + { + "epoch": 0.67, + "learning_rate": 4.313837920489297e-07, + "logits/chosen": -2.907442808151245, + "logits/rejected": -2.897782802581787, + "logps/chosen": -240.8859405517578, + "logps/rejected": -245.74880981445312, + "loss": 0.455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5294120907783508, + "rewards/margins": 1.4188861846923828, + "rewards/rejected": -1.948298454284668, + "step": 650 + }, + { + "epoch": 0.68, + "learning_rate": 4.2947247706422016e-07, + "logits/chosen": -2.8966031074523926, + "logits/rejected": -2.8596677780151367, + "logps/chosen": -265.93841552734375, + "logps/rejected": -256.66900634765625, + "loss": 0.479, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.824979305267334, + "rewards/margins": 1.305490255355835, + "rewards/rejected": -2.130469560623169, + "step": 660 + }, + { + "epoch": 0.69, + "learning_rate": 4.275611620795107e-07, + "logits/chosen": -2.9146366119384766, + "logits/rejected": -2.8707921504974365, + "logps/chosen": -321.3832702636719, + "logps/rejected": -260.4658508300781, + "loss": 0.4993, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8080131411552429, + "rewards/margins": 0.9972589612007141, + "rewards/rejected": -1.8052723407745361, + "step": 670 + }, + { + "epoch": 0.7, + "learning_rate": 4.2564984709480123e-07, + "logits/chosen": -2.9579291343688965, + "logits/rejected": -2.92578125, + "logps/chosen": -300.4450988769531, + "logps/rejected": -243.27645874023438, + "loss": 0.576, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7343733906745911, + "rewards/margins": 1.0802533626556396, + "rewards/rejected": -1.814626693725586, + "step": 680 + }, + { + "epoch": 0.71, + "learning_rate": 4.2373853211009176e-07, + "logits/chosen": -2.968165159225464, + "logits/rejected": -2.956108331680298, + "logps/chosen": -269.56500244140625, + "logps/rejected": -258.65631103515625, + "loss": 0.521, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3433496952056885, + "rewards/margins": 1.492470145225525, + "rewards/rejected": -1.8358198404312134, + "step": 690 + }, + { + "epoch": 0.72, + "learning_rate": 4.2182721712538224e-07, + "logits/chosen": -2.9497225284576416, + "logits/rejected": -2.924623966217041, + "logps/chosen": -293.396484375, + "logps/rejected": -236.59671020507812, + "loss": 0.5452, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5475293397903442, + "rewards/margins": 1.052236795425415, + "rewards/rejected": -1.5997662544250488, + "step": 700 + }, + { + "epoch": 0.72, + "eval_logits/chosen": -2.968550443649292, + "eval_logits/rejected": -2.937694787979126, + "eval_logps/chosen": -301.3451843261719, + "eval_logps/rejected": -261.7586364746094, + "eval_loss": 0.4888325333595276, + "eval_rewards/accuracies": 0.7341269850730896, + "eval_rewards/chosen": -0.48125144839286804, + "eval_rewards/margins": 1.2038646936416626, + "eval_rewards/rejected": -1.6851160526275635, + "eval_runtime": 85.7614, + "eval_samples_per_second": 23.321, + "eval_steps_per_second": 0.735, + "step": 700 + }, + { + "epoch": 0.73, + "learning_rate": 4.199159021406727e-07, + "logits/chosen": -2.8664116859436035, + "logits/rejected": -2.904461622238159, + "logps/chosen": -250.817138671875, + "logps/rejected": -236.36422729492188, + "loss": 0.5166, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6456762552261353, + "rewards/margins": 1.1793797016143799, + "rewards/rejected": -1.8250560760498047, + "step": 710 + }, + { + "epoch": 0.74, + "learning_rate": 4.1800458715596325e-07, + "logits/chosen": -2.9268696308135986, + "logits/rejected": -2.853827953338623, + "logps/chosen": -317.4768981933594, + "logps/rejected": -273.88787841796875, + "loss": 0.5413, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.4396774172782898, + "rewards/margins": 1.1779446601867676, + "rewards/rejected": -1.6176220178604126, + "step": 720 + }, + { + "epoch": 0.75, + "learning_rate": 4.160932721712538e-07, + "logits/chosen": -2.8954315185546875, + "logits/rejected": -2.8464741706848145, + "logps/chosen": -293.01678466796875, + "logps/rejected": -276.9455261230469, + "loss": 0.5042, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5548242330551147, + "rewards/margins": 1.2215098142623901, + "rewards/rejected": -1.7763340473175049, + "step": 730 + }, + { + "epoch": 0.76, + "learning_rate": 4.141819571865443e-07, + "logits/chosen": -2.851848602294922, + "logits/rejected": -2.798680067062378, + "logps/chosen": -262.6050109863281, + "logps/rejected": -218.8506317138672, + "loss": 0.4847, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7612582445144653, + "rewards/margins": 1.4158737659454346, + "rewards/rejected": -2.1771321296691895, + "step": 740 + }, + { + "epoch": 0.77, + "learning_rate": 4.1227064220183485e-07, + "logits/chosen": -2.8720107078552246, + "logits/rejected": -2.8650383949279785, + "logps/chosen": -267.54156494140625, + "logps/rejected": -254.5974578857422, + "loss": 0.4604, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.6114904284477234, + "rewards/margins": 1.7151912450790405, + "rewards/rejected": -2.326681613922119, + "step": 750 + }, + { + "epoch": 0.78, + "learning_rate": 4.103593272171253e-07, + "logits/chosen": -2.842667818069458, + "logits/rejected": -2.8490633964538574, + "logps/chosen": -280.39593505859375, + "logps/rejected": -285.4444274902344, + "loss": 0.5362, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7370551824569702, + "rewards/margins": 1.257871389389038, + "rewards/rejected": -1.9949266910552979, + "step": 760 + }, + { + "epoch": 0.79, + "learning_rate": 4.0844801223241586e-07, + "logits/chosen": -2.9199094772338867, + "logits/rejected": -2.8772215843200684, + "logps/chosen": -287.48944091796875, + "logps/rejected": -255.93338012695312, + "loss": 0.4611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39787834882736206, + "rewards/margins": 1.0551578998565674, + "rewards/rejected": -1.4530361890792847, + "step": 770 + }, + { + "epoch": 0.8, + "learning_rate": 4.065366972477064e-07, + "logits/chosen": -2.8562026023864746, + "logits/rejected": -2.8700435161590576, + "logps/chosen": -293.22271728515625, + "logps/rejected": -266.9279479980469, + "loss": 0.4804, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5370479822158813, + "rewards/margins": 1.5146362781524658, + "rewards/rejected": -2.0516841411590576, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 4.046253822629969e-07, + "logits/chosen": -2.8632709980010986, + "logits/rejected": -2.8351528644561768, + "logps/chosen": -268.63116455078125, + "logps/rejected": -248.3597412109375, + "loss": 0.4616, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.23348459601402283, + "rewards/margins": 1.344719409942627, + "rewards/rejected": -1.5782040357589722, + "step": 790 + }, + { + "epoch": 0.83, + "learning_rate": 4.0271406727828745e-07, + "logits/chosen": -2.8927769660949707, + "logits/rejected": -2.8393540382385254, + "logps/chosen": -264.2203063964844, + "logps/rejected": -240.07052612304688, + "loss": 0.5342, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6227496862411499, + "rewards/margins": 1.1725116968154907, + "rewards/rejected": -1.7952613830566406, + "step": 800 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -2.8821091651916504, + "eval_logits/rejected": -2.8434085845947266, + "eval_logps/chosen": -300.23773193359375, + "eval_logps/rejected": -264.129150390625, + "eval_loss": 0.47742241621017456, + "eval_rewards/accuracies": 0.7857142686843872, + "eval_rewards/chosen": -0.3705042004585266, + "eval_rewards/margins": 1.5516674518585205, + "eval_rewards/rejected": -1.9221714735031128, + "eval_runtime": 84.847, + "eval_samples_per_second": 23.572, + "eval_steps_per_second": 0.743, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 4.00802752293578e-07, + "logits/chosen": -2.839601993560791, + "logits/rejected": -2.832960605621338, + "logps/chosen": -276.8193359375, + "logps/rejected": -253.14517211914062, + "loss": 0.4653, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4897233545780182, + "rewards/margins": 1.3689448833465576, + "rewards/rejected": -1.858668327331543, + "step": 810 + }, + { + "epoch": 0.85, + "learning_rate": 3.9889143730886847e-07, + "logits/chosen": -2.838459014892578, + "logits/rejected": -2.8137266635894775, + "logps/chosen": -314.2509460449219, + "logps/rejected": -267.2218017578125, + "loss": 0.4597, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.45112401247024536, + "rewards/margins": 1.6043586730957031, + "rewards/rejected": -2.0554826259613037, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 3.96980122324159e-07, + "logits/chosen": -2.8855397701263428, + "logits/rejected": -2.8772969245910645, + "logps/chosen": -271.2967224121094, + "logps/rejected": -224.9015655517578, + "loss": 0.4821, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6153538227081299, + "rewards/margins": 1.476227879524231, + "rewards/rejected": -2.0915818214416504, + "step": 830 + }, + { + "epoch": 0.87, + "learning_rate": 3.9506880733944953e-07, + "logits/chosen": -2.904127597808838, + "logits/rejected": -2.8686683177948, + "logps/chosen": -261.08575439453125, + "logps/rejected": -243.058837890625, + "loss": 0.5536, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.7326072454452515, + "rewards/margins": 1.5098663568496704, + "rewards/rejected": -2.242473602294922, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 3.9315749235474006e-07, + "logits/chosen": -2.910623788833618, + "logits/rejected": -2.9078075885772705, + "logps/chosen": -283.50946044921875, + "logps/rejected": -288.6430358886719, + "loss": 0.4894, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6883010864257812, + "rewards/margins": 1.5665154457092285, + "rewards/rejected": -2.2548165321350098, + "step": 850 + }, + { + "epoch": 0.89, + "learning_rate": 3.912461773700306e-07, + "logits/chosen": -2.881904125213623, + "logits/rejected": -2.8731162548065186, + "logps/chosen": -304.5970458984375, + "logps/rejected": -308.7020263671875, + "loss": 0.5145, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7746303677558899, + "rewards/margins": 1.2819398641586304, + "rewards/rejected": -2.056570291519165, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 3.8933486238532107e-07, + "logits/chosen": -2.7776718139648438, + "logits/rejected": -2.780966281890869, + "logps/chosen": -321.63397216796875, + "logps/rejected": -256.59344482421875, + "loss": 0.4833, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6265315413475037, + "rewards/margins": 1.2452455759048462, + "rewards/rejected": -1.8717771768569946, + "step": 870 + }, + { + "epoch": 0.91, + "learning_rate": 3.874235474006116e-07, + "logits/chosen": -2.8302550315856934, + "logits/rejected": -2.7907934188842773, + "logps/chosen": -295.404541015625, + "logps/rejected": -253.10470581054688, + "loss": 0.498, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7017183303833008, + "rewards/margins": 1.4433740377426147, + "rewards/rejected": -2.145092248916626, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 3.8551223241590214e-07, + "logits/chosen": -2.821937084197998, + "logits/rejected": -2.7967801094055176, + "logps/chosen": -262.8708801269531, + "logps/rejected": -247.0980987548828, + "loss": 0.5049, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6857427358627319, + "rewards/margins": 1.306744933128357, + "rewards/rejected": -1.9924876689910889, + "step": 890 + }, + { + "epoch": 0.93, + "learning_rate": 3.8360091743119267e-07, + "logits/chosen": -2.826765298843384, + "logits/rejected": -2.796332597732544, + "logps/chosen": -267.5680847167969, + "logps/rejected": -244.30905151367188, + "loss": 0.5014, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3354864716529846, + "rewards/margins": 1.4798164367675781, + "rewards/rejected": -1.815303087234497, + "step": 900 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -2.878131628036499, + "eval_logits/rejected": -2.8338723182678223, + "eval_logps/chosen": -298.9295959472656, + "eval_logps/rejected": -261.7012634277344, + "eval_loss": 0.4813868999481201, + "eval_rewards/accuracies": 0.761904776096344, + "eval_rewards/chosen": -0.23969438672065735, + "eval_rewards/margins": 1.4396847486495972, + "eval_rewards/rejected": -1.6793793439865112, + "eval_runtime": 85.2741, + "eval_samples_per_second": 23.454, + "eval_steps_per_second": 0.739, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 3.816896024464832e-07, + "logits/chosen": -2.856722831726074, + "logits/rejected": -2.767521381378174, + "logps/chosen": -239.7253875732422, + "logps/rejected": -228.12472534179688, + "loss": 0.5082, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.3585191071033478, + "rewards/margins": 1.4834765195846558, + "rewards/rejected": -1.8419954776763916, + "step": 910 + }, + { + "epoch": 0.95, + "learning_rate": 3.797782874617737e-07, + "logits/chosen": -2.827901601791382, + "logits/rejected": -2.7887134552001953, + "logps/chosen": -283.5338439941406, + "logps/rejected": -246.73648071289062, + "loss": 0.4744, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3626176714897156, + "rewards/margins": 1.1291471719741821, + "rewards/rejected": -1.491764783859253, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 3.778669724770642e-07, + "logits/chosen": -2.7931838035583496, + "logits/rejected": -2.784943103790283, + "logps/chosen": -264.00958251953125, + "logps/rejected": -217.96646118164062, + "loss": 0.4845, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3267980217933655, + "rewards/margins": 1.5182520151138306, + "rewards/rejected": -1.8450498580932617, + "step": 930 + }, + { + "epoch": 0.97, + "learning_rate": 3.7595565749235474e-07, + "logits/chosen": -2.897451400756836, + "logits/rejected": -2.854799747467041, + "logps/chosen": -289.0576171875, + "logps/rejected": -229.93984985351562, + "loss": 0.4933, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2656581401824951, + "rewards/margins": 1.4493383169174194, + "rewards/rejected": -1.714996576309204, + "step": 940 + }, + { + "epoch": 0.98, + "learning_rate": 3.740443425076452e-07, + "logits/chosen": -2.855194568634033, + "logits/rejected": -2.843090295791626, + "logps/chosen": -285.1914978027344, + "logps/rejected": -263.72344970703125, + "loss": 0.4782, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.46451419591903687, + "rewards/margins": 1.2706360816955566, + "rewards/rejected": -1.7351500988006592, + "step": 950 + }, + { + "epoch": 0.99, + "learning_rate": 3.7213302752293575e-07, + "logits/chosen": -2.8054327964782715, + "logits/rejected": -2.819227695465088, + "logps/chosen": -276.37347412109375, + "logps/rejected": -247.4856719970703, + "loss": 0.474, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.40697455406188965, + "rewards/margins": 1.0086820125579834, + "rewards/rejected": -1.4156566858291626, + "step": 960 + }, + { + "epoch": 1.0, + "learning_rate": 3.702217125382263e-07, + "logits/chosen": -2.800783634185791, + "logits/rejected": -2.7564892768859863, + "logps/chosen": -290.7643127441406, + "logps/rejected": -270.8573303222656, + "loss": 0.4028, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.32486557960510254, + "rewards/margins": 1.6888189315795898, + "rewards/rejected": -2.0136847496032715, + "step": 970 + }, + { + "epoch": 1.01, + "learning_rate": 3.6831039755351677e-07, + "logits/chosen": -2.8075499534606934, + "logits/rejected": -2.798651933670044, + "logps/chosen": -256.40228271484375, + "logps/rejected": -276.94976806640625, + "loss": 0.0877, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9983769655227661, + "rewards/margins": 4.673001289367676, + "rewards/rejected": -3.6746246814727783, + "step": 980 + }, + { + "epoch": 1.02, + "learning_rate": 3.663990825688073e-07, + "logits/chosen": -2.7263360023498535, + "logits/rejected": -2.6973845958709717, + "logps/chosen": -244.35220336914062, + "logps/rejected": -270.0553283691406, + "loss": 0.1026, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.787170946598053, + "rewards/margins": 5.061522006988525, + "rewards/rejected": -4.274351119995117, + "step": 990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6448776758409783e-07, + "logits/chosen": -2.7410221099853516, + "logits/rejected": -2.718764543533325, + "logps/chosen": -276.5079650878906, + "logps/rejected": -291.8220520019531, + "loss": 0.0785, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8010635375976562, + "rewards/margins": 4.786960124969482, + "rewards/rejected": -3.985896348953247, + "step": 1000 + }, + { + "epoch": 1.03, + "eval_logits/chosen": -2.8067638874053955, + "eval_logits/rejected": -2.7560861110687256, + "eval_logps/chosen": -303.0184326171875, + "eval_logps/rejected": -270.12823486328125, + "eval_loss": 0.4820875823497772, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -0.648577868938446, + "eval_rewards/margins": 1.8734972476959229, + "eval_rewards/rejected": -2.5220751762390137, + "eval_runtime": 86.0373, + "eval_samples_per_second": 23.246, + "eval_steps_per_second": 0.732, + "step": 1000 + }, + { + "epoch": 1.04, + "learning_rate": 3.6257645259938836e-07, + "logits/chosen": -2.725844621658325, + "logits/rejected": -2.7488369941711426, + "logps/chosen": -244.24826049804688, + "logps/rejected": -285.93255615234375, + "loss": 0.0766, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5290582776069641, + "rewards/margins": 5.142174243927002, + "rewards/rejected": -4.6131157875061035, + "step": 1010 + }, + { + "epoch": 1.05, + "learning_rate": 3.606651376146789e-07, + "logits/chosen": -2.6446681022644043, + "logits/rejected": -2.60050106048584, + "logps/chosen": -243.2852325439453, + "logps/rejected": -234.6096649169922, + "loss": 0.0837, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5728534460067749, + "rewards/margins": 4.691143989562988, + "rewards/rejected": -4.118290424346924, + "step": 1020 + }, + { + "epoch": 1.06, + "learning_rate": 3.5875382262996937e-07, + "logits/chosen": -2.730254650115967, + "logits/rejected": -2.707766056060791, + "logps/chosen": -269.9300231933594, + "logps/rejected": -323.731689453125, + "loss": 0.0731, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7072321176528931, + "rewards/margins": 4.812115669250488, + "rewards/rejected": -4.104883193969727, + "step": 1030 + }, + { + "epoch": 1.07, + "learning_rate": 3.568425076452599e-07, + "logits/chosen": -2.798232316970825, + "logits/rejected": -2.7622432708740234, + "logps/chosen": -283.02587890625, + "logps/rejected": -288.06927490234375, + "loss": 0.0847, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6590858697891235, + "rewards/margins": 5.309805870056152, + "rewards/rejected": -4.650720119476318, + "step": 1040 + }, + { + "epoch": 1.08, + "learning_rate": 3.5493119266055044e-07, + "logits/chosen": -2.6808838844299316, + "logits/rejected": -2.67130708694458, + "logps/chosen": -262.48748779296875, + "logps/rejected": -240.3964080810547, + "loss": 0.0914, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.24962154030799866, + "rewards/margins": 4.924658298492432, + "rewards/rejected": -4.675037384033203, + "step": 1050 + }, + { + "epoch": 1.09, + "learning_rate": 3.5301987767584097e-07, + "logits/chosen": -2.70487380027771, + "logits/rejected": -2.6635613441467285, + "logps/chosen": -256.9989318847656, + "logps/rejected": -302.34051513671875, + "loss": 0.0807, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.3633723855018616, + "rewards/margins": 5.548757553100586, + "rewards/rejected": -5.185385704040527, + "step": 1060 + }, + { + "epoch": 1.1, + "learning_rate": 3.511085626911315e-07, + "logits/chosen": -2.808043956756592, + "logits/rejected": -2.8087692260742188, + "logps/chosen": -323.17535400390625, + "logps/rejected": -289.61834716796875, + "loss": 0.1467, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.2529454827308655, + "rewards/margins": 5.11926794052124, + "rewards/rejected": -4.8663225173950195, + "step": 1070 + }, + { + "epoch": 1.11, + "learning_rate": 3.49197247706422e-07, + "logits/chosen": -2.775644302368164, + "logits/rejected": -2.7150864601135254, + "logps/chosen": -226.1105194091797, + "logps/rejected": -252.4014434814453, + "loss": 0.1123, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5964186787605286, + "rewards/margins": 4.527159690856934, + "rewards/rejected": -5.123579025268555, + "step": 1080 + }, + { + "epoch": 1.12, + "learning_rate": 3.472859327217125e-07, + "logits/chosen": -2.668936014175415, + "logits/rejected": -2.6859848499298096, + "logps/chosen": -291.4830627441406, + "logps/rejected": -304.6649475097656, + "loss": 0.0945, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.12123675644397736, + "rewards/margins": 5.1819868087768555, + "rewards/rejected": -5.303224086761475, + "step": 1090 + }, + { + "epoch": 1.14, + "learning_rate": 3.4537461773700304e-07, + "logits/chosen": -2.767573595046997, + "logits/rejected": -2.67921781539917, + "logps/chosen": -213.55044555664062, + "logps/rejected": -232.8217315673828, + "loss": 0.0883, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.168074369430542, + "rewards/margins": 4.9372406005859375, + "rewards/rejected": -5.105315208435059, + "step": 1100 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -2.8337366580963135, + "eval_logits/rejected": -2.7830586433410645, + "eval_logps/chosen": -309.7097473144531, + "eval_logps/rejected": -278.2621154785156, + "eval_loss": 0.5074188709259033, + "eval_rewards/accuracies": 0.7539682388305664, + "eval_rewards/chosen": -1.317708969116211, + "eval_rewards/margins": 2.017754077911377, + "eval_rewards/rejected": -3.335462808609009, + "eval_runtime": 84.7751, + "eval_samples_per_second": 23.592, + "eval_steps_per_second": 0.743, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 3.434633027522936e-07, + "logits/chosen": -2.7691617012023926, + "logits/rejected": -2.7546744346618652, + "logps/chosen": -278.5716857910156, + "logps/rejected": -283.21502685546875, + "loss": 0.0887, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.48537954688072205, + "rewards/margins": 5.524861812591553, + "rewards/rejected": -5.039482593536377, + "step": 1110 + }, + { + "epoch": 1.16, + "learning_rate": 3.415519877675841e-07, + "logits/chosen": -2.7980878353118896, + "logits/rejected": -2.808043956756592, + "logps/chosen": -240.988525390625, + "logps/rejected": -296.6959228515625, + "loss": 0.0991, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.005442300345748663, + "rewards/margins": 4.918468952178955, + "rewards/rejected": -4.923911094665527, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 3.3964067278287464e-07, + "logits/chosen": -2.8718693256378174, + "logits/rejected": -2.8258144855499268, + "logps/chosen": -284.6709289550781, + "logps/rejected": -264.33795166015625, + "loss": 0.0988, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2022373229265213, + "rewards/margins": 5.098201274871826, + "rewards/rejected": -4.895963668823242, + "step": 1130 + }, + { + "epoch": 1.18, + "learning_rate": 3.377293577981651e-07, + "logits/chosen": -2.737651824951172, + "logits/rejected": -2.75420880317688, + "logps/chosen": -230.8458251953125, + "logps/rejected": -276.8215637207031, + "loss": 0.0859, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.16446110606193542, + "rewards/margins": 5.473117828369141, + "rewards/rejected": -5.308656215667725, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 3.3581804281345565e-07, + "logits/chosen": -2.8187365531921387, + "logits/rejected": -2.750450611114502, + "logps/chosen": -287.13604736328125, + "logps/rejected": -268.45318603515625, + "loss": 0.078, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4848671853542328, + "rewards/margins": 5.562216758728027, + "rewards/rejected": -5.0773491859436035, + "step": 1150 + }, + { + "epoch": 1.2, + "learning_rate": 3.339067278287462e-07, + "logits/chosen": -2.625655174255371, + "logits/rejected": -2.6164321899414062, + "logps/chosen": -253.10617065429688, + "logps/rejected": -279.876220703125, + "loss": 0.0656, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3776094317436218, + "rewards/margins": 5.922922134399414, + "rewards/rejected": -5.545313358306885, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 3.319954128440367e-07, + "logits/chosen": -2.8032162189483643, + "logits/rejected": -2.757336139678955, + "logps/chosen": -283.8640441894531, + "logps/rejected": -268.12109375, + "loss": 0.0696, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.19224987924098969, + "rewards/margins": 5.265924453735352, + "rewards/rejected": -5.07367467880249, + "step": 1170 + }, + { + "epoch": 1.22, + "learning_rate": 3.3008409785932725e-07, + "logits/chosen": -2.699878215789795, + "logits/rejected": -2.6233882904052734, + "logps/chosen": -251.9101104736328, + "logps/rejected": -270.0425720214844, + "loss": 0.0769, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0026382445357739925, + "rewards/margins": 5.298295497894287, + "rewards/rejected": -5.295657157897949, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 3.2817278287461773e-07, + "logits/chosen": -2.767627716064453, + "logits/rejected": -2.7735066413879395, + "logps/chosen": -235.3647003173828, + "logps/rejected": -294.1983337402344, + "loss": 0.083, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.19832327961921692, + "rewards/margins": 5.44091272354126, + "rewards/rejected": -5.242589473724365, + "step": 1190 + }, + { + "epoch": 1.24, + "learning_rate": 3.262614678899082e-07, + "logits/chosen": -2.7307627201080322, + "logits/rejected": -2.7215757369995117, + "logps/chosen": -251.027099609375, + "logps/rejected": -315.58026123046875, + "loss": 0.086, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.10485126823186874, + "rewards/margins": 5.883278846740723, + "rewards/rejected": -5.988129615783691, + "step": 1200 + }, + { + "epoch": 1.24, + "eval_logits/chosen": -2.834700345993042, + "eval_logits/rejected": -2.787626266479492, + "eval_logps/chosen": -307.7827453613281, + "eval_logps/rejected": -277.5297546386719, + "eval_loss": 0.5000655651092529, + "eval_rewards/accuracies": 0.7539682388305664, + "eval_rewards/chosen": -1.1250085830688477, + "eval_rewards/margins": 2.137223720550537, + "eval_rewards/rejected": -3.262232542037964, + "eval_runtime": 85.7069, + "eval_samples_per_second": 23.335, + "eval_steps_per_second": 0.735, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 3.2435015290519874e-07, + "logits/chosen": -2.7468724250793457, + "logits/rejected": -2.7094569206237793, + "logps/chosen": -232.368896484375, + "logps/rejected": -263.92901611328125, + "loss": 0.0689, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06489093601703644, + "rewards/margins": 5.3409833908081055, + "rewards/rejected": -5.405873775482178, + "step": 1210 + }, + { + "epoch": 1.26, + "learning_rate": 3.2243883792048927e-07, + "logits/chosen": -2.7789623737335205, + "logits/rejected": -2.772648811340332, + "logps/chosen": -290.0833435058594, + "logps/rejected": -341.7529602050781, + "loss": 0.0932, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.19088497757911682, + "rewards/margins": 5.593591213226318, + "rewards/rejected": -5.402706146240234, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 3.205275229357798e-07, + "logits/chosen": -2.7190937995910645, + "logits/rejected": -2.684502124786377, + "logps/chosen": -299.87225341796875, + "logps/rejected": -335.17327880859375, + "loss": 0.095, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.03153310716152191, + "rewards/margins": 5.8492231369018555, + "rewards/rejected": -5.880755424499512, + "step": 1230 + }, + { + "epoch": 1.28, + "learning_rate": 3.186162079510703e-07, + "logits/chosen": -2.7524490356445312, + "logits/rejected": -2.7430176734924316, + "logps/chosen": -285.627197265625, + "logps/rejected": -323.4786071777344, + "loss": 0.0928, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02745187282562256, + "rewards/margins": 5.191933631896973, + "rewards/rejected": -5.1644816398620605, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 3.167048929663608e-07, + "logits/chosen": -2.788243293762207, + "logits/rejected": -2.7167086601257324, + "logps/chosen": -236.2114715576172, + "logps/rejected": -248.34506225585938, + "loss": 0.0924, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3661441206932068, + "rewards/margins": 4.600642204284668, + "rewards/rejected": -4.9667863845825195, + "step": 1250 + }, + { + "epoch": 1.3, + "learning_rate": 3.1479357798165134e-07, + "logits/chosen": -2.8230297565460205, + "logits/rejected": -2.785630464553833, + "logps/chosen": -293.46527099609375, + "logps/rejected": -290.85736083984375, + "loss": 0.0949, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.026639381423592567, + "rewards/margins": 5.191295623779297, + "rewards/rejected": -5.164656639099121, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 3.128822629969419e-07, + "logits/chosen": -2.7404916286468506, + "logits/rejected": -2.7222659587860107, + "logps/chosen": -289.47161865234375, + "logps/rejected": -309.0167541503906, + "loss": 0.0909, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.054047178477048874, + "rewards/margins": 6.036001682281494, + "rewards/rejected": -6.090048789978027, + "step": 1270 + }, + { + "epoch": 1.32, + "learning_rate": 3.109709480122324e-07, + "logits/chosen": -2.74894380569458, + "logits/rejected": -2.7412331104278564, + "logps/chosen": -240.25439453125, + "logps/rejected": -276.5160827636719, + "loss": 0.0935, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3607255518436432, + "rewards/margins": 5.7335524559021, + "rewards/rejected": -5.372827053070068, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 3.0905963302752294e-07, + "logits/chosen": -2.625156879425049, + "logits/rejected": -2.6567492485046387, + "logps/chosen": -246.95443725585938, + "logps/rejected": -286.4045104980469, + "loss": 0.0966, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.23297014832496643, + "rewards/margins": 5.24931001663208, + "rewards/rejected": -5.482280254364014, + "step": 1290 + }, + { + "epoch": 1.34, + "learning_rate": 3.071483180428134e-07, + "logits/chosen": -2.8050503730773926, + "logits/rejected": -2.7595503330230713, + "logps/chosen": -328.2964782714844, + "logps/rejected": -270.40875244140625, + "loss": 0.0919, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.3086862564086914, + "rewards/margins": 5.241825103759766, + "rewards/rejected": -4.933138847351074, + "step": 1300 + }, + { + "epoch": 1.34, + "eval_logits/chosen": -2.8075647354125977, + "eval_logits/rejected": -2.766228675842285, + "eval_logps/chosen": -310.404541015625, + "eval_logps/rejected": -280.4382629394531, + "eval_loss": 0.5053635835647583, + "eval_rewards/accuracies": 0.8015872836112976, + "eval_rewards/chosen": -1.3871887922286987, + "eval_rewards/margins": 2.1658928394317627, + "eval_rewards/rejected": -3.55308198928833, + "eval_runtime": 84.9554, + "eval_samples_per_second": 23.542, + "eval_steps_per_second": 0.742, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 3.0523700305810395e-07, + "logits/chosen": -2.7835230827331543, + "logits/rejected": -2.7681832313537598, + "logps/chosen": -272.99322509765625, + "logps/rejected": -310.72442626953125, + "loss": 0.1015, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.010780900716781616, + "rewards/margins": 5.337492942810059, + "rewards/rejected": -5.348274230957031, + "step": 1310 + }, + { + "epoch": 1.36, + "learning_rate": 3.033256880733945e-07, + "logits/chosen": -2.7876617908477783, + "logits/rejected": -2.765178918838501, + "logps/chosen": -248.750244140625, + "logps/rejected": -260.992919921875, + "loss": 0.0854, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5619943737983704, + "rewards/margins": 4.9182329177856445, + "rewards/rejected": -5.480227470397949, + "step": 1320 + }, + { + "epoch": 1.37, + "learning_rate": 3.01414373088685e-07, + "logits/chosen": -2.791926622390747, + "logits/rejected": -2.6914966106414795, + "logps/chosen": -287.69439697265625, + "logps/rejected": -290.97320556640625, + "loss": 0.0953, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.2892554998397827, + "rewards/margins": 5.625554084777832, + "rewards/rejected": -5.914809226989746, + "step": 1330 + }, + { + "epoch": 1.38, + "learning_rate": 2.9950305810397555e-07, + "logits/chosen": -2.842695951461792, + "logits/rejected": -2.8599178791046143, + "logps/chosen": -283.12359619140625, + "logps/rejected": -292.62762451171875, + "loss": 0.1007, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.5628482103347778, + "rewards/margins": 6.214916229248047, + "rewards/rejected": -6.777764320373535, + "step": 1340 + }, + { + "epoch": 1.39, + "learning_rate": 2.9759174311926603e-07, + "logits/chosen": -2.841775417327881, + "logits/rejected": -2.7714896202087402, + "logps/chosen": -273.721435546875, + "logps/rejected": -287.3270263671875, + "loss": 0.0833, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3608776926994324, + "rewards/margins": 5.559767246246338, + "rewards/rejected": -5.920644283294678, + "step": 1350 + }, + { + "epoch": 1.4, + "learning_rate": 2.9568042813455656e-07, + "logits/chosen": -2.7543766498565674, + "logits/rejected": -2.696719169616699, + "logps/chosen": -286.90130615234375, + "logps/rejected": -254.12673950195312, + "loss": 0.1065, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6098439693450928, + "rewards/margins": 4.60134744644165, + "rewards/rejected": -5.2111921310424805, + "step": 1360 + }, + { + "epoch": 1.41, + "learning_rate": 2.937691131498471e-07, + "logits/chosen": -2.839272975921631, + "logits/rejected": -2.737689971923828, + "logps/chosen": -292.3916931152344, + "logps/rejected": -307.9238586425781, + "loss": 0.091, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.15980760753154755, + "rewards/margins": 6.0921101570129395, + "rewards/rejected": -6.251918315887451, + "step": 1370 + }, + { + "epoch": 1.42, + "learning_rate": 2.918577981651376e-07, + "logits/chosen": -2.7997069358825684, + "logits/rejected": -2.779470205307007, + "logps/chosen": -241.4885711669922, + "logps/rejected": -284.999267578125, + "loss": 0.0908, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.28731173276901245, + "rewards/margins": 5.128489017486572, + "rewards/rejected": -5.415801048278809, + "step": 1380 + }, + { + "epoch": 1.43, + "learning_rate": 2.8994648318042816e-07, + "logits/chosen": -2.825129985809326, + "logits/rejected": -2.843670129776001, + "logps/chosen": -274.7068176269531, + "logps/rejected": -274.2774353027344, + "loss": 0.0852, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.1482582539319992, + "rewards/margins": 5.11671257019043, + "rewards/rejected": -5.264970779418945, + "step": 1390 + }, + { + "epoch": 1.44, + "learning_rate": 2.8803516819571863e-07, + "logits/chosen": -2.8828132152557373, + "logits/rejected": -2.855530261993408, + "logps/chosen": -315.52923583984375, + "logps/rejected": -307.06707763671875, + "loss": 0.105, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.19713237881660461, + "rewards/margins": 5.789515495300293, + "rewards/rejected": -5.59238338470459, + "step": 1400 + }, + { + "epoch": 1.44, + "eval_logits/chosen": -2.829054117202759, + "eval_logits/rejected": -2.787668228149414, + "eval_logps/chosen": -311.6723327636719, + "eval_logps/rejected": -281.1881103515625, + "eval_loss": 0.5085237622261047, + "eval_rewards/accuracies": 0.7817460298538208, + "eval_rewards/chosen": -1.5139707326889038, + "eval_rewards/margins": 2.1140964031219482, + "eval_rewards/rejected": -3.6280672550201416, + "eval_runtime": 84.6223, + "eval_samples_per_second": 23.634, + "eval_steps_per_second": 0.744, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 2.8612385321100917e-07, + "logits/chosen": -2.6863460540771484, + "logits/rejected": -2.7248947620391846, + "logps/chosen": -237.7042694091797, + "logps/rejected": -293.4941101074219, + "loss": 0.096, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.03336421772837639, + "rewards/margins": 5.846810340881348, + "rewards/rejected": -5.813446044921875, + "step": 1410 + }, + { + "epoch": 1.47, + "learning_rate": 2.842125382262997e-07, + "logits/chosen": -2.7969777584075928, + "logits/rejected": -2.7567477226257324, + "logps/chosen": -258.79718017578125, + "logps/rejected": -280.1507263183594, + "loss": 0.0844, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.021525543183088303, + "rewards/margins": 5.631020545959473, + "rewards/rejected": -5.652545928955078, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 2.8230122324159023e-07, + "logits/chosen": -2.7350478172302246, + "logits/rejected": -2.7465858459472656, + "logps/chosen": -296.23486328125, + "logps/rejected": -338.95587158203125, + "loss": 0.0885, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.08497120440006256, + "rewards/margins": 5.93708610534668, + "rewards/rejected": -6.022058486938477, + "step": 1430 + }, + { + "epoch": 1.49, + "learning_rate": 2.8038990825688076e-07, + "logits/chosen": -2.8355910778045654, + "logits/rejected": -2.7613484859466553, + "logps/chosen": -230.56863403320312, + "logps/rejected": -241.2537078857422, + "loss": 0.0959, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.43812817335128784, + "rewards/margins": 4.630409240722656, + "rewards/rejected": -5.068537712097168, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 2.784785932721712e-07, + "logits/chosen": -2.7863059043884277, + "logits/rejected": -2.7627620697021484, + "logps/chosen": -269.6027526855469, + "logps/rejected": -288.1315002441406, + "loss": 0.1224, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.12829892337322235, + "rewards/margins": 6.043244361877441, + "rewards/rejected": -5.914945125579834, + "step": 1450 + }, + { + "epoch": 1.51, + "learning_rate": 2.765672782874617e-07, + "logits/chosen": -2.7619357109069824, + "logits/rejected": -2.7393484115600586, + "logps/chosen": -282.9959716796875, + "logps/rejected": -261.0349426269531, + "loss": 0.0842, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.06687188148498535, + "rewards/margins": 5.253376007080078, + "rewards/rejected": -5.186504364013672, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 2.7465596330275225e-07, + "logits/chosen": -2.807286500930786, + "logits/rejected": -2.7439327239990234, + "logps/chosen": -281.26104736328125, + "logps/rejected": -242.0012664794922, + "loss": 0.0905, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.22755661606788635, + "rewards/margins": 5.756918430328369, + "rewards/rejected": -5.9844746589660645, + "step": 1470 + }, + { + "epoch": 1.53, + "learning_rate": 2.727446483180428e-07, + "logits/chosen": -2.7947564125061035, + "logits/rejected": -2.7915396690368652, + "logps/chosen": -266.6559143066406, + "logps/rejected": -293.7073669433594, + "loss": 0.0741, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4066391587257385, + "rewards/margins": 5.554805278778076, + "rewards/rejected": -5.96144437789917, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 2.708333333333333e-07, + "logits/chosen": -2.6909992694854736, + "logits/rejected": -2.635457992553711, + "logps/chosen": -259.1954345703125, + "logps/rejected": -297.5155944824219, + "loss": 0.0782, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6428125500679016, + "rewards/margins": 5.484756946563721, + "rewards/rejected": -6.127569198608398, + "step": 1490 + }, + { + "epoch": 1.55, + "learning_rate": 2.6892201834862385e-07, + "logits/chosen": -2.8311927318573, + "logits/rejected": -2.7327089309692383, + "logps/chosen": -266.77490234375, + "logps/rejected": -270.7399597167969, + "loss": 0.0714, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.03750241920351982, + "rewards/margins": 5.840561866760254, + "rewards/rejected": -5.878064155578613, + "step": 1500 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -2.833139181137085, + "eval_logits/rejected": -2.7888331413269043, + "eval_logps/chosen": -315.17449951171875, + "eval_logps/rejected": -285.445068359375, + "eval_loss": 0.5215712785720825, + "eval_rewards/accuracies": 0.7460317611694336, + "eval_rewards/chosen": -1.8641860485076904, + "eval_rewards/margins": 2.1895742416381836, + "eval_rewards/rejected": -4.053760528564453, + "eval_runtime": 85.2599, + "eval_samples_per_second": 23.458, + "eval_steps_per_second": 0.739, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 2.6701070336391433e-07, + "logits/chosen": -2.792778253555298, + "logits/rejected": -2.752930164337158, + "logps/chosen": -321.5592956542969, + "logps/rejected": -298.44171142578125, + "loss": 0.0751, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.13722172379493713, + "rewards/margins": 5.923539161682129, + "rewards/rejected": -5.786317825317383, + "step": 1510 + }, + { + "epoch": 1.57, + "learning_rate": 2.6509938837920486e-07, + "logits/chosen": -2.699197769165039, + "logits/rejected": -2.7083370685577393, + "logps/chosen": -236.6324005126953, + "logps/rejected": -276.34002685546875, + "loss": 0.0918, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.2362903654575348, + "rewards/margins": 5.470263481140137, + "rewards/rejected": -5.233973026275635, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 2.631880733944954e-07, + "logits/chosen": -2.8059258460998535, + "logits/rejected": -2.814614772796631, + "logps/chosen": -260.8307800292969, + "logps/rejected": -273.9322204589844, + "loss": 0.0795, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.10708916187286377, + "rewards/margins": 5.207016468048096, + "rewards/rejected": -5.314105033874512, + "step": 1530 + }, + { + "epoch": 1.59, + "learning_rate": 2.612767584097859e-07, + "logits/chosen": -2.8370559215545654, + "logits/rejected": -2.8082430362701416, + "logps/chosen": -265.97998046875, + "logps/rejected": -253.841064453125, + "loss": 0.0958, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06110060214996338, + "rewards/margins": 4.913632392883301, + "rewards/rejected": -4.852531909942627, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 2.5936544342507646e-07, + "logits/chosen": -2.792498826980591, + "logits/rejected": -2.776141405105591, + "logps/chosen": -269.06744384765625, + "logps/rejected": -290.98565673828125, + "loss": 0.1037, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.20791736245155334, + "rewards/margins": 5.7748517990112305, + "rewards/rejected": -5.5669355392456055, + "step": 1550 + }, + { + "epoch": 1.61, + "learning_rate": 2.5745412844036693e-07, + "logits/chosen": -2.7609810829162598, + "logits/rejected": -2.7718801498413086, + "logps/chosen": -291.72613525390625, + "logps/rejected": -273.72003173828125, + "loss": 0.0868, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20280487835407257, + "rewards/margins": 5.068221092224121, + "rewards/rejected": -5.271025657653809, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 2.5554281345565747e-07, + "logits/chosen": -2.8320767879486084, + "logits/rejected": -2.772150993347168, + "logps/chosen": -267.63543701171875, + "logps/rejected": -300.33319091796875, + "loss": 0.0819, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3804650604724884, + "rewards/margins": 5.666358470916748, + "rewards/rejected": -6.046823024749756, + "step": 1570 + }, + { + "epoch": 1.63, + "learning_rate": 2.53631498470948e-07, + "logits/chosen": -2.8861734867095947, + "logits/rejected": -2.8094377517700195, + "logps/chosen": -290.4794006347656, + "logps/rejected": -290.64398193359375, + "loss": 0.0779, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.03389785438776016, + "rewards/margins": 5.959391117095947, + "rewards/rejected": -5.925492763519287, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 2.5172018348623853e-07, + "logits/chosen": -2.791463613510132, + "logits/rejected": -2.7679646015167236, + "logps/chosen": -296.26373291015625, + "logps/rejected": -297.009521484375, + "loss": 0.0821, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.11777625232934952, + "rewards/margins": 5.88539457321167, + "rewards/rejected": -6.003170967102051, + "step": 1590 + }, + { + "epoch": 1.65, + "learning_rate": 2.4980886850152906e-07, + "logits/chosen": -2.8138768672943115, + "logits/rejected": -2.6688551902770996, + "logps/chosen": -286.35675048828125, + "logps/rejected": -278.78131103515625, + "loss": 0.0874, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.002650848124176264, + "rewards/margins": 5.6529927253723145, + "rewards/rejected": -5.6503424644470215, + "step": 1600 + }, + { + "epoch": 1.65, + "eval_logits/chosen": -2.8315298557281494, + "eval_logits/rejected": -2.775137424468994, + "eval_logps/chosen": -311.60955810546875, + "eval_logps/rejected": -282.18365478515625, + "eval_loss": 0.5050424337387085, + "eval_rewards/accuracies": 0.7420634627342224, + "eval_rewards/chosen": -1.5076885223388672, + "eval_rewards/margins": 2.2199289798736572, + "eval_rewards/rejected": -3.7276177406311035, + "eval_runtime": 84.8937, + "eval_samples_per_second": 23.559, + "eval_steps_per_second": 0.742, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 2.478975535168196e-07, + "logits/chosen": -2.8036742210388184, + "logits/rejected": -2.7059030532836914, + "logps/chosen": -268.060791015625, + "logps/rejected": -258.54937744140625, + "loss": 0.086, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1585809290409088, + "rewards/margins": 4.931711673736572, + "rewards/rejected": -5.090291976928711, + "step": 1610 + }, + { + "epoch": 1.67, + "learning_rate": 2.459862385321101e-07, + "logits/chosen": -2.8540966510772705, + "logits/rejected": -2.816868782043457, + "logps/chosen": -315.4597473144531, + "logps/rejected": -307.907958984375, + "loss": 0.0887, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.18126711249351501, + "rewards/margins": 6.035513877868652, + "rewards/rejected": -5.854246616363525, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 2.440749235474006e-07, + "logits/chosen": -2.788999080657959, + "logits/rejected": -2.7234809398651123, + "logps/chosen": -292.61163330078125, + "logps/rejected": -296.74871826171875, + "loss": 0.0821, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.10972774028778076, + "rewards/margins": 6.098582744598389, + "rewards/rejected": -5.988855361938477, + "step": 1630 + }, + { + "epoch": 1.69, + "learning_rate": 2.421636085626911e-07, + "logits/chosen": -2.6809146404266357, + "logits/rejected": -2.6293673515319824, + "logps/chosen": -264.6792297363281, + "logps/rejected": -270.60107421875, + "loss": 0.0705, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5329539179801941, + "rewards/margins": 6.360047340393066, + "rewards/rejected": -5.827093601226807, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 2.402522935779816e-07, + "logits/chosen": -2.7440340518951416, + "logits/rejected": -2.7100114822387695, + "logps/chosen": -293.00799560546875, + "logps/rejected": -320.46685791015625, + "loss": 0.0739, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.029893267899751663, + "rewards/margins": 6.438266754150391, + "rewards/rejected": -6.4083733558654785, + "step": 1650 + }, + { + "epoch": 1.71, + "learning_rate": 2.3834097859327215e-07, + "logits/chosen": -2.7580788135528564, + "logits/rejected": -2.7617766857147217, + "logps/chosen": -245.4547119140625, + "logps/rejected": -284.69964599609375, + "loss": 0.0942, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.2982054352760315, + "rewards/margins": 5.770577430725098, + "rewards/rejected": -6.068782329559326, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 2.3642966360856268e-07, + "logits/chosen": -2.8160476684570312, + "logits/rejected": -2.745145797729492, + "logps/chosen": -277.74945068359375, + "logps/rejected": -277.59478759765625, + "loss": 0.0911, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02770283818244934, + "rewards/margins": 5.381051063537598, + "rewards/rejected": -5.3533477783203125, + "step": 1670 + }, + { + "epoch": 1.73, + "learning_rate": 2.345183486238532e-07, + "logits/chosen": -2.6783447265625, + "logits/rejected": -2.657252788543701, + "logps/chosen": -262.15625, + "logps/rejected": -323.332763671875, + "loss": 0.077, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.23324215412139893, + "rewards/margins": 5.4570722579956055, + "rewards/rejected": -5.690314769744873, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 2.3260703363914372e-07, + "logits/chosen": -2.7587218284606934, + "logits/rejected": -2.762028455734253, + "logps/chosen": -313.23101806640625, + "logps/rejected": -298.82940673828125, + "loss": 0.0921, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3798483610153198, + "rewards/margins": 5.513558864593506, + "rewards/rejected": -5.1337103843688965, + "step": 1690 + }, + { + "epoch": 1.75, + "learning_rate": 2.3069571865443425e-07, + "logits/chosen": -2.797136068344116, + "logits/rejected": -2.7841556072235107, + "logps/chosen": -239.9384765625, + "logps/rejected": -299.2468566894531, + "loss": 0.063, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.4451209008693695, + "rewards/margins": 5.984606742858887, + "rewards/rejected": -6.429728031158447, + "step": 1700 + }, + { + "epoch": 1.75, + "eval_logits/chosen": -2.805445432662964, + "eval_logits/rejected": -2.7470343112945557, + "eval_logps/chosen": -315.9737854003906, + "eval_logps/rejected": -289.32904052734375, + "eval_loss": 0.5349620580673218, + "eval_rewards/accuracies": 0.7857142686843872, + "eval_rewards/chosen": -1.944112777709961, + "eval_rewards/margins": 2.4980452060699463, + "eval_rewards/rejected": -4.442158222198486, + "eval_runtime": 85.6472, + "eval_samples_per_second": 23.352, + "eval_steps_per_second": 0.736, + "step": 1700 + }, + { + "epoch": 1.76, + "learning_rate": 2.2878440366972476e-07, + "logits/chosen": -2.8598368167877197, + "logits/rejected": -2.8258631229400635, + "logps/chosen": -285.8379821777344, + "logps/rejected": -356.3085021972656, + "loss": 0.0798, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6007496118545532, + "rewards/margins": 6.229714393615723, + "rewards/rejected": -6.8304643630981445, + "step": 1710 + }, + { + "epoch": 1.78, + "learning_rate": 2.268730886850153e-07, + "logits/chosen": -2.6834561824798584, + "logits/rejected": -2.7135403156280518, + "logps/chosen": -272.9288024902344, + "logps/rejected": -275.1758117675781, + "loss": 0.0757, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.0667431578040123, + "rewards/margins": 5.4009833335876465, + "rewards/rejected": -5.467726230621338, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 2.249617737003058e-07, + "logits/chosen": -2.7773818969726562, + "logits/rejected": -2.7113940715789795, + "logps/chosen": -297.2420349121094, + "logps/rejected": -332.0787048339844, + "loss": 0.0993, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.06141236424446106, + "rewards/margins": 6.285152912139893, + "rewards/rejected": -6.223740577697754, + "step": 1730 + }, + { + "epoch": 1.8, + "learning_rate": 2.2305045871559633e-07, + "logits/chosen": -2.776603937149048, + "logits/rejected": -2.6816258430480957, + "logps/chosen": -284.12457275390625, + "logps/rejected": -295.02203369140625, + "loss": 0.0807, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.05037654563784599, + "rewards/margins": 5.701298713684082, + "rewards/rejected": -5.751675128936768, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 2.2113914373088686e-07, + "logits/chosen": -2.8667399883270264, + "logits/rejected": -2.844064235687256, + "logps/chosen": -312.84820556640625, + "logps/rejected": -323.13372802734375, + "loss": 0.0877, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.20965537428855896, + "rewards/margins": 5.274796485900879, + "rewards/rejected": -5.484452247619629, + "step": 1750 + }, + { + "epoch": 1.82, + "learning_rate": 2.1922782874617736e-07, + "logits/chosen": -2.7705488204956055, + "logits/rejected": -2.794954776763916, + "logps/chosen": -237.9602813720703, + "logps/rejected": -337.15008544921875, + "loss": 0.0808, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3575199246406555, + "rewards/margins": 6.5947370529174805, + "rewards/rejected": -6.9522576332092285, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 2.1731651376146787e-07, + "logits/chosen": -2.721431255340576, + "logits/rejected": -2.7328743934631348, + "logps/chosen": -238.32852172851562, + "logps/rejected": -295.3879089355469, + "loss": 0.0828, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5729081630706787, + "rewards/margins": 5.2664055824279785, + "rewards/rejected": -5.83931303024292, + "step": 1770 + }, + { + "epoch": 1.84, + "learning_rate": 2.154051987767584e-07, + "logits/chosen": -2.788494825363159, + "logits/rejected": -2.7784392833709717, + "logps/chosen": -275.90283203125, + "logps/rejected": -276.9033508300781, + "loss": 0.0768, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5950279235839844, + "rewards/margins": 5.067400932312012, + "rewards/rejected": -5.662428855895996, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 2.134938837920489e-07, + "logits/chosen": -2.7264552116394043, + "logits/rejected": -2.6988372802734375, + "logps/chosen": -206.7015380859375, + "logps/rejected": -257.45721435546875, + "loss": 0.0542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9315367937088013, + "rewards/margins": 5.3646650314331055, + "rewards/rejected": -6.296202659606934, + "step": 1790 + }, + { + "epoch": 1.86, + "learning_rate": 2.1158256880733944e-07, + "logits/chosen": -2.823498487472534, + "logits/rejected": -2.7179205417633057, + "logps/chosen": -311.6935729980469, + "logps/rejected": -289.9866943359375, + "loss": 0.0786, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7751389741897583, + "rewards/margins": 5.937049388885498, + "rewards/rejected": -6.712187767028809, + "step": 1800 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -2.812049627304077, + "eval_logits/rejected": -2.7543678283691406, + "eval_logps/chosen": -316.8768615722656, + "eval_logps/rejected": -289.1434020996094, + "eval_loss": 0.5376091599464417, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": -2.0344181060791016, + "eval_rewards/margins": 2.389177083969116, + "eval_rewards/rejected": -4.4235944747924805, + "eval_runtime": 85.2124, + "eval_samples_per_second": 23.471, + "eval_steps_per_second": 0.739, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 2.0967125382262994e-07, + "logits/chosen": -2.795349597930908, + "logits/rejected": -2.711717128753662, + "logps/chosen": -277.3824157714844, + "logps/rejected": -291.24517822265625, + "loss": 0.0868, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18659140169620514, + "rewards/margins": 6.222006320953369, + "rewards/rejected": -6.408597469329834, + "step": 1810 + }, + { + "epoch": 1.88, + "learning_rate": 2.0775993883792048e-07, + "logits/chosen": -2.792492628097534, + "logits/rejected": -2.6954171657562256, + "logps/chosen": -289.41339111328125, + "logps/rejected": -272.28094482421875, + "loss": 0.0782, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.2770845293998718, + "rewards/margins": 5.3945207595825195, + "rewards/rejected": -5.671605110168457, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 2.05848623853211e-07, + "logits/chosen": -2.863590717315674, + "logits/rejected": -2.73067045211792, + "logps/chosen": -299.4620666503906, + "logps/rejected": -290.4524841308594, + "loss": 0.0845, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.15114930272102356, + "rewards/margins": 6.369837284088135, + "rewards/rejected": -6.218688011169434, + "step": 1830 + }, + { + "epoch": 1.9, + "learning_rate": 2.0393730886850151e-07, + "logits/chosen": -2.8121609687805176, + "logits/rejected": -2.760335922241211, + "logps/chosen": -302.7115478515625, + "logps/rejected": -290.25579833984375, + "loss": 0.0758, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6565725803375244, + "rewards/margins": 5.57368803024292, + "rewards/rejected": -6.230259895324707, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 2.0202599388379205e-07, + "logits/chosen": -2.8598790168762207, + "logits/rejected": -2.8427786827087402, + "logps/chosen": -284.7428283691406, + "logps/rejected": -347.52874755859375, + "loss": 0.0846, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6115126609802246, + "rewards/margins": 5.790914058685303, + "rewards/rejected": -6.402426242828369, + "step": 1850 + }, + { + "epoch": 1.92, + "learning_rate": 2.0011467889908258e-07, + "logits/chosen": -2.843400716781616, + "logits/rejected": -2.7829298973083496, + "logps/chosen": -307.7172546386719, + "logps/rejected": -240.32095336914062, + "loss": 0.0842, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.43064993619918823, + "rewards/margins": 5.67532205581665, + "rewards/rejected": -6.1059722900390625, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 1.9820336391437308e-07, + "logits/chosen": -2.8097338676452637, + "logits/rejected": -2.7943801879882812, + "logps/chosen": -259.96142578125, + "logps/rejected": -280.31768798828125, + "loss": 0.0766, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.47454309463500977, + "rewards/margins": 6.007216453552246, + "rewards/rejected": -6.481759548187256, + "step": 1870 + }, + { + "epoch": 1.94, + "learning_rate": 1.9629204892966362e-07, + "logits/chosen": -2.9081008434295654, + "logits/rejected": -2.850368022918701, + "logps/chosen": -285.292236328125, + "logps/rejected": -275.22113037109375, + "loss": 0.0707, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.27541738748550415, + "rewards/margins": 5.751465797424316, + "rewards/rejected": -6.026882648468018, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 1.943807339449541e-07, + "logits/chosen": -2.783125400543213, + "logits/rejected": -2.7831761837005615, + "logps/chosen": -270.49285888671875, + "logps/rejected": -308.9680480957031, + "loss": 0.0781, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5280567407608032, + "rewards/margins": 5.9228010177612305, + "rewards/rejected": -6.450858116149902, + "step": 1890 + }, + { + "epoch": 1.96, + "learning_rate": 1.9246941896024463e-07, + "logits/chosen": -2.9015605449676514, + "logits/rejected": -2.908259630203247, + "logps/chosen": -266.2379455566406, + "logps/rejected": -324.3561706542969, + "loss": 0.1117, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6570742726325989, + "rewards/margins": 6.570813179016113, + "rewards/rejected": -7.227886199951172, + "step": 1900 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -2.8857781887054443, + "eval_logits/rejected": -2.836524486541748, + "eval_logps/chosen": -315.7684020996094, + "eval_logps/rejected": -285.27667236328125, + "eval_loss": 0.5334831476211548, + "eval_rewards/accuracies": 0.7817460298538208, + "eval_rewards/chosen": -1.9235715866088867, + "eval_rewards/margins": 2.1133482456207275, + "eval_rewards/rejected": -4.036919593811035, + "eval_runtime": 84.5094, + "eval_samples_per_second": 23.666, + "eval_steps_per_second": 0.745, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 1.9055810397553516e-07, + "logits/chosen": -2.8626060485839844, + "logits/rejected": -2.8419241905212402, + "logps/chosen": -268.6602783203125, + "logps/rejected": -280.9430236816406, + "loss": 0.0908, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.532307505607605, + "rewards/margins": 4.841288089752197, + "rewards/rejected": -5.373595237731934, + "step": 1910 + }, + { + "epoch": 1.98, + "learning_rate": 1.8864678899082566e-07, + "logits/chosen": -2.781276226043701, + "logits/rejected": -2.77004337310791, + "logps/chosen": -282.97320556640625, + "logps/rejected": -262.6290283203125, + "loss": 0.08, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.720298171043396, + "rewards/margins": 4.898207187652588, + "rewards/rejected": -5.618504524230957, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 1.867354740061162e-07, + "logits/chosen": -2.870204210281372, + "logits/rejected": -2.822707176208496, + "logps/chosen": -266.18487548828125, + "logps/rejected": -297.3182067871094, + "loss": 0.102, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9683756828308105, + "rewards/margins": 5.399904727935791, + "rewards/rejected": -6.368279933929443, + "step": 1930 + }, + { + "epoch": 2.0, + "learning_rate": 1.8482415902140673e-07, + "logits/chosen": -2.838547945022583, + "logits/rejected": -2.8035788536071777, + "logps/chosen": -265.2318420410156, + "logps/rejected": -289.78155517578125, + "loss": 0.0632, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2611059546470642, + "rewards/margins": 6.193750858306885, + "rewards/rejected": -6.454856872558594, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 1.8291284403669723e-07, + "logits/chosen": -2.805657386779785, + "logits/rejected": -2.8378701210021973, + "logps/chosen": -252.8870391845703, + "logps/rejected": -289.4842224121094, + "loss": 0.0223, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.05115882679820061, + "rewards/margins": 6.3867082595825195, + "rewards/rejected": -6.4378662109375, + "step": 1950 + }, + { + "epoch": 2.02, + "learning_rate": 1.8100152905198777e-07, + "logits/chosen": -2.664461851119995, + "logits/rejected": -2.614348888397217, + "logps/chosen": -273.43170166015625, + "logps/rejected": -358.24163818359375, + "loss": 0.015, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.45168575644493103, + "rewards/margins": 7.5087785720825195, + "rewards/rejected": -7.960465431213379, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 1.7909021406727827e-07, + "logits/chosen": -2.8354854583740234, + "logits/rejected": -2.696690082550049, + "logps/chosen": -269.34149169921875, + "logps/rejected": -288.9095153808594, + "loss": 0.0221, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5660437941551208, + "rewards/margins": 6.704123497009277, + "rewards/rejected": -7.270166873931885, + "step": 1970 + }, + { + "epoch": 2.04, + "learning_rate": 1.771788990825688e-07, + "logits/chosen": -2.8703505992889404, + "logits/rejected": -2.855757236480713, + "logps/chosen": -288.74615478515625, + "logps/rejected": -321.55267333984375, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2832430303096771, + "rewards/margins": 7.4998979568481445, + "rewards/rejected": -7.783141136169434, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 1.7526758409785934e-07, + "logits/chosen": -2.862212657928467, + "logits/rejected": -2.7549288272857666, + "logps/chosen": -317.54901123046875, + "logps/rejected": -333.6669921875, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.910609245300293, + "rewards/margins": 7.41512393951416, + "rewards/rejected": -8.325732231140137, + "step": 1990 + }, + { + "epoch": 2.06, + "learning_rate": 1.7335626911314984e-07, + "logits/chosen": -2.747159719467163, + "logits/rejected": -2.723252296447754, + "logps/chosen": -241.98416137695312, + "logps/rejected": -286.20263671875, + "loss": 0.0175, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.5073554515838623, + "rewards/margins": 7.292289733886719, + "rewards/rejected": -8.799646377563477, + "step": 2000 + }, + { + "epoch": 2.06, + "eval_logits/chosen": -2.8335580825805664, + "eval_logits/rejected": -2.7736215591430664, + "eval_logps/chosen": -324.7882385253906, + "eval_logps/rejected": -302.55865478515625, + "eval_loss": 0.5881706476211548, + "eval_rewards/accuracies": 0.761904776096344, + "eval_rewards/chosen": -2.825556755065918, + "eval_rewards/margins": 2.939563512802124, + "eval_rewards/rejected": -5.765120506286621, + "eval_runtime": 85.7099, + "eval_samples_per_second": 23.335, + "eval_steps_per_second": 0.735, + "step": 2000 + }, + { + "epoch": 2.07, + "learning_rate": 1.7144495412844037e-07, + "logits/chosen": -2.900343418121338, + "logits/rejected": -2.7985424995422363, + "logps/chosen": -309.027587890625, + "logps/rejected": -329.75128173828125, + "loss": 0.0402, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8946213722229004, + "rewards/margins": 7.797415256500244, + "rewards/rejected": -8.692037582397461, + "step": 2010 + }, + { + "epoch": 2.08, + "learning_rate": 1.6953363914373088e-07, + "logits/chosen": -2.8973569869995117, + "logits/rejected": -2.883995532989502, + "logps/chosen": -304.8821716308594, + "logps/rejected": -307.1224060058594, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.414725124835968, + "rewards/margins": 8.143148422241211, + "rewards/rejected": -8.55787467956543, + "step": 2020 + }, + { + "epoch": 2.09, + "learning_rate": 1.6762232415902138e-07, + "logits/chosen": -2.8249168395996094, + "logits/rejected": -2.755976438522339, + "logps/chosen": -271.30511474609375, + "logps/rejected": -302.35174560546875, + "loss": 0.0183, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.110582947731018, + "rewards/margins": 7.069375038146973, + "rewards/rejected": -8.179957389831543, + "step": 2030 + }, + { + "epoch": 2.11, + "learning_rate": 1.6571100917431192e-07, + "logits/chosen": -2.9022743701934814, + "logits/rejected": -2.8274800777435303, + "logps/chosen": -272.07720947265625, + "logps/rejected": -290.59954833984375, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42111706733703613, + "rewards/margins": 7.790069580078125, + "rewards/rejected": -8.211187362670898, + "step": 2040 + }, + { + "epoch": 2.12, + "learning_rate": 1.6379969418960242e-07, + "logits/chosen": -2.8297278881073, + "logits/rejected": -2.8095757961273193, + "logps/chosen": -325.51202392578125, + "logps/rejected": -352.79632568359375, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47327151894569397, + "rewards/margins": 8.03354549407959, + "rewards/rejected": -8.506816864013672, + "step": 2050 + }, + { + "epoch": 2.13, + "learning_rate": 1.6188837920489295e-07, + "logits/chosen": -2.8418140411376953, + "logits/rejected": -2.7544631958007812, + "logps/chosen": -307.0643005371094, + "logps/rejected": -305.1767578125, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7374988794326782, + "rewards/margins": 7.880400657653809, + "rewards/rejected": -8.617899894714355, + "step": 2060 + }, + { + "epoch": 2.14, + "learning_rate": 1.5997706422018349e-07, + "logits/chosen": -2.834254741668701, + "logits/rejected": -2.8546204566955566, + "logps/chosen": -274.09295654296875, + "logps/rejected": -375.5848388671875, + "loss": 0.0133, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.05463988706469536, + "rewards/margins": 8.266778945922852, + "rewards/rejected": -8.321417808532715, + "step": 2070 + }, + { + "epoch": 2.15, + "learning_rate": 1.58065749235474e-07, + "logits/chosen": -2.8460822105407715, + "logits/rejected": -2.7883737087249756, + "logps/chosen": -345.500732421875, + "logps/rejected": -345.20587158203125, + "loss": 0.0128, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.2549740970134735, + "rewards/margins": 8.054521560668945, + "rewards/rejected": -8.30949592590332, + "step": 2080 + }, + { + "epoch": 2.16, + "learning_rate": 1.5615443425076452e-07, + "logits/chosen": -2.6797385215759277, + "logits/rejected": -2.59342622756958, + "logps/chosen": -272.5843200683594, + "logps/rejected": -320.541015625, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7525440454483032, + "rewards/margins": 8.50687026977539, + "rewards/rejected": -9.25941276550293, + "step": 2090 + }, + { + "epoch": 2.17, + "learning_rate": 1.5424311926605506e-07, + "logits/chosen": -2.7577497959136963, + "logits/rejected": -2.6880905628204346, + "logps/chosen": -252.0814208984375, + "logps/rejected": -281.2613220214844, + "loss": 0.0145, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.5481234788894653, + "rewards/margins": 7.643919944763184, + "rewards/rejected": -9.19204330444336, + "step": 2100 + }, + { + "epoch": 2.17, + "eval_logits/chosen": -2.8019275665283203, + "eval_logits/rejected": -2.7453346252441406, + "eval_logps/chosen": -328.32196044921875, + "eval_logps/rejected": -307.42218017578125, + "eval_loss": 0.615982174873352, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -3.178926944732666, + "eval_rewards/margins": 3.072545289993286, + "eval_rewards/rejected": -6.251472473144531, + "eval_runtime": 84.2778, + "eval_samples_per_second": 23.731, + "eval_steps_per_second": 0.748, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 1.5233180428134556e-07, + "logits/chosen": -2.852083683013916, + "logits/rejected": -2.787797451019287, + "logps/chosen": -282.0943908691406, + "logps/rejected": -307.4315490722656, + "loss": 0.0118, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6628628969192505, + "rewards/margins": 7.550381660461426, + "rewards/rejected": -8.213244438171387, + "step": 2110 + }, + { + "epoch": 2.19, + "learning_rate": 1.504204892966361e-07, + "logits/chosen": -2.7512869834899902, + "logits/rejected": -2.71586012840271, + "logps/chosen": -313.5958557128906, + "logps/rejected": -345.6764831542969, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2950427532196045, + "rewards/margins": 8.80932331085205, + "rewards/rejected": -10.104366302490234, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 1.485091743119266e-07, + "logits/chosen": -2.8325581550598145, + "logits/rejected": -2.8173718452453613, + "logps/chosen": -269.61077880859375, + "logps/rejected": -339.17193603515625, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3449245691299438, + "rewards/margins": 7.745511054992676, + "rewards/rejected": -9.090435028076172, + "step": 2130 + }, + { + "epoch": 2.21, + "learning_rate": 1.465978593272171e-07, + "logits/chosen": -2.779770851135254, + "logits/rejected": -2.6894686222076416, + "logps/chosen": -214.4927520751953, + "logps/rejected": -242.42904663085938, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4138853549957275, + "rewards/margins": 7.569909572601318, + "rewards/rejected": -8.983795166015625, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 1.4468654434250764e-07, + "logits/chosen": -2.7460098266601562, + "logits/rejected": -2.768864154815674, + "logps/chosen": -330.4348449707031, + "logps/rejected": -376.062744140625, + "loss": 0.0136, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.25889283418655396, + "rewards/margins": 9.242613792419434, + "rewards/rejected": -9.501506805419922, + "step": 2150 + }, + { + "epoch": 2.23, + "learning_rate": 1.4277522935779814e-07, + "logits/chosen": -2.822489023208618, + "logits/rejected": -2.694692373275757, + "logps/chosen": -284.2818298339844, + "logps/rejected": -331.88665771484375, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7468432188034058, + "rewards/margins": 7.9905853271484375, + "rewards/rejected": -8.737428665161133, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 1.4086391437308867e-07, + "logits/chosen": -2.805720090866089, + "logits/rejected": -2.83075213432312, + "logps/chosen": -313.9166564941406, + "logps/rejected": -366.8902282714844, + "loss": 0.0114, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2891316413879395, + "rewards/margins": 8.313146591186523, + "rewards/rejected": -9.602277755737305, + "step": 2170 + }, + { + "epoch": 2.25, + "learning_rate": 1.389525993883792e-07, + "logits/chosen": -2.74735689163208, + "logits/rejected": -2.7133688926696777, + "logps/chosen": -248.0203399658203, + "logps/rejected": -302.1765441894531, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5222628116607666, + "rewards/margins": 8.545629501342773, + "rewards/rejected": -10.067892074584961, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 1.370412844036697e-07, + "logits/chosen": -2.8076796531677246, + "logits/rejected": -2.7455930709838867, + "logps/chosen": -269.29156494140625, + "logps/rejected": -332.31109619140625, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5531013011932373, + "rewards/margins": 8.036242485046387, + "rewards/rejected": -9.589344024658203, + "step": 2190 + }, + { + "epoch": 2.27, + "learning_rate": 1.3512996941896024e-07, + "logits/chosen": -2.7460780143737793, + "logits/rejected": -2.71882963180542, + "logps/chosen": -244.74636840820312, + "logps/rejected": -301.77496337890625, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5790854692459106, + "rewards/margins": 8.768037796020508, + "rewards/rejected": -10.347124099731445, + "step": 2200 + }, + { + "epoch": 2.27, + "eval_logits/chosen": -2.775843381881714, + "eval_logits/rejected": -2.71360445022583, + "eval_logps/chosen": -335.1671142578125, + "eval_logps/rejected": -318.3191223144531, + "eval_loss": 0.6675200462341309, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -3.8634424209594727, + "eval_rewards/margins": 3.4777252674102783, + "eval_rewards/rejected": -7.34116792678833, + "eval_runtime": 85.3675, + "eval_samples_per_second": 23.428, + "eval_steps_per_second": 0.738, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 1.3321865443425075e-07, + "logits/chosen": -2.7431907653808594, + "logits/rejected": -2.666405200958252, + "logps/chosen": -288.3621826171875, + "logps/rejected": -333.54486083984375, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3674020767211914, + "rewards/margins": 8.582910537719727, + "rewards/rejected": -9.950313568115234, + "step": 2210 + }, + { + "epoch": 2.29, + "learning_rate": 1.3130733944954128e-07, + "logits/chosen": -2.7455010414123535, + "logits/rejected": -2.6907782554626465, + "logps/chosen": -300.69732666015625, + "logps/rejected": -345.77606201171875, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5738954544067383, + "rewards/margins": 8.692358016967773, + "rewards/rejected": -9.266255378723145, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 1.293960244648318e-07, + "logits/chosen": -2.8134102821350098, + "logits/rejected": -2.746748447418213, + "logps/chosen": -320.0693664550781, + "logps/rejected": -329.53509521484375, + "loss": 0.0119, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.5276217460632324, + "rewards/margins": 8.22559928894043, + "rewards/rejected": -9.75322151184082, + "step": 2230 + }, + { + "epoch": 2.31, + "learning_rate": 1.2748470948012232e-07, + "logits/chosen": -2.7713024616241455, + "logits/rejected": -2.7079224586486816, + "logps/chosen": -283.7737121582031, + "logps/rejected": -363.48126220703125, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2624105215072632, + "rewards/margins": 8.956193923950195, + "rewards/rejected": -10.218603134155273, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 1.2557339449541285e-07, + "logits/chosen": -2.824502944946289, + "logits/rejected": -2.775770425796509, + "logps/chosen": -302.90985107421875, + "logps/rejected": -334.00262451171875, + "loss": 0.0185, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.50070059299469, + "rewards/margins": 7.946015357971191, + "rewards/rejected": -9.446715354919434, + "step": 2250 + }, + { + "epoch": 2.33, + "learning_rate": 1.2366207951070336e-07, + "logits/chosen": -2.7960762977600098, + "logits/rejected": -2.7740378379821777, + "logps/chosen": -290.36663818359375, + "logps/rejected": -346.4615478515625, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7257287502288818, + "rewards/margins": 8.58584213256836, + "rewards/rejected": -10.31157112121582, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 1.217507645259939e-07, + "logits/chosen": -2.7129945755004883, + "logits/rejected": -2.7220640182495117, + "logps/chosen": -288.4734802246094, + "logps/rejected": -330.5152893066406, + "loss": 0.0062, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3350961208343506, + "rewards/margins": 8.772871017456055, + "rewards/rejected": -10.107967376708984, + "step": 2270 + }, + { + "epoch": 2.35, + "learning_rate": 1.198394495412844e-07, + "logits/chosen": -2.786776542663574, + "logits/rejected": -2.7793478965759277, + "logps/chosen": -297.0083923339844, + "logps/rejected": -333.83319091796875, + "loss": 0.0174, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.304938554763794, + "rewards/margins": 8.177055358886719, + "rewards/rejected": -9.48199462890625, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 1.1792813455657493e-07, + "logits/chosen": -2.669079303741455, + "logits/rejected": -2.658374786376953, + "logps/chosen": -286.50946044921875, + "logps/rejected": -317.75653076171875, + "loss": 0.0151, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6052253246307373, + "rewards/margins": 8.337682723999023, + "rewards/rejected": -9.942907333374023, + "step": 2290 + }, + { + "epoch": 2.37, + "learning_rate": 1.1601681957186543e-07, + "logits/chosen": -2.7891390323638916, + "logits/rejected": -2.755884885787964, + "logps/chosen": -295.21063232421875, + "logps/rejected": -317.76275634765625, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0556640625, + "rewards/margins": 8.071990966796875, + "rewards/rejected": -9.127655029296875, + "step": 2300 + }, + { + "epoch": 2.37, + "eval_logits/chosen": -2.7460498809814453, + "eval_logits/rejected": -2.684108018875122, + "eval_logps/chosen": -333.36492919921875, + "eval_logps/rejected": -315.51007080078125, + "eval_loss": 0.655481219291687, + "eval_rewards/accuracies": 0.773809552192688, + "eval_rewards/chosen": -3.6832268238067627, + "eval_rewards/margins": 3.3770339488983154, + "eval_rewards/rejected": -7.060261249542236, + "eval_runtime": 85.5955, + "eval_samples_per_second": 23.366, + "eval_steps_per_second": 0.736, + "step": 2300 + }, + { + "epoch": 2.38, + "learning_rate": 1.1410550458715595e-07, + "logits/chosen": -2.7361907958984375, + "logits/rejected": -2.736358642578125, + "logps/chosen": -276.85589599609375, + "logps/rejected": -338.77020263671875, + "loss": 0.0172, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.2412726879119873, + "rewards/margins": 7.8215155601501465, + "rewards/rejected": -9.062788009643555, + "step": 2310 + }, + { + "epoch": 2.39, + "learning_rate": 1.1219418960244648e-07, + "logits/chosen": -2.7203598022460938, + "logits/rejected": -2.7212142944335938, + "logps/chosen": -240.63095092773438, + "logps/rejected": -359.0272216796875, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4586249589920044, + "rewards/margins": 8.883589744567871, + "rewards/rejected": -10.342214584350586, + "step": 2320 + }, + { + "epoch": 2.4, + "learning_rate": 1.10282874617737e-07, + "logits/chosen": -2.7475290298461914, + "logits/rejected": -2.7314276695251465, + "logps/chosen": -257.7377014160156, + "logps/rejected": -323.4758605957031, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1093502044677734, + "rewards/margins": 8.267637252807617, + "rewards/rejected": -10.37698745727539, + "step": 2330 + }, + { + "epoch": 2.41, + "learning_rate": 1.0837155963302752e-07, + "logits/chosen": -2.5238680839538574, + "logits/rejected": -2.5990004539489746, + "logps/chosen": -231.6787567138672, + "logps/rejected": -353.30950927734375, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3277311325073242, + "rewards/margins": 9.420309066772461, + "rewards/rejected": -10.748041152954102, + "step": 2340 + }, + { + "epoch": 2.43, + "learning_rate": 1.0646024464831804e-07, + "logits/chosen": -2.6679482460021973, + "logits/rejected": -2.676323652267456, + "logps/chosen": -298.315185546875, + "logps/rejected": -342.7939147949219, + "loss": 0.0189, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.5310230255126953, + "rewards/margins": 9.822240829467773, + "rewards/rejected": -11.353262901306152, + "step": 2350 + }, + { + "epoch": 2.44, + "learning_rate": 1.0454892966360856e-07, + "logits/chosen": -2.601186990737915, + "logits/rejected": -2.6746459007263184, + "logps/chosen": -277.2555847167969, + "logps/rejected": -338.47589111328125, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6174863576889038, + "rewards/margins": 9.97658634185791, + "rewards/rejected": -11.594073295593262, + "step": 2360 + }, + { + "epoch": 2.45, + "learning_rate": 1.0263761467889908e-07, + "logits/chosen": -2.6298816204071045, + "logits/rejected": -2.5947744846343994, + "logps/chosen": -280.8351135253906, + "logps/rejected": -345.2566223144531, + "loss": 0.0128, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2241837978363037, + "rewards/margins": 9.190905570983887, + "rewards/rejected": -10.415090560913086, + "step": 2370 + }, + { + "epoch": 2.46, + "learning_rate": 1.007262996941896e-07, + "logits/chosen": -2.5886502265930176, + "logits/rejected": -2.556889057159424, + "logps/chosen": -287.4285583496094, + "logps/rejected": -309.5324401855469, + "loss": 0.0139, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.0298352241516113, + "rewards/margins": 8.339336395263672, + "rewards/rejected": -10.369171142578125, + "step": 2380 + }, + { + "epoch": 2.47, + "learning_rate": 9.881498470948011e-08, + "logits/chosen": -2.694936752319336, + "logits/rejected": -2.571977376937866, + "logps/chosen": -301.35894775390625, + "logps/rejected": -316.75201416015625, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.882354497909546, + "rewards/margins": 8.532295227050781, + "rewards/rejected": -10.414649963378906, + "step": 2390 + }, + { + "epoch": 2.48, + "learning_rate": 9.690366972477065e-08, + "logits/chosen": -2.6898765563964844, + "logits/rejected": -2.5868537425994873, + "logps/chosen": -295.41241455078125, + "logps/rejected": -324.84417724609375, + "loss": 0.0103, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7099082469940186, + "rewards/margins": 8.85558795928955, + "rewards/rejected": -10.565495491027832, + "step": 2400 + }, + { + "epoch": 2.48, + "eval_logits/chosen": -2.6922011375427246, + "eval_logits/rejected": -2.6255180835723877, + "eval_logps/chosen": -334.0754699707031, + "eval_logps/rejected": -316.68048095703125, + "eval_loss": 0.6598305702209473, + "eval_rewards/accuracies": 0.7579365372657776, + "eval_rewards/chosen": -3.754283905029297, + "eval_rewards/margins": 3.4230215549468994, + "eval_rewards/rejected": -7.177304744720459, + "eval_runtime": 86.0431, + "eval_samples_per_second": 23.244, + "eval_steps_per_second": 0.732, + "step": 2400 + }, + { + "epoch": 2.49, + "learning_rate": 9.499235474006116e-08, + "logits/chosen": -2.6745896339416504, + "logits/rejected": -2.6082608699798584, + "logps/chosen": -333.4400329589844, + "logps/rejected": -326.5425720214844, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5270661115646362, + "rewards/margins": 8.994717597961426, + "rewards/rejected": -10.521783828735352, + "step": 2410 + }, + { + "epoch": 2.5, + "learning_rate": 9.308103975535168e-08, + "logits/chosen": -2.687208414077759, + "logits/rejected": -2.6537442207336426, + "logps/chosen": -306.5165100097656, + "logps/rejected": -356.3597717285156, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8680790066719055, + "rewards/margins": 8.81937026977539, + "rewards/rejected": -9.68744945526123, + "step": 2420 + }, + { + "epoch": 2.51, + "learning_rate": 9.116972477064219e-08, + "logits/chosen": -2.739415168762207, + "logits/rejected": -2.630178213119507, + "logps/chosen": -307.1014709472656, + "logps/rejected": -374.5372009277344, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7804549932479858, + "rewards/margins": 8.495887756347656, + "rewards/rejected": -10.276342391967773, + "step": 2430 + }, + { + "epoch": 2.52, + "learning_rate": 8.925840978593272e-08, + "logits/chosen": -2.6330296993255615, + "logits/rejected": -2.686793804168701, + "logps/chosen": -229.73330688476562, + "logps/rejected": -348.29071044921875, + "loss": 0.0163, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2960717678070068, + "rewards/margins": 8.63543701171875, + "rewards/rejected": -9.931509017944336, + "step": 2440 + }, + { + "epoch": 2.53, + "learning_rate": 8.734709480122324e-08, + "logits/chosen": -2.619152545928955, + "logits/rejected": -2.581078052520752, + "logps/chosen": -273.90545654296875, + "logps/rejected": -345.945556640625, + "loss": 0.0139, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.8120664358139038, + "rewards/margins": 9.089773178100586, + "rewards/rejected": -10.901840209960938, + "step": 2450 + }, + { + "epoch": 2.54, + "learning_rate": 8.543577981651376e-08, + "logits/chosen": -2.6768343448638916, + "logits/rejected": -2.583284378051758, + "logps/chosen": -376.90606689453125, + "logps/rejected": -364.72235107421875, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1708290576934814, + "rewards/margins": 8.805766105651855, + "rewards/rejected": -9.976595878601074, + "step": 2460 + }, + { + "epoch": 2.55, + "learning_rate": 8.352446483180428e-08, + "logits/chosen": -2.7543387413024902, + "logits/rejected": -2.6443049907684326, + "logps/chosen": -283.18182373046875, + "logps/rejected": -305.6678771972656, + "loss": 0.0234, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0859144926071167, + "rewards/margins": 8.413625717163086, + "rewards/rejected": -9.499540328979492, + "step": 2470 + }, + { + "epoch": 2.56, + "learning_rate": 8.161314984709481e-08, + "logits/chosen": -2.748798131942749, + "logits/rejected": -2.701770782470703, + "logps/chosen": -297.24285888671875, + "logps/rejected": -334.27093505859375, + "loss": 0.0051, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.1504005193710327, + "rewards/margins": 9.096171379089355, + "rewards/rejected": -10.24657154083252, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 7.970183486238531e-08, + "logits/chosen": -2.660088300704956, + "logits/rejected": -2.6433253288269043, + "logps/chosen": -277.38043212890625, + "logps/rejected": -327.1907653808594, + "loss": 0.0136, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4239822626113892, + "rewards/margins": 9.63415813446045, + "rewards/rejected": -11.058141708374023, + "step": 2490 + }, + { + "epoch": 2.58, + "learning_rate": 7.779051987767583e-08, + "logits/chosen": -2.743652820587158, + "logits/rejected": -2.7099053859710693, + "logps/chosen": -326.1851806640625, + "logps/rejected": -345.4455261230469, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8251110315322876, + "rewards/margins": 9.0416898727417, + "rewards/rejected": -10.866801261901855, + "step": 2500 + }, + { + "epoch": 2.58, + "eval_logits/chosen": -2.6937167644500732, + "eval_logits/rejected": -2.624516725540161, + "eval_logps/chosen": -342.0002746582031, + "eval_logps/rejected": -328.22021484375, + "eval_loss": 0.7043530941009521, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -4.546760082244873, + "eval_rewards/margins": 3.784513235092163, + "eval_rewards/rejected": -8.33127498626709, + "eval_runtime": 84.8082, + "eval_samples_per_second": 23.583, + "eval_steps_per_second": 0.743, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 7.587920489296635e-08, + "logits/chosen": -2.6863226890563965, + "logits/rejected": -2.669240951538086, + "logps/chosen": -314.3838195800781, + "logps/rejected": -351.1210632324219, + "loss": 0.0123, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.9106096029281616, + "rewards/margins": 9.383336067199707, + "rewards/rejected": -11.2939453125, + "step": 2510 + }, + { + "epoch": 2.6, + "learning_rate": 7.396788990825688e-08, + "logits/chosen": -2.724083423614502, + "logits/rejected": -2.6847984790802, + "logps/chosen": -288.16204833984375, + "logps/rejected": -320.74652099609375, + "loss": 0.0137, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.156667947769165, + "rewards/margins": 8.251482009887695, + "rewards/rejected": -10.408149719238281, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 7.20565749235474e-08, + "logits/chosen": -2.6141371726989746, + "logits/rejected": -2.545928478240967, + "logps/chosen": -272.58782958984375, + "logps/rejected": -309.7486267089844, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7828295230865479, + "rewards/margins": 9.371472358703613, + "rewards/rejected": -11.15429973602295, + "step": 2530 + }, + { + "epoch": 2.62, + "learning_rate": 7.014525993883792e-08, + "logits/chosen": -2.6228842735290527, + "logits/rejected": -2.546093463897705, + "logps/chosen": -297.15032958984375, + "logps/rejected": -337.7318420410156, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7396106719970703, + "rewards/margins": 8.880767822265625, + "rewards/rejected": -10.620378494262695, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 6.823394495412843e-08, + "logits/chosen": -2.5964651107788086, + "logits/rejected": -2.535104513168335, + "logps/chosen": -291.49932861328125, + "logps/rejected": -332.07098388671875, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4745142459869385, + "rewards/margins": 10.255570411682129, + "rewards/rejected": -11.730084419250488, + "step": 2550 + }, + { + "epoch": 2.64, + "learning_rate": 6.632262996941895e-08, + "logits/chosen": -2.6409924030303955, + "logits/rejected": -2.5037286281585693, + "logps/chosen": -274.09661865234375, + "logps/rejected": -345.69451904296875, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.433541178703308, + "rewards/margins": 9.810760498046875, + "rewards/rejected": -11.244300842285156, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 6.441131498470948e-08, + "logits/chosen": -2.6809632778167725, + "logits/rejected": -2.661353588104248, + "logps/chosen": -267.7644958496094, + "logps/rejected": -315.9635009765625, + "loss": 0.0234, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3080449104309082, + "rewards/margins": 8.25956916809082, + "rewards/rejected": -9.56761360168457, + "step": 2570 + }, + { + "epoch": 2.66, + "learning_rate": 6.25e-08, + "logits/chosen": -2.7494421005249023, + "logits/rejected": -2.681250810623169, + "logps/chosen": -329.81817626953125, + "logps/rejected": -351.62689208984375, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1257236003875732, + "rewards/margins": 8.99251937866211, + "rewards/rejected": -10.118242263793945, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 6.058868501529052e-08, + "logits/chosen": -2.802464008331299, + "logits/rejected": -2.6624038219451904, + "logps/chosen": -305.5362243652344, + "logps/rejected": -341.27130126953125, + "loss": 0.014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3455936908721924, + "rewards/margins": 9.046780586242676, + "rewards/rejected": -10.392374038696289, + "step": 2590 + }, + { + "epoch": 2.68, + "learning_rate": 5.8677370030581035e-08, + "logits/chosen": -2.6268625259399414, + "logits/rejected": -2.6364901065826416, + "logps/chosen": -281.02655029296875, + "logps/rejected": -321.53936767578125, + "loss": 0.0077, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2788012027740479, + "rewards/margins": 9.457226753234863, + "rewards/rejected": -10.736028671264648, + "step": 2600 + }, + { + "epoch": 2.68, + "eval_logits/chosen": -2.7350168228149414, + "eval_logits/rejected": -2.671557664871216, + "eval_logps/chosen": -336.4403381347656, + "eval_logps/rejected": -321.674072265625, + "eval_loss": 0.6755052208900452, + "eval_rewards/accuracies": 0.7857142686843872, + "eval_rewards/chosen": -3.9907662868499756, + "eval_rewards/margins": 3.6858959197998047, + "eval_rewards/rejected": -7.676662921905518, + "eval_runtime": 85.0069, + "eval_samples_per_second": 23.528, + "eval_steps_per_second": 0.741, + "step": 2600 + }, + { + "epoch": 2.69, + "learning_rate": 5.6766055045871554e-08, + "logits/chosen": -2.7218501567840576, + "logits/rejected": -2.701134443283081, + "logps/chosen": -280.87249755859375, + "logps/rejected": -342.28692626953125, + "loss": 0.0099, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8790122866630554, + "rewards/margins": 8.924139022827148, + "rewards/rejected": -9.803152084350586, + "step": 2610 + }, + { + "epoch": 2.7, + "learning_rate": 5.485474006116208e-08, + "logits/chosen": -2.749300718307495, + "logits/rejected": -2.7427945137023926, + "logps/chosen": -281.42535400390625, + "logps/rejected": -320.20855712890625, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7675425410270691, + "rewards/margins": 9.114829063415527, + "rewards/rejected": -9.88237190246582, + "step": 2620 + }, + { + "epoch": 2.71, + "learning_rate": 5.294342507645259e-08, + "logits/chosen": -2.718909978866577, + "logits/rejected": -2.6332194805145264, + "logps/chosen": -330.5658264160156, + "logps/rejected": -326.0030517578125, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.26304030418396, + "rewards/margins": 9.22779369354248, + "rewards/rejected": -10.49083423614502, + "step": 2630 + }, + { + "epoch": 2.72, + "learning_rate": 5.1032110091743117e-08, + "logits/chosen": -2.6946017742156982, + "logits/rejected": -2.578606128692627, + "logps/chosen": -244.9853515625, + "logps/rejected": -326.56829833984375, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.163881778717041, + "rewards/margins": 8.259695053100586, + "rewards/rejected": -10.423576354980469, + "step": 2640 + }, + { + "epoch": 2.73, + "learning_rate": 4.9120795107033635e-08, + "logits/chosen": -2.692305564880371, + "logits/rejected": -2.621889591217041, + "logps/chosen": -299.7417907714844, + "logps/rejected": -354.70037841796875, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.746453881263733, + "rewards/margins": 9.412019729614258, + "rewards/rejected": -11.158472061157227, + "step": 2650 + }, + { + "epoch": 2.75, + "learning_rate": 4.7209480122324154e-08, + "logits/chosen": -2.742924928665161, + "logits/rejected": -2.680655002593994, + "logps/chosen": -288.6100158691406, + "logps/rejected": -372.38848876953125, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.150846004486084, + "rewards/margins": 9.178352355957031, + "rewards/rejected": -11.329198837280273, + "step": 2660 + }, + { + "epoch": 2.76, + "learning_rate": 4.529816513761467e-08, + "logits/chosen": -2.695166826248169, + "logits/rejected": -2.6738569736480713, + "logps/chosen": -326.481689453125, + "logps/rejected": -396.2838134765625, + "loss": 0.0125, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.571830153465271, + "rewards/margins": 9.341531753540039, + "rewards/rejected": -10.913363456726074, + "step": 2670 + }, + { + "epoch": 2.77, + "learning_rate": 4.33868501529052e-08, + "logits/chosen": -2.7478764057159424, + "logits/rejected": -2.7026572227478027, + "logps/chosen": -257.5367126464844, + "logps/rejected": -314.3909606933594, + "loss": 0.0145, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.009021282196045, + "rewards/margins": 8.854787826538086, + "rewards/rejected": -10.863809585571289, + "step": 2680 + }, + { + "epoch": 2.78, + "learning_rate": 4.147553516819572e-08, + "logits/chosen": -2.660367012023926, + "logits/rejected": -2.6164801120758057, + "logps/chosen": -253.05874633789062, + "logps/rejected": -334.41650390625, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8409353494644165, + "rewards/margins": 9.41588020324707, + "rewards/rejected": -11.256814956665039, + "step": 2690 + }, + { + "epoch": 2.79, + "learning_rate": 3.9564220183486236e-08, + "logits/chosen": -2.6420648097991943, + "logits/rejected": -2.6223690509796143, + "logps/chosen": -286.759521484375, + "logps/rejected": -299.20928955078125, + "loss": 0.0098, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.680275559425354, + "rewards/margins": 8.588552474975586, + "rewards/rejected": -10.268828392028809, + "step": 2700 + }, + { + "epoch": 2.79, + "eval_logits/chosen": -2.751840829849243, + "eval_logits/rejected": -2.689521551132202, + "eval_logps/chosen": -338.3857727050781, + "eval_logps/rejected": -323.7826232910156, + "eval_loss": 0.6889848709106445, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -4.185306549072266, + "eval_rewards/margins": 3.702207565307617, + "eval_rewards/rejected": -7.887514114379883, + "eval_runtime": 84.5931, + "eval_samples_per_second": 23.643, + "eval_steps_per_second": 0.745, + "step": 2700 + }, + { + "epoch": 2.8, + "learning_rate": 3.7652905198776755e-08, + "logits/chosen": -2.7268526554107666, + "logits/rejected": -2.670037031173706, + "logps/chosen": -318.42047119140625, + "logps/rejected": -328.3361511230469, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8269201517105103, + "rewards/margins": 8.519743919372559, + "rewards/rejected": -10.346662521362305, + "step": 2710 + }, + { + "epoch": 2.81, + "learning_rate": 3.574159021406728e-08, + "logits/chosen": -2.7178609371185303, + "logits/rejected": -2.655142307281494, + "logps/chosen": -294.76824951171875, + "logps/rejected": -407.04388427734375, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5894997119903564, + "rewards/margins": 9.621997833251953, + "rewards/rejected": -11.21149730682373, + "step": 2720 + }, + { + "epoch": 2.82, + "learning_rate": 3.383027522935779e-08, + "logits/chosen": -2.7463743686676025, + "logits/rejected": -2.7335140705108643, + "logps/chosen": -292.24530029296875, + "logps/rejected": -352.7088928222656, + "loss": 0.0105, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4958622455596924, + "rewards/margins": 9.347841262817383, + "rewards/rejected": -10.843703269958496, + "step": 2730 + }, + { + "epoch": 2.83, + "learning_rate": 3.191896024464832e-08, + "logits/chosen": -2.74155592918396, + "logits/rejected": -2.7045648097991943, + "logps/chosen": -302.1263732910156, + "logps/rejected": -347.8245544433594, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5156089067459106, + "rewards/margins": 9.854185104370117, + "rewards/rejected": -11.369794845581055, + "step": 2740 + }, + { + "epoch": 2.84, + "learning_rate": 3.0007645259938836e-08, + "logits/chosen": -2.686781406402588, + "logits/rejected": -2.6489763259887695, + "logps/chosen": -251.529296875, + "logps/rejected": -336.72222900390625, + "loss": 0.0106, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2218445539474487, + "rewards/margins": 10.793516159057617, + "rewards/rejected": -12.015360832214355, + "step": 2750 + }, + { + "epoch": 2.85, + "learning_rate": 2.809633027522936e-08, + "logits/chosen": -2.693544864654541, + "logits/rejected": -2.607612133026123, + "logps/chosen": -274.08245849609375, + "logps/rejected": -329.0926513671875, + "loss": 0.0158, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8746064901351929, + "rewards/margins": 9.00143814086914, + "rewards/rejected": -10.876044273376465, + "step": 2760 + }, + { + "epoch": 2.86, + "learning_rate": 2.6185015290519877e-08, + "logits/chosen": -2.6773712635040283, + "logits/rejected": -2.657872200012207, + "logps/chosen": -292.4349365234375, + "logps/rejected": -358.54864501953125, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.64081609249115, + "rewards/margins": 8.695267677307129, + "rewards/rejected": -10.336084365844727, + "step": 2770 + }, + { + "epoch": 2.87, + "learning_rate": 2.4273700305810396e-08, + "logits/chosen": -2.694716691970825, + "logits/rejected": -2.671049118041992, + "logps/chosen": -332.70367431640625, + "logps/rejected": -374.2784729003906, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9321603775024414, + "rewards/margins": 9.577836990356445, + "rewards/rejected": -11.509998321533203, + "step": 2780 + }, + { + "epoch": 2.88, + "learning_rate": 2.2362385321100918e-08, + "logits/chosen": -2.7195866107940674, + "logits/rejected": -2.6434059143066406, + "logps/chosen": -305.3413391113281, + "logps/rejected": -334.30450439453125, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7689650058746338, + "rewards/margins": 9.231801986694336, + "rewards/rejected": -11.00076675415039, + "step": 2790 + }, + { + "epoch": 2.89, + "learning_rate": 2.0451070336391437e-08, + "logits/chosen": -2.658266544342041, + "logits/rejected": -2.5843617916107178, + "logps/chosen": -287.137451171875, + "logps/rejected": -307.26544189453125, + "loss": 0.0126, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1759697198867798, + "rewards/margins": 9.180164337158203, + "rewards/rejected": -10.356133460998535, + "step": 2800 + }, + { + "epoch": 2.89, + "eval_logits/chosen": -2.7376251220703125, + "eval_logits/rejected": -2.6751596927642822, + "eval_logps/chosen": -339.3250427246094, + "eval_logps/rejected": -325.0658874511719, + "eval_loss": 0.688874363899231, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -4.279238224029541, + "eval_rewards/margins": 3.736605644226074, + "eval_rewards/rejected": -8.015843391418457, + "eval_runtime": 85.0292, + "eval_samples_per_second": 23.521, + "eval_steps_per_second": 0.741, + "step": 2800 + }, + { + "epoch": 2.9, + "learning_rate": 1.8539755351681956e-08, + "logits/chosen": -2.6495871543884277, + "logits/rejected": -2.643451690673828, + "logps/chosen": -288.79229736328125, + "logps/rejected": -351.61859130859375, + "loss": 0.0141, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1320414543151855, + "rewards/margins": 8.717375755310059, + "rewards/rejected": -10.849416732788086, + "step": 2810 + }, + { + "epoch": 2.91, + "learning_rate": 1.6628440366972478e-08, + "logits/chosen": -2.7097463607788086, + "logits/rejected": -2.6647915840148926, + "logps/chosen": -312.65277099609375, + "logps/rejected": -338.9766540527344, + "loss": 0.0097, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.018944025039673, + "rewards/margins": 8.279979705810547, + "rewards/rejected": -10.29892349243164, + "step": 2820 + }, + { + "epoch": 2.92, + "learning_rate": 1.4717125382262997e-08, + "logits/chosen": -2.728929042816162, + "logits/rejected": -2.7036802768707275, + "logps/chosen": -297.9712219238281, + "logps/rejected": -358.2961120605469, + "loss": 0.0105, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.438098907470703, + "rewards/margins": 8.373188018798828, + "rewards/rejected": -10.811286926269531, + "step": 2830 + }, + { + "epoch": 2.93, + "learning_rate": 1.2805810397553517e-08, + "logits/chosen": -2.7534146308898926, + "logits/rejected": -2.661689519882202, + "logps/chosen": -296.6317443847656, + "logps/rejected": -343.7471008300781, + "loss": 0.0172, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4898061752319336, + "rewards/margins": 9.379873275756836, + "rewards/rejected": -10.869680404663086, + "step": 2840 + }, + { + "epoch": 2.94, + "learning_rate": 1.0894495412844038e-08, + "logits/chosen": -2.649125099182129, + "logits/rejected": -2.626152515411377, + "logps/chosen": -279.52276611328125, + "logps/rejected": -341.8113708496094, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7075916528701782, + "rewards/margins": 8.933737754821777, + "rewards/rejected": -10.64133071899414, + "step": 2850 + }, + { + "epoch": 2.95, + "learning_rate": 8.983180428134555e-09, + "logits/chosen": -2.7289493083953857, + "logits/rejected": -2.6224875450134277, + "logps/chosen": -290.38720703125, + "logps/rejected": -301.3743896484375, + "loss": 0.0113, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.2014358043670654, + "rewards/margins": 8.64903736114502, + "rewards/rejected": -10.850473403930664, + "step": 2860 + }, + { + "epoch": 2.96, + "learning_rate": 7.071865443425076e-09, + "logits/chosen": -2.655238389968872, + "logits/rejected": -2.6466026306152344, + "logps/chosen": -304.779296875, + "logps/rejected": -347.95611572265625, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1112747192382812, + "rewards/margins": 9.25969123840332, + "rewards/rejected": -11.370965957641602, + "step": 2870 + }, + { + "epoch": 2.97, + "learning_rate": 5.1605504587155965e-09, + "logits/chosen": -2.741295099258423, + "logits/rejected": -2.6439971923828125, + "logps/chosen": -299.2157897949219, + "logps/rejected": -346.70355224609375, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3350167274475098, + "rewards/margins": 9.48717212677002, + "rewards/rejected": -10.822189331054688, + "step": 2880 + }, + { + "epoch": 2.98, + "learning_rate": 3.249235474006116e-09, + "logits/chosen": -2.727517604827881, + "logits/rejected": -2.720759868621826, + "logps/chosen": -297.76129150390625, + "logps/rejected": -341.95086669921875, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0693154335021973, + "rewards/margins": 8.433469772338867, + "rewards/rejected": -10.502785682678223, + "step": 2890 + }, + { + "epoch": 2.99, + "learning_rate": 1.3379204892966359e-09, + "logits/chosen": -2.724641799926758, + "logits/rejected": -2.6374106407165527, + "logps/chosen": -285.85882568359375, + "logps/rejected": -324.15582275390625, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.20442533493042, + "rewards/margins": 8.951725959777832, + "rewards/rejected": -11.156152725219727, + "step": 2900 + }, + { + "epoch": 2.99, + "eval_logits/chosen": -2.7404043674468994, + "eval_logits/rejected": -2.6788272857666016, + "eval_logps/chosen": -339.67138671875, + "eval_logps/rejected": -325.6390380859375, + "eval_loss": 0.6886436939239502, + "eval_rewards/accuracies": 0.773809552192688, + "eval_rewards/chosen": -4.313872814178467, + "eval_rewards/margins": 3.759286642074585, + "eval_rewards/rejected": -8.073159217834473, + "eval_runtime": 84.4823, + "eval_samples_per_second": 23.674, + "eval_steps_per_second": 0.746, + "step": 2900 + }, + { + "epoch": 3.0, + "step": 2907, + "total_flos": 0.0, + "train_loss": 0.20374761147621287, + "train_runtime": 18387.3875, + "train_samples_per_second": 10.11, + "train_steps_per_second": 0.158 + } + ], + "logging_steps": 10, + "max_steps": 2907, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}