diff --git "a/checkpoint/checkpoint-18000/trainer_state.json" "b/checkpoint/checkpoint-18000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint/checkpoint-18000/trainer_state.json" @@ -0,0 +1,3021 @@ +{ + "best_metric": 0.9302791357040405, + "best_model_checkpoint": "./output/dpo_output/10k_students_10k_stack/checkpoint/checkpoint-18000", + "epoch": 0.8335262792313035, + "eval_steps": 1000, + "global_step": 18000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0046307015512850195, + "grad_norm": 5.932480812072754, + "learning_rate": 4.977772632553832e-05, + "logits/chosen": -18.937969207763672, + "logits/rejected": -17.256298065185547, + "logps/chosen": -169.58087158203125, + "logps/rejected": -168.01634216308594, + "loss": 0.7752, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -2.037126302719116, + "rewards/margins": 0.4727743864059448, + "rewards/rejected": -2.5099010467529297, + "step": 100 + }, + { + "epoch": 0.009261403102570039, + "grad_norm": 8.801850318908691, + "learning_rate": 4.954850659874971e-05, + "logits/chosen": -17.237207412719727, + "logits/rejected": -15.479244232177734, + "logps/chosen": -199.30752563476562, + "logps/rejected": -186.26181030273438, + "loss": 1.1298, + "rewards/accuracies": 0.49000000953674316, + "rewards/chosen": -4.508936882019043, + "rewards/margins": -0.0808589830994606, + "rewards/rejected": -4.428077697753906, + "step": 200 + }, + { + "epoch": 0.01389210465385506, + "grad_norm": 102.89826202392578, + "learning_rate": 4.931697152118546e-05, + "logits/chosen": -17.642629623413086, + "logits/rejected": -16.12238121032715, + "logps/chosen": -188.68121337890625, + "logps/rejected": -182.5252685546875, + "loss": 0.8529, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": -2.8182811737060547, + "rewards/margins": 0.3854120671749115, + "rewards/rejected": -3.203693151473999, + "step": 300 + }, + { + "epoch": 0.018522806205140078, + "grad_norm": 0.0003582279896363616, + "learning_rate": 4.9085436443621215e-05, + "logits/chosen": -16.342151641845703, + "logits/rejected": -14.690173149108887, + "logps/chosen": -197.86817932128906, + "logps/rejected": -199.99160766601562, + "loss": 0.9249, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": -4.376795768737793, + "rewards/margins": 0.645234227180481, + "rewards/rejected": -5.022029876708984, + "step": 400 + }, + { + "epoch": 0.0231535077564251, + "grad_norm": 0.7547457218170166, + "learning_rate": 4.8853901366056956e-05, + "logits/chosen": -14.315742492675781, + "logits/rejected": -13.13943099975586, + "logps/chosen": -226.77879333496094, + "logps/rejected": -212.2596893310547, + "loss": 1.1835, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": -7.502179145812988, + "rewards/margins": 0.5412274599075317, + "rewards/rejected": -8.043407440185547, + "step": 500 + }, + { + "epoch": 0.02778420930771012, + "grad_norm": 0.09894751757383347, + "learning_rate": 4.862236628849271e-05, + "logits/chosen": -14.297338485717773, + "logits/rejected": -13.270490646362305, + "logps/chosen": -255.91860961914062, + "logps/rejected": -263.9288024902344, + "loss": 1.2753, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -10.779191970825195, + "rewards/margins": 1.2364416122436523, + "rewards/rejected": -12.015632629394531, + "step": 600 + }, + { + "epoch": 0.03241491085899514, + "grad_norm": 18.43197250366211, + "learning_rate": 4.839083121092846e-05, + "logits/chosen": -14.56347370147705, + "logits/rejected": -12.294930458068848, + "logps/chosen": -234.3055877685547, + "logps/rejected": -230.1753692626953, + "loss": 1.085, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -8.342914581298828, + "rewards/margins": 1.1828445196151733, + "rewards/rejected": -9.525758743286133, + "step": 700 + }, + { + "epoch": 0.037045612410280156, + "grad_norm": 147.7353515625, + "learning_rate": 4.8159296133364206e-05, + "logits/chosen": -14.960326194763184, + "logits/rejected": -12.486715316772461, + "logps/chosen": -243.6907196044922, + "logps/rejected": -258.2897033691406, + "loss": 0.8159, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -8.589630126953125, + "rewards/margins": 2.1720709800720215, + "rewards/rejected": -10.761700630187988, + "step": 800 + }, + { + "epoch": 0.041676313961565174, + "grad_norm": 40.75476837158203, + "learning_rate": 4.792776105579996e-05, + "logits/chosen": -14.261970520019531, + "logits/rejected": -13.42474365234375, + "logps/chosen": -236.05096435546875, + "logps/rejected": -245.3695831298828, + "loss": 1.3291, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -9.054373741149902, + "rewards/margins": 1.2040209770202637, + "rewards/rejected": -10.258395195007324, + "step": 900 + }, + { + "epoch": 0.0463070155128502, + "grad_norm": 41.665733337402344, + "learning_rate": 4.76962259782357e-05, + "logits/chosen": -13.747593879699707, + "logits/rejected": -11.4396390914917, + "logps/chosen": -267.20654296875, + "logps/rejected": -263.12127685546875, + "loss": 1.1523, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -12.121853828430176, + "rewards/margins": 1.9070380926132202, + "rewards/rejected": -14.028891563415527, + "step": 1000 + }, + { + "epoch": 0.0463070155128502, + "eval_logits/chosen": -15.348400115966797, + "eval_logits/rejected": -12.8839750289917, + "eval_logps/chosen": -278.6545715332031, + "eval_logps/rejected": -285.7479248046875, + "eval_loss": 1.159505009651184, + "eval_rewards/accuracies": 0.6658333539962769, + "eval_rewards/chosen": -12.499542236328125, + "eval_rewards/margins": 2.093540906906128, + "eval_rewards/rejected": -14.593082427978516, + "eval_runtime": 595.521, + "eval_samples_per_second": 4.03, + "eval_steps_per_second": 4.03, + "step": 1000 + }, + { + "epoch": 0.05093771706413522, + "grad_norm": 27.11232566833496, + "learning_rate": 4.746469090067146e-05, + "logits/chosen": -15.117622375488281, + "logits/rejected": -13.129312515258789, + "logps/chosen": -235.89825439453125, + "logps/rejected": -241.9706573486328, + "loss": 0.9035, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -9.738895416259766, + "rewards/margins": 1.9915401935577393, + "rewards/rejected": -11.730435371398926, + "step": 1100 + }, + { + "epoch": 0.05556841861542024, + "grad_norm": 14.75066089630127, + "learning_rate": 4.7247047927761054e-05, + "logits/chosen": -13.134005546569824, + "logits/rejected": -11.821113586425781, + "logps/chosen": -537.9695434570312, + "logps/rejected": -567.16259765625, + "loss": 3.6509, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -38.636722564697266, + "rewards/margins": 2.336888551712036, + "rewards/rejected": -40.97361373901367, + "step": 1200 + }, + { + "epoch": 0.060199120166705256, + "grad_norm": 2.5833189487457275, + "learning_rate": 4.701551285019681e-05, + "logits/chosen": -14.083927154541016, + "logits/rejected": -11.76788330078125, + "logps/chosen": -228.8092041015625, + "logps/rejected": -229.73764038085938, + "loss": 0.9777, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -8.000514030456543, + "rewards/margins": 1.5612505674362183, + "rewards/rejected": -9.56176471710205, + "step": 1300 + }, + { + "epoch": 0.06482982171799027, + "grad_norm": 55.1929817199707, + "learning_rate": 4.6783977772632556e-05, + "logits/chosen": -13.045205116271973, + "logits/rejected": -11.083745002746582, + "logps/chosen": -280.2500915527344, + "logps/rejected": -279.9684753417969, + "loss": 1.1287, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -12.219869613647461, + "rewards/margins": 2.250887393951416, + "rewards/rejected": -14.470755577087402, + "step": 1400 + }, + { + "epoch": 0.06946052326927529, + "grad_norm": 106.15294647216797, + "learning_rate": 4.6552442695068304e-05, + "logits/chosen": -14.938093185424805, + "logits/rejected": -12.27778434753418, + "logps/chosen": -229.48614501953125, + "logps/rejected": -224.2559356689453, + "loss": 0.8994, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -8.494085311889648, + "rewards/margins": 1.5291900634765625, + "rewards/rejected": -10.023276329040527, + "step": 1500 + }, + { + "epoch": 0.07409122482056031, + "grad_norm": 0.6163883209228516, + "learning_rate": 4.632090761750406e-05, + "logits/chosen": -11.895711898803711, + "logits/rejected": -11.187520980834961, + "logps/chosen": -250.82479858398438, + "logps/rejected": -271.1986083984375, + "loss": 1.1029, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": -10.7871675491333, + "rewards/margins": 1.8207966089248657, + "rewards/rejected": -12.607963562011719, + "step": 1600 + }, + { + "epoch": 0.07872192637184533, + "grad_norm": 4.759420394897461, + "learning_rate": 4.60893725399398e-05, + "logits/chosen": -14.557934761047363, + "logits/rejected": -12.280036926269531, + "logps/chosen": -226.98574829101562, + "logps/rejected": -264.96875, + "loss": 0.9261, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": -8.311798095703125, + "rewards/margins": 2.297745943069458, + "rewards/rejected": -10.609543800354004, + "step": 1700 + }, + { + "epoch": 0.08335262792313035, + "grad_norm": 112.9684066772461, + "learning_rate": 4.5857837462375555e-05, + "logits/chosen": -12.77873420715332, + "logits/rejected": -10.884748458862305, + "logps/chosen": -258.1977233886719, + "logps/rejected": -291.1019287109375, + "loss": 1.0586, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -11.546738624572754, + "rewards/margins": 1.946559190750122, + "rewards/rejected": -13.493298530578613, + "step": 1800 + }, + { + "epoch": 0.08798332947441537, + "grad_norm": 185.68812561035156, + "learning_rate": 4.56263023848113e-05, + "logits/chosen": -10.224660873413086, + "logits/rejected": -8.805120468139648, + "logps/chosen": -291.38720703125, + "logps/rejected": -307.789306640625, + "loss": 0.8702, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -13.615660667419434, + "rewards/margins": 2.516305923461914, + "rewards/rejected": -16.13196563720703, + "step": 1900 + }, + { + "epoch": 0.0926140310257004, + "grad_norm": 1.0021706819534302, + "learning_rate": 4.539476730724705e-05, + "logits/chosen": -12.920204162597656, + "logits/rejected": -10.876238822937012, + "logps/chosen": -260.2502746582031, + "logps/rejected": -249.31326293945312, + "loss": 1.0427, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -10.521868705749512, + "rewards/margins": 1.398219347000122, + "rewards/rejected": -11.920088768005371, + "step": 2000 + }, + { + "epoch": 0.0926140310257004, + "eval_logits/chosen": -14.9287748336792, + "eval_logits/rejected": -12.413740158081055, + "eval_logps/chosen": -250.66151428222656, + "eval_logps/rejected": -258.962890625, + "eval_loss": 0.9348099827766418, + "eval_rewards/accuracies": 0.6762499809265137, + "eval_rewards/chosen": -9.700236320495605, + "eval_rewards/margins": 2.2143468856811523, + "eval_rewards/rejected": -11.914582252502441, + "eval_runtime": 595.1862, + "eval_samples_per_second": 4.032, + "eval_steps_per_second": 4.032, + "step": 2000 + }, + { + "epoch": 0.09724473257698542, + "grad_norm": 92.90179443359375, + "learning_rate": 4.51632322296828e-05, + "logits/chosen": -13.820252418518066, + "logits/rejected": -11.60437297821045, + "logps/chosen": -263.9429931640625, + "logps/rejected": -266.4606018066406, + "loss": 1.1237, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -11.721684455871582, + "rewards/margins": 2.347269058227539, + "rewards/rejected": -14.068955421447754, + "step": 2100 + }, + { + "epoch": 0.10187543412827044, + "grad_norm": 1.9892809391021729, + "learning_rate": 4.4931697152118546e-05, + "logits/chosen": -16.49983787536621, + "logits/rejected": -13.618443489074707, + "logps/chosen": -261.19989013671875, + "logps/rejected": -265.1740417480469, + "loss": 1.0921, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": -8.718969345092773, + "rewards/margins": 1.7608253955841064, + "rewards/rejected": -10.4797945022583, + "step": 2200 + }, + { + "epoch": 0.10650613567955546, + "grad_norm": 2.8146913051605225, + "learning_rate": 4.4700162074554294e-05, + "logits/chosen": -14.866826057434082, + "logits/rejected": -12.879820823669434, + "logps/chosen": -237.8057861328125, + "logps/rejected": -248.6677703857422, + "loss": 0.8993, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -9.813733100891113, + "rewards/margins": 1.9094059467315674, + "rewards/rejected": -11.723139762878418, + "step": 2300 + }, + { + "epoch": 0.11113683723084047, + "grad_norm": 7.9283857345581055, + "learning_rate": 4.446862699699005e-05, + "logits/chosen": -15.892507553100586, + "logits/rejected": -12.79761028289795, + "logps/chosen": -240.3441925048828, + "logps/rejected": -241.5165557861328, + "loss": 0.7475, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -8.371927261352539, + "rewards/margins": 2.462991714477539, + "rewards/rejected": -10.834918975830078, + "step": 2400 + }, + { + "epoch": 0.11576753878212549, + "grad_norm": 17.73735237121582, + "learning_rate": 4.4237091919425796e-05, + "logits/chosen": -14.584887504577637, + "logits/rejected": -12.091426849365234, + "logps/chosen": -258.0081787109375, + "logps/rejected": -260.34161376953125, + "loss": 1.1756, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": -10.273959159851074, + "rewards/margins": 1.984671950340271, + "rewards/rejected": -12.258630752563477, + "step": 2500 + }, + { + "epoch": 0.12039824033341051, + "grad_norm": 133.74703979492188, + "learning_rate": 4.4005556841861544e-05, + "logits/chosen": -13.465319633483887, + "logits/rejected": -10.94394302368164, + "logps/chosen": -266.9393310546875, + "logps/rejected": -272.83087158203125, + "loss": 1.1628, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -10.949825286865234, + "rewards/margins": 1.9796525239944458, + "rewards/rejected": -12.92947769165039, + "step": 2600 + }, + { + "epoch": 0.12502894188469554, + "grad_norm": 3.2051329612731934, + "learning_rate": 4.377402176429729e-05, + "logits/chosen": -13.998117446899414, + "logits/rejected": -11.195394515991211, + "logps/chosen": -265.32000732421875, + "logps/rejected": -262.2739562988281, + "loss": 0.98, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -10.569400787353516, + "rewards/margins": 2.5353689193725586, + "rewards/rejected": -13.10477066040039, + "step": 2700 + }, + { + "epoch": 0.12965964343598055, + "grad_norm": 82.38189697265625, + "learning_rate": 4.354248668673304e-05, + "logits/chosen": -14.051268577575684, + "logits/rejected": -11.348038673400879, + "logps/chosen": -271.5502014160156, + "logps/rejected": -274.1068115234375, + "loss": 1.3392, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -12.65804672241211, + "rewards/margins": 2.2346200942993164, + "rewards/rejected": -14.892668724060059, + "step": 2800 + }, + { + "epoch": 0.13429034498726558, + "grad_norm": 0.8718699812889099, + "learning_rate": 4.3310951609168794e-05, + "logits/chosen": -13.869842529296875, + "logits/rejected": -10.826061248779297, + "logps/chosen": -257.593994140625, + "logps/rejected": -272.55975341796875, + "loss": 0.9159, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.715877532958984, + "rewards/margins": 2.9527039527893066, + "rewards/rejected": -13.668583869934082, + "step": 2900 + }, + { + "epoch": 0.13892104653855059, + "grad_norm": 0.9077057242393494, + "learning_rate": 4.3079416531604535e-05, + "logits/chosen": -12.1543550491333, + "logits/rejected": -10.162296295166016, + "logps/chosen": -303.01409912109375, + "logps/rejected": -313.34844970703125, + "loss": 1.2139, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -13.634842872619629, + "rewards/margins": 2.48551082611084, + "rewards/rejected": -16.12035369873047, + "step": 3000 + }, + { + "epoch": 0.13892104653855059, + "eval_logits/chosen": -13.600151062011719, + "eval_logits/rejected": -11.181111335754395, + "eval_logps/chosen": -264.4333801269531, + "eval_logps/rejected": -276.6205749511719, + "eval_loss": 1.0894662141799927, + "eval_rewards/accuracies": 0.6833333373069763, + "eval_rewards/chosen": -11.077425956726074, + "eval_rewards/margins": 2.6029253005981445, + "eval_rewards/rejected": -13.680350303649902, + "eval_runtime": 595.3502, + "eval_samples_per_second": 4.031, + "eval_steps_per_second": 4.031, + "step": 3000 + }, + { + "epoch": 0.14355174808983562, + "grad_norm": 118.16087341308594, + "learning_rate": 4.284788145404029e-05, + "logits/chosen": -12.824311256408691, + "logits/rejected": -10.598331451416016, + "logps/chosen": -287.16229248046875, + "logps/rejected": -296.9625244140625, + "loss": 1.3294, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": -13.0645751953125, + "rewards/margins": 2.751075029373169, + "rewards/rejected": -15.815651893615723, + "step": 3100 + }, + { + "epoch": 0.14818244964112062, + "grad_norm": 131.24530029296875, + "learning_rate": 4.261634637647604e-05, + "logits/chosen": -12.044851303100586, + "logits/rejected": -10.322354316711426, + "logps/chosen": -263.2828674316406, + "logps/rejected": -278.4976806640625, + "loss": 0.8734, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -13.411654472351074, + "rewards/margins": 2.831801652908325, + "rewards/rejected": -16.24345588684082, + "step": 3200 + }, + { + "epoch": 0.15281315119240566, + "grad_norm": 0.012756047770380974, + "learning_rate": 4.2384811298911786e-05, + "logits/chosen": -13.847241401672363, + "logits/rejected": -10.280381202697754, + "logps/chosen": -289.5423889160156, + "logps/rejected": -274.0093994140625, + "loss": 1.238, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -11.928001403808594, + "rewards/margins": 2.2357709407806396, + "rewards/rejected": -14.163771629333496, + "step": 3300 + }, + { + "epoch": 0.15744385274369066, + "grad_norm": 74.41487884521484, + "learning_rate": 4.215327622134754e-05, + "logits/chosen": -13.10720443725586, + "logits/rejected": -10.8968505859375, + "logps/chosen": -238.0747528076172, + "logps/rejected": -262.4646301269531, + "loss": 0.8596, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -10.469843864440918, + "rewards/margins": 2.7977781295776367, + "rewards/rejected": -13.267622947692871, + "step": 3400 + }, + { + "epoch": 0.1620745542949757, + "grad_norm": 45.10124206542969, + "learning_rate": 4.192174114378328e-05, + "logits/chosen": -13.744721412658691, + "logits/rejected": -11.368577003479004, + "logps/chosen": -264.2241516113281, + "logps/rejected": -258.3453063964844, + "loss": 1.384, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -10.90272331237793, + "rewards/margins": 1.433777928352356, + "rewards/rejected": -12.33650016784668, + "step": 3500 + }, + { + "epoch": 0.1667052558462607, + "grad_norm": 0.9729955196380615, + "learning_rate": 4.1690206066219036e-05, + "logits/chosen": -13.980158805847168, + "logits/rejected": -11.490269660949707, + "logps/chosen": -263.657958984375, + "logps/rejected": -263.7289733886719, + "loss": 0.8817, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -11.24023723602295, + "rewards/margins": 2.3780357837677, + "rewards/rejected": -13.61827278137207, + "step": 3600 + }, + { + "epoch": 0.17133595739754573, + "grad_norm": 104.15093231201172, + "learning_rate": 4.1458670988654784e-05, + "logits/chosen": -16.399433135986328, + "logits/rejected": -13.435510635375977, + "logps/chosen": -229.3163299560547, + "logps/rejected": -235.4849853515625, + "loss": 1.0346, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -9.024942398071289, + "rewards/margins": 2.8823766708374023, + "rewards/rejected": -11.907319068908691, + "step": 3700 + }, + { + "epoch": 0.17596665894883073, + "grad_norm": 0.09079485386610031, + "learning_rate": 4.122713591109053e-05, + "logits/chosen": -14.557435035705566, + "logits/rejected": -12.38566780090332, + "logps/chosen": -252.38616943359375, + "logps/rejected": -257.95721435546875, + "loss": 1.2015, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -10.700638771057129, + "rewards/margins": 2.269381523132324, + "rewards/rejected": -12.970019340515137, + "step": 3800 + }, + { + "epoch": 0.18059736050011577, + "grad_norm": 1.7486906051635742, + "learning_rate": 4.099560083352628e-05, + "logits/chosen": -16.514432907104492, + "logits/rejected": -13.728776931762695, + "logps/chosen": -254.0413055419922, + "logps/rejected": -259.3268127441406, + "loss": 0.902, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -8.835321426391602, + "rewards/margins": 2.317606210708618, + "rewards/rejected": -11.15292739868164, + "step": 3900 + }, + { + "epoch": 0.1852280620514008, + "grad_norm": 84.05282592773438, + "learning_rate": 4.076406575596203e-05, + "logits/chosen": -16.174575805664062, + "logits/rejected": -12.256491661071777, + "logps/chosen": -261.49627685546875, + "logps/rejected": -262.43243408203125, + "loss": 0.9042, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.012048721313477, + "rewards/margins": 3.254835844039917, + "rewards/rejected": -13.266884803771973, + "step": 4000 + }, + { + "epoch": 0.1852280620514008, + "eval_logits/chosen": -13.876004219055176, + "eval_logits/rejected": -11.615591049194336, + "eval_logps/chosen": -272.5106506347656, + "eval_logps/rejected": -284.0415344238281, + "eval_loss": 1.0893065929412842, + "eval_rewards/accuracies": 0.6891666650772095, + "eval_rewards/chosen": -11.885148048400879, + "eval_rewards/margins": 2.5372982025146484, + "eval_rewards/rejected": -14.422446250915527, + "eval_runtime": 595.328, + "eval_samples_per_second": 4.031, + "eval_steps_per_second": 4.031, + "step": 4000 + }, + { + "epoch": 0.1898587636026858, + "grad_norm": 31.714935302734375, + "learning_rate": 4.053253067839778e-05, + "logits/chosen": -14.158173561096191, + "logits/rejected": -11.79055404663086, + "logps/chosen": -261.6609191894531, + "logps/rejected": -266.7125244140625, + "loss": 1.0238, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -10.325104713439941, + "rewards/margins": 2.4658026695251465, + "rewards/rejected": -12.790907859802246, + "step": 4100 + }, + { + "epoch": 0.19448946515397084, + "grad_norm": 0.1167730987071991, + "learning_rate": 4.030099560083353e-05, + "logits/chosen": -13.809327125549316, + "logits/rejected": -11.6063871383667, + "logps/chosen": -293.43170166015625, + "logps/rejected": -306.2814636230469, + "loss": 1.0791, + "rewards/accuracies": 0.7699999809265137, + "rewards/chosen": -13.137182235717773, + "rewards/margins": 3.19193172454834, + "rewards/rejected": -16.329113006591797, + "step": 4200 + }, + { + "epoch": 0.19912016670525584, + "grad_norm": 108.64598846435547, + "learning_rate": 4.006946052326928e-05, + "logits/chosen": -13.153284072875977, + "logits/rejected": -11.617776870727539, + "logps/chosen": -275.723876953125, + "logps/rejected": -291.53851318359375, + "loss": 1.2784, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -11.406493186950684, + "rewards/margins": 1.6062767505645752, + "rewards/rejected": -13.012768745422363, + "step": 4300 + }, + { + "epoch": 0.20375086825654087, + "grad_norm": 20.68520164489746, + "learning_rate": 3.9837925445705025e-05, + "logits/chosen": -11.738310813903809, + "logits/rejected": -10.779892921447754, + "logps/chosen": -269.1992492675781, + "logps/rejected": -300.6120300292969, + "loss": 0.964, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -12.769682884216309, + "rewards/margins": 2.6736278533935547, + "rewards/rejected": -15.443307876586914, + "step": 4400 + }, + { + "epoch": 0.20838156980782588, + "grad_norm": 2.7133309841156006, + "learning_rate": 3.960639036814078e-05, + "logits/chosen": -12.766013145446777, + "logits/rejected": -10.480879783630371, + "logps/chosen": -344.195068359375, + "logps/rejected": -360.9930725097656, + "loss": 1.1572, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -18.676206588745117, + "rewards/margins": 2.426331043243408, + "rewards/rejected": -21.1025390625, + "step": 4500 + }, + { + "epoch": 0.2130122713591109, + "grad_norm": 116.57975769042969, + "learning_rate": 3.937485529057652e-05, + "logits/chosen": -13.390982627868652, + "logits/rejected": -11.077901840209961, + "logps/chosen": -304.98016357421875, + "logps/rejected": -315.5147399902344, + "loss": 0.8711, + "rewards/accuracies": 0.7900000214576721, + "rewards/chosen": -13.933846473693848, + "rewards/margins": 3.604663133621216, + "rewards/rejected": -17.538511276245117, + "step": 4600 + }, + { + "epoch": 0.21764297291039592, + "grad_norm": 0.0504024475812912, + "learning_rate": 3.9143320213012276e-05, + "logits/chosen": -13.784337997436523, + "logits/rejected": -11.406414985656738, + "logps/chosen": -289.4490661621094, + "logps/rejected": -298.804443359375, + "loss": 1.2294, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -13.892451286315918, + "rewards/margins": 3.4186654090881348, + "rewards/rejected": -17.31111717224121, + "step": 4700 + }, + { + "epoch": 0.22227367446168095, + "grad_norm": 99.90382385253906, + "learning_rate": 3.891178513544802e-05, + "logits/chosen": -14.787217140197754, + "logits/rejected": -13.069499969482422, + "logps/chosen": -251.93524169921875, + "logps/rejected": -260.898681640625, + "loss": 1.4711, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": -11.366158485412598, + "rewards/margins": 1.8550819158554077, + "rewards/rejected": -13.221240043640137, + "step": 4800 + }, + { + "epoch": 0.22690437601296595, + "grad_norm": 46.121463775634766, + "learning_rate": 3.868025005788377e-05, + "logits/chosen": -16.073060989379883, + "logits/rejected": -13.704066276550293, + "logps/chosen": -268.179443359375, + "logps/rejected": -277.36944580078125, + "loss": 0.8299, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -10.643136978149414, + "rewards/margins": 3.1942529678344727, + "rewards/rejected": -13.837389945983887, + "step": 4900 + }, + { + "epoch": 0.23153507756425099, + "grad_norm": 215.08323669433594, + "learning_rate": 3.844871498031952e-05, + "logits/chosen": -14.96874713897705, + "logits/rejected": -12.921075820922852, + "logps/chosen": -268.0001525878906, + "logps/rejected": -285.048583984375, + "loss": 1.134, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -12.303441047668457, + "rewards/margins": 2.4722583293914795, + "rewards/rejected": -14.7756986618042, + "step": 5000 + }, + { + "epoch": 0.23153507756425099, + "eval_logits/chosen": -15.224173545837402, + "eval_logits/rejected": -12.963865280151367, + "eval_logps/chosen": -270.43438720703125, + "eval_logps/rejected": -280.1809387207031, + "eval_loss": 1.1040797233581543, + "eval_rewards/accuracies": 0.6566666960716248, + "eval_rewards/chosen": -11.67752742767334, + "eval_rewards/margins": 2.3588619232177734, + "eval_rewards/rejected": -14.036388397216797, + "eval_runtime": 595.5539, + "eval_samples_per_second": 4.03, + "eval_steps_per_second": 4.03, + "step": 5000 + }, + { + "epoch": 0.23616577911553602, + "grad_norm": 4.563509355648421e-05, + "learning_rate": 3.821717990275527e-05, + "logits/chosen": -15.249272346496582, + "logits/rejected": -13.242805480957031, + "logps/chosen": -243.7661590576172, + "logps/rejected": -253.2401580810547, + "loss": 1.1956, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -9.967913627624512, + "rewards/margins": 1.7604706287384033, + "rewards/rejected": -11.728384017944336, + "step": 5100 + }, + { + "epoch": 0.24079648066682102, + "grad_norm": 5.94889497756958, + "learning_rate": 3.798564482519102e-05, + "logits/chosen": -15.72038459777832, + "logits/rejected": -12.727486610412598, + "logps/chosen": -264.8755187988281, + "logps/rejected": -278.22100830078125, + "loss": 0.9918, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -11.164973258972168, + "rewards/margins": 2.757608652114868, + "rewards/rejected": -13.922581672668457, + "step": 5200 + }, + { + "epoch": 0.24542718221810605, + "grad_norm": 58.313072204589844, + "learning_rate": 3.775410974762676e-05, + "logits/chosen": -15.393084526062012, + "logits/rejected": -12.0960111618042, + "logps/chosen": -262.96484375, + "logps/rejected": -269.2939453125, + "loss": 1.0712, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.024227142333984, + "rewards/margins": 2.7153475284576416, + "rewards/rejected": -13.739574432373047, + "step": 5300 + }, + { + "epoch": 0.2500578837693911, + "grad_norm": 48.545108795166016, + "learning_rate": 3.752257467006252e-05, + "logits/chosen": -11.685456275939941, + "logits/rejected": -10.465229034423828, + "logps/chosen": -265.57440185546875, + "logps/rejected": -289.9385986328125, + "loss": 0.9366, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -13.04489517211914, + "rewards/margins": 2.876924991607666, + "rewards/rejected": -15.921817779541016, + "step": 5400 + }, + { + "epoch": 0.25468858532067606, + "grad_norm": 149.3887481689453, + "learning_rate": 3.7291039592498265e-05, + "logits/chosen": -13.974499702453613, + "logits/rejected": -12.667045593261719, + "logps/chosen": -295.39923095703125, + "logps/rejected": -316.9970397949219, + "loss": 1.3305, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -14.188154220581055, + "rewards/margins": 2.394373655319214, + "rewards/rejected": -16.5825252532959, + "step": 5500 + }, + { + "epoch": 0.2593192868719611, + "grad_norm": 98.0199966430664, + "learning_rate": 3.705950451493401e-05, + "logits/chosen": -14.317160606384277, + "logits/rejected": -12.36131477355957, + "logps/chosen": -283.5146179199219, + "logps/rejected": -290.797119140625, + "loss": 1.2772, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -13.607783317565918, + "rewards/margins": 1.6482733488082886, + "rewards/rejected": -15.25605583190918, + "step": 5600 + }, + { + "epoch": 0.26394998842324613, + "grad_norm": 150.75015258789062, + "learning_rate": 3.682796943736976e-05, + "logits/chosen": -15.891270637512207, + "logits/rejected": -13.70472526550293, + "logps/chosen": -277.7356872558594, + "logps/rejected": -277.91168212890625, + "loss": 1.1193, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": -10.948705673217773, + "rewards/margins": 2.005187749862671, + "rewards/rejected": -12.953892707824707, + "step": 5700 + }, + { + "epoch": 0.26858068997453116, + "grad_norm": 113.30935668945312, + "learning_rate": 3.6596434359805515e-05, + "logits/chosen": -14.558771133422852, + "logits/rejected": -12.814410209655762, + "logps/chosen": -282.6718444824219, + "logps/rejected": -283.1080017089844, + "loss": 1.442, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": -13.232661247253418, + "rewards/margins": 1.6263720989227295, + "rewards/rejected": -14.859031677246094, + "step": 5800 + }, + { + "epoch": 0.27321139152581614, + "grad_norm": 165.05665588378906, + "learning_rate": 3.636489928224126e-05, + "logits/chosen": -13.373592376708984, + "logits/rejected": -12.674919128417969, + "logps/chosen": -270.6869201660156, + "logps/rejected": -289.0032958984375, + "loss": 1.3157, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": -13.579002380371094, + "rewards/margins": 1.3477805852890015, + "rewards/rejected": -14.926780700683594, + "step": 5900 + }, + { + "epoch": 0.27784209307710117, + "grad_norm": 28.589683532714844, + "learning_rate": 3.613336420467701e-05, + "logits/chosen": -14.952668190002441, + "logits/rejected": -13.118420600891113, + "logps/chosen": -267.6083679199219, + "logps/rejected": -269.6361389160156, + "loss": 0.9529, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": -11.644050598144531, + "rewards/margins": 1.825527310371399, + "rewards/rejected": -13.46957778930664, + "step": 6000 + }, + { + "epoch": 0.27784209307710117, + "eval_logits/chosen": -16.652177810668945, + "eval_logits/rejected": -14.431635856628418, + "eval_logps/chosen": -268.42169189453125, + "eval_logps/rejected": -274.84332275390625, + "eval_loss": 0.9690461158752441, + "eval_rewards/accuracies": 0.6704166531562805, + "eval_rewards/chosen": -11.476255416870117, + "eval_rewards/margins": 2.0263733863830566, + "eval_rewards/rejected": -13.502629280090332, + "eval_runtime": 595.3843, + "eval_samples_per_second": 4.031, + "eval_steps_per_second": 4.031, + "step": 6000 + }, + { + "epoch": 0.2824727946283862, + "grad_norm": 5.406942844390869, + "learning_rate": 3.590182912711276e-05, + "logits/chosen": -16.971216201782227, + "logits/rejected": -14.750784873962402, + "logps/chosen": -265.4952392578125, + "logps/rejected": -268.4944152832031, + "loss": 0.7441, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -12.107220649719238, + "rewards/margins": 2.3485963344573975, + "rewards/rejected": -14.455816268920898, + "step": 6100 + }, + { + "epoch": 0.28710349617967124, + "grad_norm": 4.571298599243164, + "learning_rate": 3.567029404954851e-05, + "logits/chosen": -15.671191215515137, + "logits/rejected": -14.07499885559082, + "logps/chosen": -264.6024475097656, + "logps/rejected": -272.0396423339844, + "loss": 0.8241, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -9.934189796447754, + "rewards/margins": 2.5992603302001953, + "rewards/rejected": -12.53345012664795, + "step": 6200 + }, + { + "epoch": 0.2917341977309562, + "grad_norm": 153.76095581054688, + "learning_rate": 3.543875897198426e-05, + "logits/chosen": -16.08559226989746, + "logits/rejected": -13.792272567749023, + "logps/chosen": -255.55722045898438, + "logps/rejected": -261.3247985839844, + "loss": 0.8513, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -9.566289901733398, + "rewards/margins": 2.1109235286712646, + "rewards/rejected": -11.677214622497559, + "step": 6300 + }, + { + "epoch": 0.29636489928224125, + "grad_norm": 2.1439866031869315e-05, + "learning_rate": 3.520722389442e-05, + "logits/chosen": -14.347354888916016, + "logits/rejected": -12.357396125793457, + "logps/chosen": -293.4591064453125, + "logps/rejected": -306.54376220703125, + "loss": 0.6611, + "rewards/accuracies": 0.8199999928474426, + "rewards/chosen": -13.621291160583496, + "rewards/margins": 4.165319919586182, + "rewards/rejected": -17.786611557006836, + "step": 6400 + }, + { + "epoch": 0.3009956008335263, + "grad_norm": 0.0006867141928523779, + "learning_rate": 3.497568881685576e-05, + "logits/chosen": -13.660686492919922, + "logits/rejected": -12.209990501403809, + "logps/chosen": -298.58905029296875, + "logps/rejected": -312.8013000488281, + "loss": 1.2808, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.760621070861816, + "rewards/margins": 2.044278860092163, + "rewards/rejected": -16.804899215698242, + "step": 6500 + }, + { + "epoch": 0.3056263023848113, + "grad_norm": 0.06333109736442566, + "learning_rate": 3.4744153739291505e-05, + "logits/chosen": -15.21036148071289, + "logits/rejected": -12.713698387145996, + "logps/chosen": -328.4114990234375, + "logps/rejected": -326.3374328613281, + "loss": 1.0761, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -14.99398422241211, + "rewards/margins": 3.227440118789673, + "rewards/rejected": -18.221426010131836, + "step": 6600 + }, + { + "epoch": 0.31025700393609634, + "grad_norm": 13.536566734313965, + "learning_rate": 3.451261866172725e-05, + "logits/chosen": -14.803274154663086, + "logits/rejected": -12.432111740112305, + "logps/chosen": -308.0420837402344, + "logps/rejected": -308.2493591308594, + "loss": 1.6725, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": -15.077004432678223, + "rewards/margins": 1.685246467590332, + "rewards/rejected": -16.762248992919922, + "step": 6700 + }, + { + "epoch": 0.3148877054873813, + "grad_norm": 98.5750503540039, + "learning_rate": 3.428108358416301e-05, + "logits/chosen": -16.611717224121094, + "logits/rejected": -14.931644439697266, + "logps/chosen": -242.56851196289062, + "logps/rejected": -245.1646728515625, + "loss": 1.2702, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -9.661548614501953, + "rewards/margins": 1.7692126035690308, + "rewards/rejected": -11.430761337280273, + "step": 6800 + }, + { + "epoch": 0.31951840703866635, + "grad_norm": 8.07032299041748, + "learning_rate": 3.404954850659875e-05, + "logits/chosen": -15.000127792358398, + "logits/rejected": -12.639293670654297, + "logps/chosen": -314.791259765625, + "logps/rejected": -336.9305725097656, + "loss": 1.106, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -15.519781112670898, + "rewards/margins": 3.476163387298584, + "rewards/rejected": -18.995946884155273, + "step": 6900 + }, + { + "epoch": 0.3241491085899514, + "grad_norm": 18.31216049194336, + "learning_rate": 3.38180134290345e-05, + "logits/chosen": -15.47805404663086, + "logits/rejected": -13.119900703430176, + "logps/chosen": -250.51080322265625, + "logps/rejected": -276.98236083984375, + "loss": 0.8653, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -11.675602912902832, + "rewards/margins": 3.392669916152954, + "rewards/rejected": -15.068273544311523, + "step": 7000 + }, + { + "epoch": 0.3241491085899514, + "eval_logits/chosen": -16.167076110839844, + "eval_logits/rejected": -13.826526641845703, + "eval_logps/chosen": -262.20208740234375, + "eval_logps/rejected": -274.491455078125, + "eval_loss": 1.0383297204971313, + "eval_rewards/accuracies": 0.6933333277702332, + "eval_rewards/chosen": -10.854294776916504, + "eval_rewards/margins": 2.6131441593170166, + "eval_rewards/rejected": -13.467439651489258, + "eval_runtime": 595.5855, + "eval_samples_per_second": 4.03, + "eval_steps_per_second": 4.03, + "step": 7000 + }, + { + "epoch": 0.3287798101412364, + "grad_norm": 8.22741985321045, + "learning_rate": 3.358647835147025e-05, + "logits/chosen": -16.461559295654297, + "logits/rejected": -13.514617919921875, + "logps/chosen": -277.9877014160156, + "logps/rejected": -275.4588928222656, + "loss": 1.0087, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -11.538105010986328, + "rewards/margins": 2.5826308727264404, + "rewards/rejected": -14.120736122131348, + "step": 7100 + }, + { + "epoch": 0.3334105116925214, + "grad_norm": 53.40534210205078, + "learning_rate": 3.3354943273906e-05, + "logits/chosen": -15.52739143371582, + "logits/rejected": -13.096809387207031, + "logps/chosen": -263.75750732421875, + "logps/rejected": -282.41973876953125, + "loss": 0.9069, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -11.735355377197266, + "rewards/margins": 3.6761889457702637, + "rewards/rejected": -15.411545753479004, + "step": 7200 + }, + { + "epoch": 0.3380412132438064, + "grad_norm": 3.0102253731456585e-05, + "learning_rate": 3.3123408196341746e-05, + "logits/chosen": -15.369466781616211, + "logits/rejected": -13.38189697265625, + "logps/chosen": -285.2251281738281, + "logps/rejected": -295.077392578125, + "loss": 1.3243, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -13.272722244262695, + "rewards/margins": 1.980672001838684, + "rewards/rejected": -15.253395080566406, + "step": 7300 + }, + { + "epoch": 0.34267191479509146, + "grad_norm": 28.470182418823242, + "learning_rate": 3.2891873118777494e-05, + "logits/chosen": -15.281991958618164, + "logits/rejected": -13.268410682678223, + "logps/chosen": -269.4937744140625, + "logps/rejected": -277.6639709472656, + "loss": 0.974, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -11.336699485778809, + "rewards/margins": 2.3826072216033936, + "rewards/rejected": -13.719305038452148, + "step": 7400 + }, + { + "epoch": 0.3473026163463765, + "grad_norm": 0.040093112736940384, + "learning_rate": 3.266033804121324e-05, + "logits/chosen": -15.32481575012207, + "logits/rejected": -13.589777946472168, + "logps/chosen": -289.1875305175781, + "logps/rejected": -314.6361389160156, + "loss": 1.0543, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -12.599404335021973, + "rewards/margins": 3.6570980548858643, + "rewards/rejected": -16.256502151489258, + "step": 7500 + }, + { + "epoch": 0.35193331789766147, + "grad_norm": 117.9237289428711, + "learning_rate": 3.2428802963649e-05, + "logits/chosen": -13.89665412902832, + "logits/rejected": -12.795659065246582, + "logps/chosen": -273.2151794433594, + "logps/rejected": -273.5435485839844, + "loss": 1.6158, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -12.329755783081055, + "rewards/margins": 1.4938496351242065, + "rewards/rejected": -13.823604583740234, + "step": 7600 + }, + { + "epoch": 0.3565640194489465, + "grad_norm": 107.03690338134766, + "learning_rate": 3.2197267886084745e-05, + "logits/chosen": -13.768420219421387, + "logits/rejected": -12.064899444580078, + "logps/chosen": -288.6653747558594, + "logps/rejected": -299.8011779785156, + "loss": 0.9657, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -13.14587116241455, + "rewards/margins": 2.6114940643310547, + "rewards/rejected": -15.757366180419922, + "step": 7700 + }, + { + "epoch": 0.36119472100023153, + "grad_norm": 13.562236785888672, + "learning_rate": 3.196573280852049e-05, + "logits/chosen": -14.771617889404297, + "logits/rejected": -13.586108207702637, + "logps/chosen": -243.8267822265625, + "logps/rejected": -270.68585205078125, + "loss": 1.0483, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -10.424483299255371, + "rewards/margins": 2.4393911361694336, + "rewards/rejected": -12.863875389099121, + "step": 7800 + }, + { + "epoch": 0.36582542255151657, + "grad_norm": 32.51217269897461, + "learning_rate": 3.173419773095624e-05, + "logits/chosen": -16.06950569152832, + "logits/rejected": -14.431178092956543, + "logps/chosen": -274.8634033203125, + "logps/rejected": -273.82330322265625, + "loss": 1.4736, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -11.861929893493652, + "rewards/margins": 1.6777317523956299, + "rewards/rejected": -13.539660453796387, + "step": 7900 + }, + { + "epoch": 0.3704561241028016, + "grad_norm": 3.5673348903656006, + "learning_rate": 3.150266265339199e-05, + "logits/chosen": -16.012493133544922, + "logits/rejected": -13.561750411987305, + "logps/chosen": -296.7126770019531, + "logps/rejected": -307.0336608886719, + "loss": 0.7154, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -11.784126281738281, + "rewards/margins": 3.737394094467163, + "rewards/rejected": -15.521519660949707, + "step": 8000 + }, + { + "epoch": 0.3704561241028016, + "eval_logits/chosen": -15.530220031738281, + "eval_logits/rejected": -13.350693702697754, + "eval_logps/chosen": -271.8744812011719, + "eval_logps/rejected": -283.94451904296875, + "eval_loss": 1.0585798025131226, + "eval_rewards/accuracies": 0.6854166388511658, + "eval_rewards/chosen": -11.821531295776367, + "eval_rewards/margins": 2.59121036529541, + "eval_rewards/rejected": -14.412742614746094, + "eval_runtime": 595.2814, + "eval_samples_per_second": 4.032, + "eval_steps_per_second": 4.032, + "step": 8000 + }, + { + "epoch": 0.3750868256540866, + "grad_norm": 3.183262825012207, + "learning_rate": 3.127112757582774e-05, + "logits/chosen": -16.365947723388672, + "logits/rejected": -13.736886024475098, + "logps/chosen": -274.3581237792969, + "logps/rejected": -268.3993225097656, + "loss": 1.0855, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -11.214922904968262, + "rewards/margins": 2.280700445175171, + "rewards/rejected": -13.495623588562012, + "step": 8100 + }, + { + "epoch": 0.3797175272053716, + "grad_norm": 131.38169860839844, + "learning_rate": 3.1039592498263484e-05, + "logits/chosen": -15.709159851074219, + "logits/rejected": -12.860796928405762, + "logps/chosen": -279.14532470703125, + "logps/rejected": -292.4508361816406, + "loss": 1.0698, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -12.19998550415039, + "rewards/margins": 3.0295968055725098, + "rewards/rejected": -15.229582786560059, + "step": 8200 + }, + { + "epoch": 0.38434822875665664, + "grad_norm": 5.40769624710083, + "learning_rate": 3.080805742069924e-05, + "logits/chosen": -15.032764434814453, + "logits/rejected": -12.852176666259766, + "logps/chosen": -275.3847351074219, + "logps/rejected": -290.7295227050781, + "loss": 0.8273, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -12.424884796142578, + "rewards/margins": 3.084066390991211, + "rewards/rejected": -15.508951187133789, + "step": 8300 + }, + { + "epoch": 0.3889789303079417, + "grad_norm": 68.88822937011719, + "learning_rate": 3.0576522343134986e-05, + "logits/chosen": -14.54586410522461, + "logits/rejected": -11.940185546875, + "logps/chosen": -265.1244201660156, + "logps/rejected": -278.01776123046875, + "loss": 1.115, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -11.760342597961426, + "rewards/margins": 2.771430253982544, + "rewards/rejected": -14.531773567199707, + "step": 8400 + }, + { + "epoch": 0.39360963185922665, + "grad_norm": 0.03446720167994499, + "learning_rate": 3.0344987265570734e-05, + "logits/chosen": -15.201501846313477, + "logits/rejected": -12.549232482910156, + "logps/chosen": -283.1265869140625, + "logps/rejected": -287.8479309082031, + "loss": 1.0616, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -12.81374740600586, + "rewards/margins": 2.7665340900421143, + "rewards/rejected": -15.580282211303711, + "step": 8500 + }, + { + "epoch": 0.3982403334105117, + "grad_norm": 161.14781188964844, + "learning_rate": 3.0113452188006485e-05, + "logits/chosen": -15.855225563049316, + "logits/rejected": -13.235614776611328, + "logps/chosen": -269.371337890625, + "logps/rejected": -254.53370666503906, + "loss": 1.5923, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -11.234640121459961, + "rewards/margins": 2.052427291870117, + "rewards/rejected": -13.287066459655762, + "step": 8600 + }, + { + "epoch": 0.4028710349617967, + "grad_norm": 0.12227249890565872, + "learning_rate": 2.988191711044223e-05, + "logits/chosen": -13.875014305114746, + "logits/rejected": -11.776965141296387, + "logps/chosen": -294.89019775390625, + "logps/rejected": -304.0113220214844, + "loss": 1.0747, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -13.667197227478027, + "rewards/margins": 2.635390043258667, + "rewards/rejected": -16.302587509155273, + "step": 8700 + }, + { + "epoch": 0.40750173651308175, + "grad_norm": 18.97279930114746, + "learning_rate": 2.965038203287798e-05, + "logits/chosen": -13.133578300476074, + "logits/rejected": -11.885417938232422, + "logps/chosen": -343.5578918457031, + "logps/rejected": -364.3144836425781, + "loss": 1.1518, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -18.133079528808594, + "rewards/margins": 2.30412220954895, + "rewards/rejected": -20.43720245361328, + "step": 8800 + }, + { + "epoch": 0.4121324380643667, + "grad_norm": 1.0884398221969604, + "learning_rate": 2.9418846955313732e-05, + "logits/chosen": -12.621952056884766, + "logits/rejected": -10.378700256347656, + "logps/chosen": -284.67498779296875, + "logps/rejected": -298.4495849609375, + "loss": 0.9243, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -13.914322853088379, + "rewards/margins": 2.796579360961914, + "rewards/rejected": -16.710901260375977, + "step": 8900 + }, + { + "epoch": 0.41676313961565176, + "grad_norm": 83.71758270263672, + "learning_rate": 2.918731187774948e-05, + "logits/chosen": -13.866020202636719, + "logits/rejected": -11.602230072021484, + "logps/chosen": -294.79998779296875, + "logps/rejected": -295.6817626953125, + "loss": 1.2985, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -13.578290939331055, + "rewards/margins": 2.169534206390381, + "rewards/rejected": -15.747827529907227, + "step": 9000 + }, + { + "epoch": 0.41676313961565176, + "eval_logits/chosen": -14.788281440734863, + "eval_logits/rejected": -12.487042427062988, + "eval_logps/chosen": -283.36846923828125, + "eval_logps/rejected": -294.3752136230469, + "eval_loss": 1.0318500995635986, + "eval_rewards/accuracies": 0.6791666746139526, + "eval_rewards/chosen": -12.970930099487305, + "eval_rewards/margins": 2.4848814010620117, + "eval_rewards/rejected": -15.455812454223633, + "eval_runtime": 595.2306, + "eval_samples_per_second": 4.032, + "eval_steps_per_second": 4.032, + "step": 9000 + }, + { + "epoch": 0.4213938411669368, + "grad_norm": 0.0032111050095409155, + "learning_rate": 2.895577680018523e-05, + "logits/chosen": -12.746328353881836, + "logits/rejected": -11.675999641418457, + "logps/chosen": -278.3330993652344, + "logps/rejected": -295.795166015625, + "loss": 1.163, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": -13.291616439819336, + "rewards/margins": 1.9494922161102295, + "rewards/rejected": -15.241106986999512, + "step": 9100 + }, + { + "epoch": 0.4260245427182218, + "grad_norm": 6.636563777923584, + "learning_rate": 2.8724241722620976e-05, + "logits/chosen": -13.147544860839844, + "logits/rejected": -10.731348037719727, + "logps/chosen": -280.5777282714844, + "logps/rejected": -294.8570556640625, + "loss": 0.9186, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -13.029483795166016, + "rewards/margins": 2.9215362071990967, + "rewards/rejected": -15.951019287109375, + "step": 9200 + }, + { + "epoch": 0.43065524426950685, + "grad_norm": 40.27287673950195, + "learning_rate": 2.8492706645056727e-05, + "logits/chosen": -14.783395767211914, + "logits/rejected": -12.629622459411621, + "logps/chosen": -288.9443054199219, + "logps/rejected": -288.2258605957031, + "loss": 1.5318, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -12.149921417236328, + "rewards/margins": 1.6422406435012817, + "rewards/rejected": -13.792163848876953, + "step": 9300 + }, + { + "epoch": 0.43528594582079183, + "grad_norm": 19.758710861206055, + "learning_rate": 2.8261171567492478e-05, + "logits/chosen": -15.180052757263184, + "logits/rejected": -12.836910247802734, + "logps/chosen": -277.3511657714844, + "logps/rejected": -276.3822326660156, + "loss": 0.8826, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.489418983459473, + "rewards/margins": 2.5768609046936035, + "rewards/rejected": -14.066280364990234, + "step": 9400 + }, + { + "epoch": 0.43991664737207686, + "grad_norm": 78.22869110107422, + "learning_rate": 2.8029636489928223e-05, + "logits/chosen": -14.848343849182129, + "logits/rejected": -12.628809928894043, + "logps/chosen": -265.226318359375, + "logps/rejected": -287.7568054199219, + "loss": 0.8833, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -12.058309555053711, + "rewards/margins": 3.1272084712982178, + "rewards/rejected": -15.185519218444824, + "step": 9500 + }, + { + "epoch": 0.4445473489233619, + "grad_norm": 11.517510414123535, + "learning_rate": 2.7798101412363974e-05, + "logits/chosen": -14.113899230957031, + "logits/rejected": -13.288124084472656, + "logps/chosen": -267.1159362792969, + "logps/rejected": -288.9775695800781, + "loss": 1.2618, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -13.201350212097168, + "rewards/margins": 2.0197267532348633, + "rewards/rejected": -15.221077919006348, + "step": 9600 + }, + { + "epoch": 0.44917805047464693, + "grad_norm": 0.0021336909849196672, + "learning_rate": 2.7566566334799725e-05, + "logits/chosen": -14.831360816955566, + "logits/rejected": -12.230374336242676, + "logps/chosen": -287.16168212890625, + "logps/rejected": -288.4497375488281, + "loss": 0.9279, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -12.792760848999023, + "rewards/margins": 2.720005989074707, + "rewards/rejected": -15.512765884399414, + "step": 9700 + }, + { + "epoch": 0.4538087520259319, + "grad_norm": 107.86576080322266, + "learning_rate": 2.7335031257235473e-05, + "logits/chosen": -14.760610580444336, + "logits/rejected": -12.295888900756836, + "logps/chosen": -286.934814453125, + "logps/rejected": -295.39581298828125, + "loss": 0.9565, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -12.014842987060547, + "rewards/margins": 3.125756025314331, + "rewards/rejected": -15.140597343444824, + "step": 9800 + }, + { + "epoch": 0.45843945357721694, + "grad_norm": 33.82047653198242, + "learning_rate": 2.7103496179671224e-05, + "logits/chosen": -15.7147798538208, + "logits/rejected": -13.075119972229004, + "logps/chosen": -277.96746826171875, + "logps/rejected": -290.1826477050781, + "loss": 0.7664, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.504081726074219, + "rewards/margins": 3.1116833686828613, + "rewards/rejected": -14.615765571594238, + "step": 9900 + }, + { + "epoch": 0.46307015512850197, + "grad_norm": 36.2800407409668, + "learning_rate": 2.687196110210697e-05, + "logits/chosen": -15.890725135803223, + "logits/rejected": -14.088423728942871, + "logps/chosen": -265.2686462402344, + "logps/rejected": -274.26129150390625, + "loss": 1.115, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -11.545877456665039, + "rewards/margins": 2.241971015930176, + "rewards/rejected": -13.787849426269531, + "step": 10000 + }, + { + "epoch": 0.46307015512850197, + "eval_logits/chosen": -16.213720321655273, + "eval_logits/rejected": -13.893430709838867, + "eval_logps/chosen": -266.6280517578125, + "eval_logps/rejected": -277.2411193847656, + "eval_loss": 0.9608184695243835, + "eval_rewards/accuracies": 0.6929166913032532, + "eval_rewards/chosen": -11.296893119812012, + "eval_rewards/margins": 2.4455130100250244, + "eval_rewards/rejected": -13.742403984069824, + "eval_runtime": 595.4169, + "eval_samples_per_second": 4.031, + "eval_steps_per_second": 4.031, + "step": 10000 + }, + { + "epoch": 0.467700856679787, + "grad_norm": 19.66727066040039, + "learning_rate": 2.664042602454272e-05, + "logits/chosen": -15.256206512451172, + "logits/rejected": -13.022037506103516, + "logps/chosen": -262.89752197265625, + "logps/rejected": -279.8008117675781, + "loss": 1.2362, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -12.055992126464844, + "rewards/margins": 2.9693291187286377, + "rewards/rejected": -15.025321960449219, + "step": 10100 + }, + { + "epoch": 0.47233155823107204, + "grad_norm": 117.48945617675781, + "learning_rate": 2.640889094697847e-05, + "logits/chosen": -15.561003684997559, + "logits/rejected": -13.633736610412598, + "logps/chosen": -261.46624755859375, + "logps/rejected": -266.1573181152344, + "loss": 1.265, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -12.366207122802734, + "rewards/margins": 2.0377037525177, + "rewards/rejected": -14.403912544250488, + "step": 10200 + }, + { + "epoch": 0.476962259782357, + "grad_norm": 94.04891204833984, + "learning_rate": 2.6177355869414215e-05, + "logits/chosen": -14.99539566040039, + "logits/rejected": -13.923880577087402, + "logps/chosen": -283.6234436035156, + "logps/rejected": -304.1859130859375, + "loss": 1.0018, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -11.769340515136719, + "rewards/margins": 2.5206191539764404, + "rewards/rejected": -14.289960861206055, + "step": 10300 + }, + { + "epoch": 0.48159296133364204, + "grad_norm": 170.051513671875, + "learning_rate": 2.5945820791849967e-05, + "logits/chosen": -14.865379333496094, + "logits/rejected": -13.41491413116455, + "logps/chosen": -291.048828125, + "logps/rejected": -303.6175842285156, + "loss": 1.5362, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -13.860764503479004, + "rewards/margins": 2.151341676712036, + "rewards/rejected": -16.012107849121094, + "step": 10400 + }, + { + "epoch": 0.4862236628849271, + "grad_norm": 185.730712890625, + "learning_rate": 2.5714285714285714e-05, + "logits/chosen": -16.53899383544922, + "logits/rejected": -14.344609260559082, + "logps/chosen": -253.43275451660156, + "logps/rejected": -259.32000732421875, + "loss": 0.8832, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -10.720047950744629, + "rewards/margins": 2.231088876724243, + "rewards/rejected": -12.95113754272461, + "step": 10500 + }, + { + "epoch": 0.4908543644362121, + "grad_norm": 49.539283752441406, + "learning_rate": 2.5482750636721466e-05, + "logits/chosen": -15.784833908081055, + "logits/rejected": -13.199982643127441, + "logps/chosen": -258.3516845703125, + "logps/rejected": -272.4159851074219, + "loss": 0.882, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -10.969993591308594, + "rewards/margins": 3.263814687728882, + "rewards/rejected": -14.233808517456055, + "step": 10600 + }, + { + "epoch": 0.4954850659874971, + "grad_norm": 39.54684829711914, + "learning_rate": 2.5251215559157217e-05, + "logits/chosen": -12.431325912475586, + "logits/rejected": -10.826537132263184, + "logps/chosen": -282.0684814453125, + "logps/rejected": -307.7406921386719, + "loss": 0.834, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -13.165081977844238, + "rewards/margins": 3.1749138832092285, + "rewards/rejected": -16.339994430541992, + "step": 10700 + }, + { + "epoch": 0.5001157675387822, + "grad_norm": 56.96760940551758, + "learning_rate": 2.501968048159296e-05, + "logits/chosen": -11.492157936096191, + "logits/rejected": -10.02319049835205, + "logps/chosen": -316.1195068359375, + "logps/rejected": -327.64093017578125, + "loss": 1.2565, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -16.742006301879883, + "rewards/margins": 2.2133688926696777, + "rewards/rejected": -18.95537567138672, + "step": 10800 + }, + { + "epoch": 0.5047464690900672, + "grad_norm": 18.938976287841797, + "learning_rate": 2.4788145404028713e-05, + "logits/chosen": -13.359079360961914, + "logits/rejected": -11.073596954345703, + "logps/chosen": -300.6905212402344, + "logps/rejected": -313.9961853027344, + "loss": 0.9312, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.300127029418945, + "rewards/margins": 3.4490301609039307, + "rewards/rejected": -18.749156951904297, + "step": 10900 + }, + { + "epoch": 0.5093771706413521, + "grad_norm": 114.1439208984375, + "learning_rate": 2.455661032646446e-05, + "logits/chosen": -12.819533348083496, + "logits/rejected": -11.759612083435059, + "logps/chosen": -281.001708984375, + "logps/rejected": -290.2948303222656, + "loss": 1.247, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": -12.776802062988281, + "rewards/margins": 2.128657817840576, + "rewards/rejected": -14.9054594039917, + "step": 11000 + }, + { + "epoch": 0.5093771706413521, + "eval_logits/chosen": -14.590534210205078, + "eval_logits/rejected": -12.480382919311523, + "eval_logps/chosen": -255.2227020263672, + "eval_logps/rejected": -267.7187194824219, + "eval_loss": 0.9643799066543579, + "eval_rewards/accuracies": 0.6891666650772095, + "eval_rewards/chosen": -10.15635871887207, + "eval_rewards/margins": 2.6338088512420654, + "eval_rewards/rejected": -12.790166854858398, + "eval_runtime": 595.3576, + "eval_samples_per_second": 4.031, + "eval_steps_per_second": 4.031, + "step": 11000 + }, + { + "epoch": 0.5140078721926372, + "grad_norm": 8.612520217895508, + "learning_rate": 2.4325075248900208e-05, + "logits/chosen": -14.783145904541016, + "logits/rejected": -12.533705711364746, + "logps/chosen": -256.94061279296875, + "logps/rejected": -258.1162414550781, + "loss": 1.0966, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -9.744736671447754, + "rewards/margins": 2.184854030609131, + "rewards/rejected": -11.92959213256836, + "step": 11100 + }, + { + "epoch": 0.5186385737439222, + "grad_norm": 0.05390896648168564, + "learning_rate": 2.409354017133596e-05, + "logits/chosen": -13.70276165008545, + "logits/rejected": -11.86025619506836, + "logps/chosen": -299.79901123046875, + "logps/rejected": -312.7564392089844, + "loss": 1.143, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -14.02008056640625, + "rewards/margins": 3.1246557235717773, + "rewards/rejected": -17.144737243652344, + "step": 11200 + }, + { + "epoch": 0.5232692752952072, + "grad_norm": 9.781183242797852, + "learning_rate": 2.3862005093771707e-05, + "logits/chosen": -13.585820198059082, + "logits/rejected": -11.637076377868652, + "logps/chosen": -319.637451171875, + "logps/rejected": -331.197509765625, + "loss": 1.268, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -16.022397994995117, + "rewards/margins": 2.7055411338806152, + "rewards/rejected": -18.72793960571289, + "step": 11300 + }, + { + "epoch": 0.5278999768464923, + "grad_norm": 0.5222776532173157, + "learning_rate": 2.3630470016207455e-05, + "logits/chosen": -15.22104263305664, + "logits/rejected": -13.836353302001953, + "logps/chosen": -299.341552734375, + "logps/rejected": -308.09967041015625, + "loss": 1.5426, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.346240043640137, + "rewards/margins": 2.032839775085449, + "rewards/rejected": -16.37908172607422, + "step": 11400 + }, + { + "epoch": 0.5325306783977772, + "grad_norm": 8.653938293457031, + "learning_rate": 2.3398934938643206e-05, + "logits/chosen": -15.450213432312012, + "logits/rejected": -13.667362213134766, + "logps/chosen": -278.2622375488281, + "logps/rejected": -309.5611572265625, + "loss": 1.1489, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -12.77297592163086, + "rewards/margins": 3.425985813140869, + "rewards/rejected": -16.19896125793457, + "step": 11500 + }, + { + "epoch": 0.5371613799490623, + "grad_norm": 0.0004895396414212883, + "learning_rate": 2.3167399861078954e-05, + "logits/chosen": -13.779914855957031, + "logits/rejected": -12.196557998657227, + "logps/chosen": -301.4092102050781, + "logps/rejected": -328.33807373046875, + "loss": 0.9485, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -15.266792297363281, + "rewards/margins": 3.053185224533081, + "rewards/rejected": -18.319976806640625, + "step": 11600 + }, + { + "epoch": 0.5417920815003473, + "grad_norm": 9.483084678649902, + "learning_rate": 2.2935864783514705e-05, + "logits/chosen": -13.6648588180542, + "logits/rejected": -12.20712661743164, + "logps/chosen": -316.0234069824219, + "logps/rejected": -321.4068298339844, + "loss": 1.0788, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -15.333993911743164, + "rewards/margins": 3.2509260177612305, + "rewards/rejected": -18.584922790527344, + "step": 11700 + }, + { + "epoch": 0.5464227830516323, + "grad_norm": 0.00022494388394989073, + "learning_rate": 2.2704329705950453e-05, + "logits/chosen": -13.692609786987305, + "logits/rejected": -11.759315490722656, + "logps/chosen": -280.59149169921875, + "logps/rejected": -304.33343505859375, + "loss": 1.2103, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -13.981989860534668, + "rewards/margins": 3.3563461303710938, + "rewards/rejected": -17.338335037231445, + "step": 11800 + }, + { + "epoch": 0.5510534846029174, + "grad_norm": 209.61117553710938, + "learning_rate": 2.24727946283862e-05, + "logits/chosen": -13.98275375366211, + "logits/rejected": -12.51055908203125, + "logps/chosen": -289.4585266113281, + "logps/rejected": -311.2178955078125, + "loss": 0.8208, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -13.634072303771973, + "rewards/margins": 2.5867316722869873, + "rewards/rejected": -16.220806121826172, + "step": 11900 + }, + { + "epoch": 0.5556841861542023, + "grad_norm": 8.679315567016602, + "learning_rate": 2.224125955082195e-05, + "logits/chosen": -15.948921203613281, + "logits/rejected": -13.961400985717773, + "logps/chosen": -239.7647247314453, + "logps/rejected": -265.7496032714844, + "loss": 0.8552, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -9.785449028015137, + "rewards/margins": 3.496326208114624, + "rewards/rejected": -13.281774520874023, + "step": 12000 + }, + { + "epoch": 0.5556841861542023, + "eval_logits/chosen": -16.915624618530273, + "eval_logits/rejected": -14.572998046875, + "eval_logps/chosen": -249.591064453125, + "eval_logps/rejected": -262.7091064453125, + "eval_loss": 0.9432744383811951, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -9.593189239501953, + "eval_rewards/margins": 2.6960136890411377, + "eval_rewards/rejected": -12.289203643798828, + "eval_runtime": 595.3295, + "eval_samples_per_second": 4.031, + "eval_steps_per_second": 4.031, + "step": 12000 + }, + { + "epoch": 0.5603148877054874, + "grad_norm": 0.0012331042671576142, + "learning_rate": 2.20097244732577e-05, + "logits/chosen": -16.262012481689453, + "logits/rejected": -13.913456916809082, + "logps/chosen": -255.302734375, + "logps/rejected": -258.9942626953125, + "loss": 0.735, + "rewards/accuracies": 0.7599999904632568, + "rewards/chosen": -10.057559967041016, + "rewards/margins": 2.793912410736084, + "rewards/rejected": -12.851473808288574, + "step": 12100 + }, + { + "epoch": 0.5649455892567724, + "grad_norm": 2.753217631834559e-05, + "learning_rate": 2.1778189395693448e-05, + "logits/chosen": -14.808769226074219, + "logits/rejected": -13.047616958618164, + "logps/chosen": -283.38848876953125, + "logps/rejected": -289.1595764160156, + "loss": 0.9373, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -12.686135292053223, + "rewards/margins": 2.9538357257843018, + "rewards/rejected": -15.639970779418945, + "step": 12200 + }, + { + "epoch": 0.5695762908080574, + "grad_norm": 104.78581237792969, + "learning_rate": 2.1546654318129196e-05, + "logits/chosen": -16.803150177001953, + "logits/rejected": -13.628451347351074, + "logps/chosen": -308.3251037597656, + "logps/rejected": -310.8792419433594, + "loss": 1.1489, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -13.29080581665039, + "rewards/margins": 3.165923833847046, + "rewards/rejected": -16.45673179626465, + "step": 12300 + }, + { + "epoch": 0.5742069923593425, + "grad_norm": 150.4987030029297, + "learning_rate": 2.1315119240564947e-05, + "logits/chosen": -17.27256965637207, + "logits/rejected": -15.5678129196167, + "logps/chosen": -275.7103271484375, + "logps/rejected": -295.6195983886719, + "loss": 1.068, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -12.30087661743164, + "rewards/margins": 2.3663229942321777, + "rewards/rejected": -14.667201042175293, + "step": 12400 + }, + { + "epoch": 0.5788376939106274, + "grad_norm": 133.8286590576172, + "learning_rate": 2.1083584163000698e-05, + "logits/chosen": -16.757888793945312, + "logits/rejected": -14.104558944702148, + "logps/chosen": -300.20166015625, + "logps/rejected": -296.3144836425781, + "loss": 1.1399, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -13.082332611083984, + "rewards/margins": 2.939077377319336, + "rewards/rejected": -16.021408081054688, + "step": 12500 + }, + { + "epoch": 0.5834683954619124, + "grad_norm": 0.008064119145274162, + "learning_rate": 2.0852049085436446e-05, + "logits/chosen": -16.91384506225586, + "logits/rejected": -14.279358863830566, + "logps/chosen": -287.9043273925781, + "logps/rejected": -298.12799072265625, + "loss": 1.1943, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -11.913217544555664, + "rewards/margins": 3.4037926197052, + "rewards/rejected": -15.317008972167969, + "step": 12600 + }, + { + "epoch": 0.5880990970131975, + "grad_norm": 111.3694076538086, + "learning_rate": 2.0620514007872194e-05, + "logits/chosen": -15.357534408569336, + "logits/rejected": -13.22323226928711, + "logps/chosen": -286.35333251953125, + "logps/rejected": -292.608642578125, + "loss": 0.8879, + "rewards/accuracies": 0.7599999904632568, + "rewards/chosen": -13.575251579284668, + "rewards/margins": 3.244809627532959, + "rewards/rejected": -16.8200626373291, + "step": 12700 + }, + { + "epoch": 0.5927297985644825, + "grad_norm": 0.017219742760062218, + "learning_rate": 2.0388978930307942e-05, + "logits/chosen": -17.543432235717773, + "logits/rejected": -14.663466453552246, + "logps/chosen": -289.28436279296875, + "logps/rejected": -293.8553771972656, + "loss": 1.1143, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -12.925048828125, + "rewards/margins": 3.078773260116577, + "rewards/rejected": -16.003822326660156, + "step": 12800 + }, + { + "epoch": 0.5973605001157676, + "grad_norm": 2.288099765777588, + "learning_rate": 2.015744385274369e-05, + "logits/chosen": -16.486202239990234, + "logits/rejected": -13.849590301513672, + "logps/chosen": -280.4773864746094, + "logps/rejected": -303.1964111328125, + "loss": 0.9143, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -11.8477144241333, + "rewards/margins": 3.704777717590332, + "rewards/rejected": -15.55249309539795, + "step": 12900 + }, + { + "epoch": 0.6019912016670526, + "grad_norm": 0.00016069311823230237, + "learning_rate": 1.992590877517944e-05, + "logits/chosen": -15.689356803894043, + "logits/rejected": -13.36341667175293, + "logps/chosen": -289.8409729003906, + "logps/rejected": -305.40447998046875, + "loss": 1.0207, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -13.258853912353516, + "rewards/margins": 3.5280921459198, + "rewards/rejected": -16.786945343017578, + "step": 13000 + }, + { + "epoch": 0.6019912016670526, + "eval_logits/chosen": -14.765463829040527, + "eval_logits/rejected": -12.787005424499512, + "eval_logps/chosen": -291.5428466796875, + "eval_logps/rejected": -308.170654296875, + "eval_loss": 1.0707193613052368, + "eval_rewards/accuracies": 0.6933333277702332, + "eval_rewards/chosen": -13.788372039794922, + "eval_rewards/margins": 3.046985626220703, + "eval_rewards/rejected": -16.835355758666992, + "eval_runtime": 595.1873, + "eval_samples_per_second": 4.032, + "eval_steps_per_second": 4.032, + "step": 13000 + }, + { + "epoch": 0.6066219032183375, + "grad_norm": 0.046535275876522064, + "learning_rate": 1.969437369761519e-05, + "logits/chosen": -15.378928184509277, + "logits/rejected": -12.789362907409668, + "logps/chosen": -309.1998291015625, + "logps/rejected": -323.6488037109375, + "loss": 0.9008, + "rewards/accuracies": 0.7699999809265137, + "rewards/chosen": -13.331801414489746, + "rewards/margins": 3.8759450912475586, + "rewards/rejected": -17.207748413085938, + "step": 13100 + }, + { + "epoch": 0.6112526047696226, + "grad_norm": 1.3174071311950684, + "learning_rate": 1.946283862005094e-05, + "logits/chosen": -14.426813125610352, + "logits/rejected": -12.497895240783691, + "logps/chosen": -283.8373718261719, + "logps/rejected": -290.7317199707031, + "loss": 1.187, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -12.359267234802246, + "rewards/margins": 3.016118049621582, + "rewards/rejected": -15.375384330749512, + "step": 13200 + }, + { + "epoch": 0.6158833063209076, + "grad_norm": 100.655517578125, + "learning_rate": 1.9231303542486688e-05, + "logits/chosen": -15.149957656860352, + "logits/rejected": -14.034040451049805, + "logps/chosen": -258.75225830078125, + "logps/rejected": -281.8547668457031, + "loss": 1.4425, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -10.893335342407227, + "rewards/margins": 2.794825553894043, + "rewards/rejected": -13.688159942626953, + "step": 13300 + }, + { + "epoch": 0.6205140078721927, + "grad_norm": 15.705586433410645, + "learning_rate": 1.899976846492244e-05, + "logits/chosen": -15.395785331726074, + "logits/rejected": -13.503198623657227, + "logps/chosen": -251.60552978515625, + "logps/rejected": -264.2630920410156, + "loss": 1.0821, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -9.937586784362793, + "rewards/margins": 2.0965304374694824, + "rewards/rejected": -12.034117698669434, + "step": 13400 + }, + { + "epoch": 0.6251447094234777, + "grad_norm": 69.60533905029297, + "learning_rate": 1.8768233387358187e-05, + "logits/chosen": -16.102800369262695, + "logits/rejected": -13.624198913574219, + "logps/chosen": -270.32696533203125, + "logps/rejected": -277.168701171875, + "loss": 1.3105, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -10.82644271850586, + "rewards/margins": 2.253209352493286, + "rewards/rejected": -13.079649925231934, + "step": 13500 + }, + { + "epoch": 0.6297754109747626, + "grad_norm": 1.8112232282874174e-05, + "learning_rate": 1.8536698309793935e-05, + "logits/chosen": -15.864636421203613, + "logits/rejected": -13.777695655822754, + "logps/chosen": -267.03643798828125, + "logps/rejected": -290.5374755859375, + "loss": 0.8846, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -11.370699882507324, + "rewards/margins": 2.5179872512817383, + "rewards/rejected": -13.888686180114746, + "step": 13600 + }, + { + "epoch": 0.6344061125260477, + "grad_norm": 3.8680782318115234, + "learning_rate": 1.8305163232229682e-05, + "logits/chosen": -14.683868408203125, + "logits/rejected": -13.11482048034668, + "logps/chosen": -297.38006591796875, + "logps/rejected": -313.8203125, + "loss": 1.0431, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.36437702178955, + "rewards/margins": 2.6509974002838135, + "rewards/rejected": -17.01537322998047, + "step": 13700 + }, + { + "epoch": 0.6390368140773327, + "grad_norm": 8.920862197875977, + "learning_rate": 1.8073628154665434e-05, + "logits/chosen": -14.917624473571777, + "logits/rejected": -13.442770957946777, + "logps/chosen": -280.73907470703125, + "logps/rejected": -293.82025146484375, + "loss": 1.152, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -12.922651290893555, + "rewards/margins": 2.584465265274048, + "rewards/rejected": -15.507116317749023, + "step": 13800 + }, + { + "epoch": 0.6436675156286177, + "grad_norm": 0.016613028943538666, + "learning_rate": 1.784209307710118e-05, + "logits/chosen": -14.548389434814453, + "logits/rejected": -12.599615097045898, + "logps/chosen": -251.54917907714844, + "logps/rejected": -274.4599914550781, + "loss": 1.1302, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -11.875404357910156, + "rewards/margins": 2.626995801925659, + "rewards/rejected": -14.502399444580078, + "step": 13900 + }, + { + "epoch": 0.6482982171799028, + "grad_norm": 1.4019454717636108, + "learning_rate": 1.761055799953693e-05, + "logits/chosen": -15.579767227172852, + "logits/rejected": -13.13071060180664, + "logps/chosen": -302.3855895996094, + "logps/rejected": -302.752197265625, + "loss": 1.1515, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -12.90218734741211, + "rewards/margins": 2.74299693107605, + "rewards/rejected": -15.645181655883789, + "step": 14000 + }, + { + "epoch": 0.6482982171799028, + "eval_logits/chosen": -14.666524887084961, + "eval_logits/rejected": -12.669315338134766, + "eval_logps/chosen": -285.93170166015625, + "eval_logps/rejected": -303.45745849609375, + "eval_loss": 0.9831567406654358, + "eval_rewards/accuracies": 0.7083333134651184, + "eval_rewards/chosen": -13.227254867553711, + "eval_rewards/margins": 3.13678240776062, + "eval_rewards/rejected": -16.36404037475586, + "eval_runtime": 595.2283, + "eval_samples_per_second": 4.032, + "eval_steps_per_second": 4.032, + "step": 14000 + }, + { + "epoch": 0.6529289187311877, + "grad_norm": 0.014233123511075974, + "learning_rate": 1.737902292197268e-05, + "logits/chosen": -14.30229377746582, + "logits/rejected": -12.721465110778809, + "logps/chosen": -298.66680908203125, + "logps/rejected": -312.6944274902344, + "loss": 1.4831, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": -14.223825454711914, + "rewards/margins": 1.9816741943359375, + "rewards/rejected": -16.20549964904785, + "step": 14100 + }, + { + "epoch": 0.6575596202824728, + "grad_norm": 13.379115104675293, + "learning_rate": 1.714748784440843e-05, + "logits/chosen": -12.73404312133789, + "logits/rejected": -12.431059837341309, + "logps/chosen": -303.9568786621094, + "logps/rejected": -315.571533203125, + "loss": 1.5352, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": -15.876431465148926, + "rewards/margins": 1.6310337781906128, + "rewards/rejected": -17.507463455200195, + "step": 14200 + }, + { + "epoch": 0.6621903218337578, + "grad_norm": 161.47584533691406, + "learning_rate": 1.691595276684418e-05, + "logits/chosen": -15.389359474182129, + "logits/rejected": -12.315173149108887, + "logps/chosen": -290.67950439453125, + "logps/rejected": -285.4556579589844, + "loss": 0.9067, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -12.797494888305664, + "rewards/margins": 3.7715113162994385, + "rewards/rejected": -16.569007873535156, + "step": 14300 + }, + { + "epoch": 0.6668210233850428, + "grad_norm": 46.53482437133789, + "learning_rate": 1.6684417689279927e-05, + "logits/chosen": -14.485678672790527, + "logits/rejected": -12.454192161560059, + "logps/chosen": -311.9996032714844, + "logps/rejected": -307.30157470703125, + "loss": 1.1361, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -14.843450546264648, + "rewards/margins": 2.6856822967529297, + "rewards/rejected": -17.529132843017578, + "step": 14400 + }, + { + "epoch": 0.6714517249363279, + "grad_norm": 26.74111557006836, + "learning_rate": 1.6452882611715675e-05, + "logits/chosen": -15.238399505615234, + "logits/rejected": -13.172194480895996, + "logps/chosen": -277.8358459472656, + "logps/rejected": -284.4881286621094, + "loss": 0.929, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -12.725804328918457, + "rewards/margins": 2.9910831451416016, + "rewards/rejected": -15.71688461303711, + "step": 14500 + }, + { + "epoch": 0.6760824264876129, + "grad_norm": 11.716771125793457, + "learning_rate": 1.6221347534151423e-05, + "logits/chosen": -14.936090469360352, + "logits/rejected": -13.450495719909668, + "logps/chosen": -270.56854248046875, + "logps/rejected": -291.61248779296875, + "loss": 1.0162, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -12.403809547424316, + "rewards/margins": 2.691650390625, + "rewards/rejected": -15.09546184539795, + "step": 14600 + }, + { + "epoch": 0.6807131280388979, + "grad_norm": 36.412960052490234, + "learning_rate": 1.5989812456587174e-05, + "logits/chosen": -14.691692352294922, + "logits/rejected": -12.6322021484375, + "logps/chosen": -292.3916931152344, + "logps/rejected": -296.074951171875, + "loss": 1.5685, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -14.284167289733887, + "rewards/margins": 2.3777682781219482, + "rewards/rejected": -16.66193389892578, + "step": 14700 + }, + { + "epoch": 0.6853438295901829, + "grad_norm": 24.627042770385742, + "learning_rate": 1.5758277379022922e-05, + "logits/chosen": -13.739795684814453, + "logits/rejected": -12.348119735717773, + "logps/chosen": -292.4388732910156, + "logps/rejected": -313.0835876464844, + "loss": 0.9656, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -15.461133003234863, + "rewards/margins": 3.123561143875122, + "rewards/rejected": -18.584693908691406, + "step": 14800 + }, + { + "epoch": 0.6899745311414679, + "grad_norm": 99.29113006591797, + "learning_rate": 1.552674230145867e-05, + "logits/chosen": -14.539809226989746, + "logits/rejected": -12.484415054321289, + "logps/chosen": -308.2098083496094, + "logps/rejected": -323.6732482910156, + "loss": 1.0316, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -15.086298942565918, + "rewards/margins": 3.069261074066162, + "rewards/rejected": -18.155559539794922, + "step": 14900 + }, + { + "epoch": 0.694605232692753, + "grad_norm": 7.5167412757873535, + "learning_rate": 1.529520722389442e-05, + "logits/chosen": -13.50758171081543, + "logits/rejected": -12.50390625, + "logps/chosen": -277.1807861328125, + "logps/rejected": -310.3955993652344, + "loss": 1.0758, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -14.185407638549805, + "rewards/margins": 3.0528149604797363, + "rewards/rejected": -17.238222122192383, + "step": 15000 + }, + { + "epoch": 0.694605232692753, + "eval_logits/chosen": -13.796860694885254, + "eval_logits/rejected": -11.833579063415527, + "eval_logps/chosen": -290.43701171875, + "eval_logps/rejected": -308.2549743652344, + "eval_loss": 0.9895668625831604, + "eval_rewards/accuracies": 0.7108333110809326, + "eval_rewards/chosen": -13.677786827087402, + "eval_rewards/margins": 3.1660029888153076, + "eval_rewards/rejected": -16.84379005432129, + "eval_runtime": 595.1302, + "eval_samples_per_second": 4.033, + "eval_steps_per_second": 4.033, + "step": 15000 + }, + { + "epoch": 0.699235934244038, + "grad_norm": 1.053455114364624, + "learning_rate": 1.506367214633017e-05, + "logits/chosen": -12.791362762451172, + "logits/rejected": -10.7622709274292, + "logps/chosen": -290.3780212402344, + "logps/rejected": -300.4528503417969, + "loss": 0.8474, + "rewards/accuracies": 0.7699999809265137, + "rewards/chosen": -14.130489349365234, + "rewards/margins": 3.402808427810669, + "rewards/rejected": -17.533300399780273, + "step": 15100 + }, + { + "epoch": 0.7038666357953229, + "grad_norm": 87.40723419189453, + "learning_rate": 1.4834452419541562e-05, + "logits/chosen": -13.96704387664795, + "logits/rejected": -11.846227645874023, + "logps/chosen": -267.1881103515625, + "logps/rejected": -279.21661376953125, + "loss": 0.9192, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -12.996103286743164, + "rewards/margins": 2.6929244995117188, + "rewards/rejected": -15.689026832580566, + "step": 15200 + }, + { + "epoch": 0.708497337346608, + "grad_norm": 0.009607501327991486, + "learning_rate": 1.460291734197731e-05, + "logits/chosen": -14.558998107910156, + "logits/rejected": -12.206707954406738, + "logps/chosen": -270.8651428222656, + "logps/rejected": -293.0597839355469, + "loss": 1.0426, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -12.181621551513672, + "rewards/margins": 3.268308162689209, + "rewards/rejected": -15.449930191040039, + "step": 15300 + }, + { + "epoch": 0.713128038897893, + "grad_norm": 2.457125186920166, + "learning_rate": 1.4371382264413058e-05, + "logits/chosen": -15.273738861083984, + "logits/rejected": -11.646648406982422, + "logps/chosen": -288.58837890625, + "logps/rejected": -276.04229736328125, + "loss": 0.979, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -12.105132102966309, + "rewards/margins": 3.0485777854919434, + "rewards/rejected": -15.153708457946777, + "step": 15400 + }, + { + "epoch": 0.7177587404491781, + "grad_norm": 0.34003034234046936, + "learning_rate": 1.4139847186848809e-05, + "logits/chosen": -16.237367630004883, + "logits/rejected": -13.76130485534668, + "logps/chosen": -256.4635925292969, + "logps/rejected": -282.35308837890625, + "loss": 0.9507, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -10.752126693725586, + "rewards/margins": 3.626908540725708, + "rewards/rejected": -14.379035949707031, + "step": 15500 + }, + { + "epoch": 0.7223894420004631, + "grad_norm": 146.7932891845703, + "learning_rate": 1.3908312109284558e-05, + "logits/chosen": -15.454626083374023, + "logits/rejected": -13.428391456604004, + "logps/chosen": -283.355224609375, + "logps/rejected": -297.4376525878906, + "loss": 0.9933, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -11.3836030960083, + "rewards/margins": 2.890033483505249, + "rewards/rejected": -14.273636817932129, + "step": 15600 + }, + { + "epoch": 0.727020143551748, + "grad_norm": 141.23631286621094, + "learning_rate": 1.3676777031720306e-05, + "logits/chosen": -14.704964637756348, + "logits/rejected": -12.575312614440918, + "logps/chosen": -286.65362548828125, + "logps/rejected": -310.803955078125, + "loss": 1.2049, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -11.945639610290527, + "rewards/margins": 4.382092475891113, + "rewards/rejected": -16.32773208618164, + "step": 15700 + }, + { + "epoch": 0.7316508451030331, + "grad_norm": 0.2799323499202728, + "learning_rate": 1.3445241954156054e-05, + "logits/chosen": -12.737774848937988, + "logits/rejected": -11.21628189086914, + "logps/chosen": -298.6518249511719, + "logps/rejected": -310.916259765625, + "loss": 1.5277, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -15.038265228271484, + "rewards/margins": 2.875054359436035, + "rewards/rejected": -17.913318634033203, + "step": 15800 + }, + { + "epoch": 0.7362815466543181, + "grad_norm": 1.3903080224990845, + "learning_rate": 1.3213706876591805e-05, + "logits/chosen": -13.068157196044922, + "logits/rejected": -11.648136138916016, + "logps/chosen": -295.270263671875, + "logps/rejected": -309.232666015625, + "loss": 1.4665, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -14.108922958374023, + "rewards/margins": 2.2027931213378906, + "rewards/rejected": -16.311716079711914, + "step": 15900 + }, + { + "epoch": 0.7409122482056032, + "grad_norm": 141.10264587402344, + "learning_rate": 1.2982171799027553e-05, + "logits/chosen": -12.949649810791016, + "logits/rejected": -11.205181121826172, + "logps/chosen": -260.01531982421875, + "logps/rejected": -281.1571960449219, + "loss": 0.8967, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -11.738259315490723, + "rewards/margins": 3.2035210132598877, + "rewards/rejected": -14.941780090332031, + "step": 16000 + }, + { + "epoch": 0.7409122482056032, + "eval_logits/chosen": -13.536864280700684, + "eval_logits/rejected": -11.524176597595215, + "eval_logps/chosen": -281.8379821777344, + "eval_logps/rejected": -299.4613342285156, + "eval_loss": 0.9621976613998413, + "eval_rewards/accuracies": 0.7149999737739563, + "eval_rewards/chosen": -12.81788444519043, + "eval_rewards/margins": 3.146540880203247, + "eval_rewards/rejected": -15.964425086975098, + "eval_runtime": 595.3564, + "eval_samples_per_second": 4.031, + "eval_steps_per_second": 4.031, + "step": 16000 + }, + { + "epoch": 0.7455429497568882, + "grad_norm": 190.99884033203125, + "learning_rate": 1.2750636721463303e-05, + "logits/chosen": -13.84753131866455, + "logits/rejected": -12.161564826965332, + "logps/chosen": -280.1208801269531, + "logps/rejected": -314.58917236328125, + "loss": 1.0943, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -12.789907455444336, + "rewards/margins": 3.241086006164551, + "rewards/rejected": -16.03099250793457, + "step": 16100 + }, + { + "epoch": 0.7501736513081732, + "grad_norm": 0.0009654845925979316, + "learning_rate": 1.251910164389905e-05, + "logits/chosen": -13.52328872680664, + "logits/rejected": -11.637651443481445, + "logps/chosen": -293.39483642578125, + "logps/rejected": -323.0391540527344, + "loss": 1.0122, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -13.000173568725586, + "rewards/margins": 3.734714984893799, + "rewards/rejected": -16.734888076782227, + "step": 16200 + }, + { + "epoch": 0.7548043528594582, + "grad_norm": 53.88007354736328, + "learning_rate": 1.22875665663348e-05, + "logits/chosen": -13.761452674865723, + "logits/rejected": -11.556824684143066, + "logps/chosen": -272.66436767578125, + "logps/rejected": -294.9356384277344, + "loss": 0.8265, + "rewards/accuracies": 0.7799999713897705, + "rewards/chosen": -11.812010765075684, + "rewards/margins": 2.9586479663848877, + "rewards/rejected": -14.770659446716309, + "step": 16300 + }, + { + "epoch": 0.7594350544107432, + "grad_norm": 26.267419815063477, + "learning_rate": 1.205603148877055e-05, + "logits/chosen": -12.850613594055176, + "logits/rejected": -11.345057487487793, + "logps/chosen": -296.42498779296875, + "logps/rejected": -312.7100830078125, + "loss": 0.8868, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -12.809453010559082, + "rewards/margins": 2.7720766067504883, + "rewards/rejected": -15.581528663635254, + "step": 16400 + }, + { + "epoch": 0.7640657559620282, + "grad_norm": 0.1327919363975525, + "learning_rate": 1.1824496411206299e-05, + "logits/chosen": -12.737493515014648, + "logits/rejected": -10.811483383178711, + "logps/chosen": -271.80743408203125, + "logps/rejected": -286.36383056640625, + "loss": 1.2946, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -12.860858917236328, + "rewards/margins": 2.731450080871582, + "rewards/rejected": -15.592310905456543, + "step": 16500 + }, + { + "epoch": 0.7686964575133133, + "grad_norm": 0.0014511727495118976, + "learning_rate": 1.1592961333642047e-05, + "logits/chosen": -12.535152435302734, + "logits/rejected": -10.770435333251953, + "logps/chosen": -276.2295227050781, + "logps/rejected": -292.5387878417969, + "loss": 1.0457, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -12.783775329589844, + "rewards/margins": 2.7502074241638184, + "rewards/rejected": -15.533984184265137, + "step": 16600 + }, + { + "epoch": 0.7733271590645983, + "grad_norm": 17.507179260253906, + "learning_rate": 1.1361426256077796e-05, + "logits/chosen": -12.542750358581543, + "logits/rejected": -10.764151573181152, + "logps/chosen": -294.1999206542969, + "logps/rejected": -315.76861572265625, + "loss": 1.0815, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -14.347511291503906, + "rewards/margins": 3.8440487384796143, + "rewards/rejected": -18.191560745239258, + "step": 16700 + }, + { + "epoch": 0.7779578606158833, + "grad_norm": 127.55777740478516, + "learning_rate": 1.1129891178513544e-05, + "logits/chosen": -14.010462760925293, + "logits/rejected": -11.466479301452637, + "logps/chosen": -294.719482421875, + "logps/rejected": -292.8918151855469, + "loss": 1.1056, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -12.861717224121094, + "rewards/margins": 2.829714298248291, + "rewards/rejected": -15.69143295288086, + "step": 16800 + }, + { + "epoch": 0.7825885621671683, + "grad_norm": 4.169695854187012, + "learning_rate": 1.0898356100949295e-05, + "logits/chosen": -13.056097030639648, + "logits/rejected": -10.822772979736328, + "logps/chosen": -294.1612548828125, + "logps/rejected": -305.1460876464844, + "loss": 0.8364, + "rewards/accuracies": 0.7799999713897705, + "rewards/chosen": -13.012012481689453, + "rewards/margins": 3.6792736053466797, + "rewards/rejected": -16.691287994384766, + "step": 16900 + }, + { + "epoch": 0.7872192637184533, + "grad_norm": 1.3353691101074219, + "learning_rate": 1.0666821023385043e-05, + "logits/chosen": -12.639263153076172, + "logits/rejected": -10.551265716552734, + "logps/chosen": -285.9519958496094, + "logps/rejected": -300.0907897949219, + "loss": 1.0922, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -14.422396659851074, + "rewards/margins": 2.6487817764282227, + "rewards/rejected": -17.071178436279297, + "step": 17000 + }, + { + "epoch": 0.7872192637184533, + "eval_logits/chosen": -12.934601783752441, + "eval_logits/rejected": -10.735593795776367, + "eval_logps/chosen": -287.7341613769531, + "eval_logps/rejected": -306.9658203125, + "eval_loss": 0.9625710248947144, + "eval_rewards/accuracies": 0.7195833325386047, + "eval_rewards/chosen": -13.407501220703125, + "eval_rewards/margins": 3.3073694705963135, + "eval_rewards/rejected": -16.714872360229492, + "eval_runtime": 595.2721, + "eval_samples_per_second": 4.032, + "eval_steps_per_second": 4.032, + "step": 17000 + }, + { + "epoch": 0.7918499652697384, + "grad_norm": 238.36265563964844, + "learning_rate": 1.0435285945820793e-05, + "logits/chosen": -12.879168510437012, + "logits/rejected": -10.35470962524414, + "logps/chosen": -287.34698486328125, + "logps/rejected": -294.0375061035156, + "loss": 1.0777, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -13.080986022949219, + "rewards/margins": 3.038571834564209, + "rewards/rejected": -16.119556427001953, + "step": 17100 + }, + { + "epoch": 0.7964806668210234, + "grad_norm": 115.40522766113281, + "learning_rate": 1.020375086825654e-05, + "logits/chosen": -13.390498161315918, + "logits/rejected": -11.11551284790039, + "logps/chosen": -284.6138000488281, + "logps/rejected": -290.37554931640625, + "loss": 1.0877, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -12.631682395935059, + "rewards/margins": 2.2628602981567383, + "rewards/rejected": -14.894542694091797, + "step": 17200 + }, + { + "epoch": 0.8011113683723085, + "grad_norm": 81.7811279296875, + "learning_rate": 9.97221579069229e-06, + "logits/chosen": -13.257206916809082, + "logits/rejected": -11.7109375, + "logps/chosen": -298.948974609375, + "logps/rejected": -325.81036376953125, + "loss": 1.3241, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -13.646310806274414, + "rewards/margins": 2.978910207748413, + "rewards/rejected": -16.625221252441406, + "step": 17300 + }, + { + "epoch": 0.8057420699235934, + "grad_norm": 142.9304656982422, + "learning_rate": 9.742996063903683e-06, + "logits/chosen": -13.058794021606445, + "logits/rejected": -11.029099464416504, + "logps/chosen": -276.3821105957031, + "logps/rejected": -288.647705078125, + "loss": 1.0781, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -13.544109344482422, + "rewards/margins": 2.761138677597046, + "rewards/rejected": -16.305246353149414, + "step": 17400 + }, + { + "epoch": 0.8103727714748784, + "grad_norm": 10.479734420776367, + "learning_rate": 9.51146098633943e-06, + "logits/chosen": -12.74901008605957, + "logits/rejected": -11.03323745727539, + "logps/chosen": -290.25555419921875, + "logps/rejected": -309.6465759277344, + "loss": 0.9795, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -13.511792182922363, + "rewards/margins": 3.147340774536133, + "rewards/rejected": -16.659133911132812, + "step": 17500 + }, + { + "epoch": 0.8150034730261635, + "grad_norm": 0.0002637170546222478, + "learning_rate": 9.282241259550822e-06, + "logits/chosen": -11.844225883483887, + "logits/rejected": -9.50997543334961, + "logps/chosen": -286.7944030761719, + "logps/rejected": -299.2738342285156, + "loss": 0.9225, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -14.926375389099121, + "rewards/margins": 3.188018798828125, + "rewards/rejected": -18.114395141601562, + "step": 17600 + }, + { + "epoch": 0.8196341745774485, + "grad_norm": 9.602815628051758, + "learning_rate": 9.050706181986571e-06, + "logits/chosen": -13.951495170593262, + "logits/rejected": -11.048531532287598, + "logps/chosen": -290.7369384765625, + "logps/rejected": -305.9258117675781, + "loss": 1.1613, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -13.608471870422363, + "rewards/margins": 3.499152183532715, + "rewards/rejected": -17.107624053955078, + "step": 17700 + }, + { + "epoch": 0.8242648761287334, + "grad_norm": 99.48814392089844, + "learning_rate": 8.819171104422321e-06, + "logits/chosen": -13.188122749328613, + "logits/rejected": -11.652185440063477, + "logps/chosen": -272.55535888671875, + "logps/rejected": -287.4886779785156, + "loss": 0.9058, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -11.741266250610352, + "rewards/margins": 2.858311176300049, + "rewards/rejected": -14.599577903747559, + "step": 17800 + }, + { + "epoch": 0.8288955776800185, + "grad_norm": 0.6386672854423523, + "learning_rate": 8.587636026858069e-06, + "logits/chosen": -13.960208892822266, + "logits/rejected": -12.15221881866455, + "logps/chosen": -278.147705078125, + "logps/rejected": -297.4937744140625, + "loss": 0.9024, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -12.239116668701172, + "rewards/margins": 2.993659019470215, + "rewards/rejected": -15.232775688171387, + "step": 17900 + }, + { + "epoch": 0.8335262792313035, + "grad_norm": 25.236881256103516, + "learning_rate": 8.356100949293818e-06, + "logits/chosen": -14.14141845703125, + "logits/rejected": -11.77079963684082, + "logps/chosen": -274.72637939453125, + "logps/rejected": -297.6695556640625, + "loss": 0.6923, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -12.932685852050781, + "rewards/margins": 3.8792471885681152, + "rewards/rejected": -16.811933517456055, + "step": 18000 + }, + { + "epoch": 0.8335262792313035, + "eval_logits/chosen": -14.136133193969727, + "eval_logits/rejected": -11.753191947937012, + "eval_logps/chosen": -275.5329284667969, + "eval_logps/rejected": -294.1294250488281, + "eval_loss": 0.9302791357040405, + "eval_rewards/accuracies": 0.7266666889190674, + "eval_rewards/chosen": -12.187378883361816, + "eval_rewards/margins": 3.2438580989837646, + "eval_rewards/rejected": -15.43123722076416, + "eval_runtime": 595.0535, + "eval_samples_per_second": 4.033, + "eval_steps_per_second": 4.033, + "step": 18000 + } + ], + "logging_steps": 100, + "max_steps": 21595, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}