diff --git "a/checkpoint-5000/trainer_state.json" "b/checkpoint-5000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-5000/trainer_state.json" @@ -0,0 +1,8336 @@ +{ + "best_metric": 0.4747713804244995, + "best_model_checkpoint": "data/phi_1_5_dpo_ep6/checkpoint-5000", + "epoch": 5.4288816503800215, + "eval_steps": 100, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 2.2784331867920224, + "learning_rate": 5e-09, + "logits/chosen": 4.471683502197266, + "logits/rejected": 5.047541618347168, + "logps/chosen": -583.1558837890625, + "logps/rejected": -443.7651062011719, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "grad_norm": 2.1074329313581432, + "learning_rate": 5e-08, + "logits/chosen": 4.588065147399902, + "logits/rejected": 4.978394985198975, + "logps/chosen": -552.5501098632812, + "logps/rejected": -387.0480651855469, + "loss": 0.6928, + "rewards/accuracies": 0.5069444179534912, + "rewards/chosen": 0.0003048771759495139, + "rewards/margins": 0.0007037359173409641, + "rewards/rejected": -0.00039885862497612834, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 3.585629152687865, + "learning_rate": 1e-07, + "logits/chosen": 4.497701168060303, + "logits/rejected": 4.875298976898193, + "logps/chosen": -568.1593627929688, + "logps/rejected": -405.4841003417969, + "loss": 0.6933, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.0002680518664419651, + "rewards/margins": -1.2179441000625957e-05, + "rewards/rejected": -0.0002558725536800921, + "step": 20 + }, + { + "epoch": 0.03, + "grad_norm": 2.1397508289715015, + "learning_rate": 1.5e-07, + "logits/chosen": 4.588676452636719, + "logits/rejected": 4.831759452819824, + "logps/chosen": -480.6222229003906, + "logps/rejected": -376.4667663574219, + "loss": 0.6932, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -8.940284897107631e-05, + "rewards/margins": -5.898187373531982e-05, + "rewards/rejected": -3.042102252948098e-05, + "step": 30 + }, + { + "epoch": 0.04, + "grad_norm": 2.196396235652288, + "learning_rate": 2e-07, + "logits/chosen": 4.697300910949707, + "logits/rejected": 5.044940948486328, + "logps/chosen": -558.78759765625, + "logps/rejected": -447.4754333496094, + "loss": 0.693, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0011049907188862562, + "rewards/margins": 0.0004904826055280864, + "rewards/rejected": 0.0006145082297734916, + "step": 40 + }, + { + "epoch": 0.05, + "grad_norm": 1.9641522024523996, + "learning_rate": 2.5e-07, + "logits/chosen": 4.586944103240967, + "logits/rejected": 4.845526695251465, + "logps/chosen": -569.728515625, + "logps/rejected": -436.475341796875, + "loss": 0.6927, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0018296729540452361, + "rewards/margins": 0.0005367769626900554, + "rewards/rejected": 0.0012928961077705026, + "step": 50 + }, + { + "epoch": 0.07, + "grad_norm": 2.0311045116257476, + "learning_rate": 3e-07, + "logits/chosen": 4.496231555938721, + "logits/rejected": 4.9472222328186035, + "logps/chosen": -518.8323364257812, + "logps/rejected": -363.575927734375, + "loss": 0.6921, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005082172341644764, + "rewards/margins": 0.003068957244977355, + "rewards/rejected": 0.0020132153294980526, + "step": 60 + }, + { + "epoch": 0.08, + "grad_norm": 2.122608710951403, + "learning_rate": 3.5e-07, + "logits/chosen": 4.606133460998535, + "logits/rejected": 4.809296607971191, + "logps/chosen": -485.9520568847656, + "logps/rejected": -387.82830810546875, + "loss": 0.6915, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.007089491002261639, + "rewards/margins": 0.000784275762271136, + "rewards/rejected": 0.006305214948952198, + "step": 70 + }, + { + "epoch": 0.09, + "grad_norm": 2.9116433618910453, + "learning_rate": 4e-07, + "logits/chosen": 4.483397483825684, + "logits/rejected": 5.010839462280273, + "logps/chosen": -596.1915283203125, + "logps/rejected": -399.2021789550781, + "loss": 0.6908, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01583760604262352, + "rewards/margins": 0.006217488087713718, + "rewards/rejected": 0.00962011981755495, + "step": 80 + }, + { + "epoch": 0.1, + "grad_norm": 2.1318681018681374, + "learning_rate": 4.5e-07, + "logits/chosen": 4.709166526794434, + "logits/rejected": 4.690488338470459, + "logps/chosen": -557.8314208984375, + "logps/rejected": -429.8755798339844, + "loss": 0.6885, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.025951331481337547, + "rewards/margins": 0.00821693055331707, + "rewards/rejected": 0.017734399065375328, + "step": 90 + }, + { + "epoch": 0.11, + "grad_norm": 2.0017670975468773, + "learning_rate": 5e-07, + "logits/chosen": 4.650539875030518, + "logits/rejected": 4.840279579162598, + "logps/chosen": -522.7197875976562, + "logps/rejected": -441.0248107910156, + "loss": 0.6881, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.03702830895781517, + "rewards/margins": 0.008776113390922546, + "rewards/rejected": 0.028252195566892624, + "step": 100 + }, + { + "epoch": 0.11, + "eval_logits/chosen": 4.6645894050598145, + "eval_logits/rejected": 4.888310432434082, + "eval_logps/chosen": -538.6564331054688, + "eval_logps/rejected": -421.0949401855469, + "eval_loss": 0.6856396198272705, + "eval_rewards/accuracies": 0.7023809552192688, + "eval_rewards/chosen": 0.046804603189229965, + "eval_rewards/margins": 0.017009131610393524, + "eval_rewards/rejected": 0.02979547157883644, + "eval_runtime": 203.1077, + "eval_samples_per_second": 9.847, + "eval_steps_per_second": 0.31, + "step": 100 + }, + { + "epoch": 0.12, + "grad_norm": 6.583589348054713, + "learning_rate": 4.999958096628589e-07, + "logits/chosen": 4.720016002655029, + "logits/rejected": 4.64304256439209, + "logps/chosen": -500.9232482910156, + "logps/rejected": -427.63153076171875, + "loss": 0.6854, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.044227249920368195, + "rewards/margins": 0.005637052934616804, + "rewards/rejected": 0.0385901965200901, + "step": 110 + }, + { + "epoch": 0.13, + "grad_norm": 3.1982958145542955, + "learning_rate": 4.999832387919069e-07, + "logits/chosen": 4.5180182456970215, + "logits/rejected": 4.88397216796875, + "logps/chosen": -552.8077392578125, + "logps/rejected": -401.8198547363281, + "loss": 0.6833, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0751495510339737, + "rewards/margins": 0.0239702258259058, + "rewards/rejected": 0.05117932707071304, + "step": 120 + }, + { + "epoch": 0.14, + "grad_norm": 2.4449854530637416, + "learning_rate": 4.999622878085538e-07, + "logits/chosen": 4.7093329429626465, + "logits/rejected": 4.781570911407471, + "logps/chosen": -516.9468994140625, + "logps/rejected": -393.8667907714844, + "loss": 0.6792, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.07485412806272507, + "rewards/margins": 0.029199976474046707, + "rewards/rejected": 0.04565414786338806, + "step": 130 + }, + { + "epoch": 0.15, + "grad_norm": 2.1934078981016993, + "learning_rate": 4.999329574151327e-07, + "logits/chosen": 4.582241058349609, + "logits/rejected": 4.7597975730896, + "logps/chosen": -501.5450134277344, + "logps/rejected": -409.2080078125, + "loss": 0.6777, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.08410192281007767, + "rewards/margins": 0.025774246081709862, + "rewards/rejected": 0.058327674865722656, + "step": 140 + }, + { + "epoch": 0.16, + "grad_norm": 2.96575096247207, + "learning_rate": 4.998952485948778e-07, + "logits/chosen": 4.52903938293457, + "logits/rejected": 4.743136882781982, + "logps/chosen": -528.6700439453125, + "logps/rejected": -401.9606628417969, + "loss": 0.6772, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.09848333895206451, + "rewards/margins": 0.0334458127617836, + "rewards/rejected": 0.06503753364086151, + "step": 150 + }, + { + "epoch": 0.17, + "grad_norm": 2.8737260484485887, + "learning_rate": 4.998491626118902e-07, + "logits/chosen": 4.462107181549072, + "logits/rejected": 4.984793663024902, + "logps/chosen": -594.0728149414062, + "logps/rejected": -446.53826904296875, + "loss": 0.6756, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.1353224217891693, + "rewards/margins": 0.06305033713579178, + "rewards/rejected": 0.07227210700511932, + "step": 160 + }, + { + "epoch": 0.18, + "grad_norm": 1.8647392675629684, + "learning_rate": 4.997947010110966e-07, + "logits/chosen": 4.541747570037842, + "logits/rejected": 4.776431083679199, + "logps/chosen": -535.6854858398438, + "logps/rejected": -413.56048583984375, + "loss": 0.6738, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12944112718105316, + "rewards/margins": 0.03229222446680069, + "rewards/rejected": 0.09714889526367188, + "step": 170 + }, + { + "epoch": 0.2, + "grad_norm": 2.177803918062556, + "learning_rate": 4.997318656181965e-07, + "logits/chosen": 4.700952053070068, + "logits/rejected": 4.818137168884277, + "logps/chosen": -530.86474609375, + "logps/rejected": -421.16632080078125, + "loss": 0.67, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.13788820803165436, + "rewards/margins": 0.0508052296936512, + "rewards/rejected": 0.08708297461271286, + "step": 180 + }, + { + "epoch": 0.21, + "grad_norm": 2.0886821853779236, + "learning_rate": 4.99660658539602e-07, + "logits/chosen": 4.684708595275879, + "logits/rejected": 4.796721458435059, + "logps/chosen": -509.56982421875, + "logps/rejected": -412.2833557128906, + "loss": 0.6702, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1366799771785736, + "rewards/margins": 0.022193659096956253, + "rewards/rejected": 0.11448632180690765, + "step": 190 + }, + { + "epoch": 0.22, + "grad_norm": 2.220515964351123, + "learning_rate": 4.995810821623662e-07, + "logits/chosen": 4.499368190765381, + "logits/rejected": 4.888810157775879, + "logps/chosen": -496.9297790527344, + "logps/rejected": -394.7762756347656, + "loss": 0.6692, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.16144059598445892, + "rewards/margins": 0.06260715425014496, + "rewards/rejected": 0.09883344173431396, + "step": 200 + }, + { + "epoch": 0.22, + "eval_logits/chosen": 4.636972904205322, + "eval_logits/rejected": 4.871835231781006, + "eval_logps/chosen": -525.9188842773438, + "eval_logps/rejected": -414.19549560546875, + "eval_loss": 0.6641963124275208, + "eval_rewards/accuracies": 0.7123016119003296, + "eval_rewards/chosen": 0.17417941987514496, + "eval_rewards/margins": 0.07538975775241852, + "eval_rewards/rejected": 0.09878966212272644, + "eval_runtime": 203.2626, + "eval_samples_per_second": 9.839, + "eval_steps_per_second": 0.31, + "step": 200 + }, + { + "epoch": 0.23, + "grad_norm": 2.9376887120140927, + "learning_rate": 4.99493139154104e-07, + "logits/chosen": 4.480714321136475, + "logits/rejected": 5.064620018005371, + "logps/chosen": -576.3648681640625, + "logps/rejected": -387.22650146484375, + "loss": 0.6545, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.19050724804401398, + "rewards/margins": 0.11292573064565659, + "rewards/rejected": 0.07758153975009918, + "step": 210 + }, + { + "epoch": 0.24, + "grad_norm": 1.8767204134346958, + "learning_rate": 4.993968324629023e-07, + "logits/chosen": 4.697628498077393, + "logits/rejected": 4.726076602935791, + "logps/chosen": -505.72137451171875, + "logps/rejected": -431.8421325683594, + "loss": 0.658, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.1784447729587555, + "rewards/margins": 0.0745016485452652, + "rewards/rejected": 0.1039431244134903, + "step": 220 + }, + { + "epoch": 0.25, + "grad_norm": 2.082590024513838, + "learning_rate": 4.99292165317221e-07, + "logits/chosen": 4.607518672943115, + "logits/rejected": 4.914595127105713, + "logps/chosen": -509.68499755859375, + "logps/rejected": -400.05584716796875, + "loss": 0.6609, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.2046581506729126, + "rewards/margins": 0.06535429507493973, + "rewards/rejected": 0.13930386304855347, + "step": 230 + }, + { + "epoch": 0.26, + "grad_norm": 3.237183271768573, + "learning_rate": 4.991791412257852e-07, + "logits/chosen": 4.595475196838379, + "logits/rejected": 4.688504695892334, + "logps/chosen": -520.5208740234375, + "logps/rejected": -381.739990234375, + "loss": 0.6551, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.2024141252040863, + "rewards/margins": 0.10769456624984741, + "rewards/rejected": 0.09471957385540009, + "step": 240 + }, + { + "epoch": 0.27, + "grad_norm": 1.918810148742378, + "learning_rate": 4.990577639774672e-07, + "logits/chosen": 4.68790340423584, + "logits/rejected": 4.860108375549316, + "logps/chosen": -487.408203125, + "logps/rejected": -385.59564208984375, + "loss": 0.6541, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.19003790616989136, + "rewards/margins": 0.09179778397083282, + "rewards/rejected": 0.09824012964963913, + "step": 250 + }, + { + "epoch": 0.28, + "grad_norm": 2.2494428114155856, + "learning_rate": 4.9892803764116e-07, + "logits/chosen": 4.434889793395996, + "logits/rejected": 4.799215793609619, + "logps/chosen": -511.882568359375, + "logps/rejected": -400.19927978515625, + "loss": 0.6486, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.19563761353492737, + "rewards/margins": 0.10582878440618515, + "rewards/rejected": 0.08980882912874222, + "step": 260 + }, + { + "epoch": 0.29, + "grad_norm": 3.666721869904351, + "learning_rate": 4.987899665656399e-07, + "logits/chosen": 4.74543571472168, + "logits/rejected": 4.740050315856934, + "logps/chosen": -503.22930908203125, + "logps/rejected": -457.3968811035156, + "loss": 0.6495, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.20304739475250244, + "rewards/margins": 0.05644340440630913, + "rewards/rejected": 0.14660397171974182, + "step": 270 + }, + { + "epoch": 0.3, + "grad_norm": 5.948963323108333, + "learning_rate": 4.986435553794221e-07, + "logits/chosen": 4.648507118225098, + "logits/rejected": 4.644364356994629, + "logps/chosen": -506.22503662109375, + "logps/rejected": -439.25494384765625, + "loss": 0.6575, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.20311832427978516, + "rewards/margins": 0.06590523570775986, + "rewards/rejected": 0.1372130960226059, + "step": 280 + }, + { + "epoch": 0.31, + "grad_norm": 1.6922344782897734, + "learning_rate": 4.984888089906041e-07, + "logits/chosen": 4.390051364898682, + "logits/rejected": 4.838263511657715, + "logps/chosen": -551.6712646484375, + "logps/rejected": -444.44598388671875, + "loss": 0.651, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.2372256964445114, + "rewards/margins": 0.09892772883176804, + "rewards/rejected": 0.13829797506332397, + "step": 290 + }, + { + "epoch": 0.33, + "grad_norm": 1.9239681386959693, + "learning_rate": 4.983257325867025e-07, + "logits/chosen": 4.478053569793701, + "logits/rejected": 5.162686347961426, + "logps/chosen": -508.06414794921875, + "logps/rejected": -339.91845703125, + "loss": 0.6368, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.25704532861709595, + "rewards/margins": 0.1772209107875824, + "rewards/rejected": 0.07982443273067474, + "step": 300 + }, + { + "epoch": 0.33, + "eval_logits/chosen": 4.5967912673950195, + "eval_logits/rejected": 4.840723037719727, + "eval_logps/chosen": -517.7680053710938, + "eval_logps/rejected": -411.4657287597656, + "eval_loss": 0.6442444920539856, + "eval_rewards/accuracies": 0.7083333134651184, + "eval_rewards/chosen": 0.25568887591362, + "eval_rewards/margins": 0.12960152328014374, + "eval_rewards/rejected": 0.12608733773231506, + "eval_runtime": 203.0386, + "eval_samples_per_second": 9.85, + "eval_steps_per_second": 0.31, + "step": 300 + }, + { + "epoch": 0.34, + "grad_norm": 2.1499440637283977, + "learning_rate": 4.981543316344781e-07, + "logits/chosen": 4.507613182067871, + "logits/rejected": 4.725367546081543, + "logps/chosen": -546.1619873046875, + "logps/rejected": -389.0150146484375, + "loss": 0.6368, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.3014827370643616, + "rewards/margins": 0.1564800888299942, + "rewards/rejected": 0.14500267803668976, + "step": 310 + }, + { + "epoch": 0.35, + "grad_norm": 3.228748334481589, + "learning_rate": 4.979746118797531e-07, + "logits/chosen": 4.5380539894104, + "logits/rejected": 4.800951957702637, + "logps/chosen": -521.36083984375, + "logps/rejected": -410.35650634765625, + "loss": 0.6463, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.24756142497062683, + "rewards/margins": 0.14254947006702423, + "rewards/rejected": 0.1050119400024414, + "step": 320 + }, + { + "epoch": 0.36, + "grad_norm": 1.912283760633026, + "learning_rate": 4.977865793472184e-07, + "logits/chosen": 4.489951133728027, + "logits/rejected": 5.115104675292969, + "logps/chosen": -489.64996337890625, + "logps/rejected": -359.89239501953125, + "loss": 0.6359, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.2450093924999237, + "rewards/margins": 0.13062641024589539, + "rewards/rejected": 0.11438298225402832, + "step": 330 + }, + { + "epoch": 0.37, + "grad_norm": 2.2785842112366574, + "learning_rate": 4.975902403402318e-07, + "logits/chosen": 4.511780261993408, + "logits/rejected": 4.862536430358887, + "logps/chosen": -557.9495849609375, + "logps/rejected": -401.67901611328125, + "loss": 0.6448, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2301413118839264, + "rewards/margins": 0.1367010772228241, + "rewards/rejected": 0.0934402346611023, + "step": 340 + }, + { + "epoch": 0.38, + "grad_norm": 1.8134683023299867, + "learning_rate": 4.973856014406061e-07, + "logits/chosen": 4.520918846130371, + "logits/rejected": 4.871068477630615, + "logps/chosen": -503.2167053222656, + "logps/rejected": -392.04998779296875, + "loss": 0.6313, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.2531774044036865, + "rewards/margins": 0.18557706475257874, + "rewards/rejected": 0.06760034710168839, + "step": 350 + }, + { + "epoch": 0.39, + "grad_norm": 2.0638343555583893, + "learning_rate": 4.971726695083893e-07, + "logits/chosen": 4.620011329650879, + "logits/rejected": 4.672842025756836, + "logps/chosen": -500.76776123046875, + "logps/rejected": -445.62493896484375, + "loss": 0.631, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.27054670453071594, + "rewards/margins": 0.1622386872768402, + "rewards/rejected": 0.10830801725387573, + "step": 360 + }, + { + "epoch": 0.4, + "grad_norm": 2.361650417228017, + "learning_rate": 4.96951451681634e-07, + "logits/chosen": 4.580225467681885, + "logits/rejected": 4.911395072937012, + "logps/chosen": -531.093994140625, + "logps/rejected": -425.7933654785156, + "loss": 0.6303, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.27540484070777893, + "rewards/margins": 0.12915542721748352, + "rewards/rejected": 0.14624938368797302, + "step": 370 + }, + { + "epoch": 0.41, + "grad_norm": 2.7174483204511937, + "learning_rate": 4.967219553761586e-07, + "logits/chosen": 4.723080635070801, + "logits/rejected": 4.8610453605651855, + "logps/chosen": -517.4929809570312, + "logps/rejected": -459.22979736328125, + "loss": 0.6392, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.2602981626987457, + "rewards/margins": 0.16877390444278717, + "rewards/rejected": 0.09152427315711975, + "step": 380 + }, + { + "epoch": 0.42, + "grad_norm": 3.9776964083274917, + "learning_rate": 4.96484188285298e-07, + "logits/chosen": 4.445316791534424, + "logits/rejected": 4.665471076965332, + "logps/chosen": -577.9735107421875, + "logps/rejected": -460.4791564941406, + "loss": 0.6329, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.3058663308620453, + "rewards/margins": 0.1735948771238327, + "rewards/rejected": 0.1322714388370514, + "step": 390 + }, + { + "epoch": 0.43, + "grad_norm": 1.76118979332965, + "learning_rate": 4.962381583796465e-07, + "logits/chosen": 4.506722450256348, + "logits/rejected": 4.659956932067871, + "logps/chosen": -500.7157287597656, + "logps/rejected": -396.84393310546875, + "loss": 0.6283, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.27281665802001953, + "rewards/margins": 0.16729183495044708, + "rewards/rejected": 0.10552482306957245, + "step": 400 + }, + { + "epoch": 0.43, + "eval_logits/chosen": 4.515594959259033, + "eval_logits/rejected": 4.762938499450684, + "eval_logps/chosen": -517.2609252929688, + "eval_logps/rejected": -415.9521789550781, + "eval_loss": 0.6283168792724609, + "eval_rewards/accuracies": 0.7083333134651184, + "eval_rewards/chosen": 0.2607596516609192, + "eval_rewards/margins": 0.1795366406440735, + "eval_rewards/rejected": 0.08122298866510391, + "eval_runtime": 203.11, + "eval_samples_per_second": 9.847, + "eval_steps_per_second": 0.31, + "step": 400 + }, + { + "epoch": 0.45, + "grad_norm": 2.272815572830576, + "learning_rate": 4.9598387390679e-07, + "logits/chosen": 4.5500898361206055, + "logits/rejected": 4.538148880004883, + "logps/chosen": -542.4667358398438, + "logps/rejected": -461.719970703125, + "loss": 0.6271, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.28052183985710144, + "rewards/margins": 0.19728553295135498, + "rewards/rejected": 0.08323628455400467, + "step": 410 + }, + { + "epoch": 0.46, + "grad_norm": 2.1158441354384188, + "learning_rate": 4.9572134339103e-07, + "logits/chosen": 4.513038158416748, + "logits/rejected": 4.744021415710449, + "logps/chosen": -557.8423461914062, + "logps/rejected": -416.1087951660156, + "loss": 0.6292, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2836315631866455, + "rewards/margins": 0.16970789432525635, + "rewards/rejected": 0.11392368376255035, + "step": 420 + }, + { + "epoch": 0.47, + "grad_norm": 2.4154271169731176, + "learning_rate": 4.954505756330975e-07, + "logits/chosen": 4.326611518859863, + "logits/rejected": 4.637131214141846, + "logps/chosen": -497.8357849121094, + "logps/rejected": -378.7847595214844, + "loss": 0.6125, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.2221941202878952, + "rewards/margins": 0.1941184103488922, + "rewards/rejected": 0.028075695037841797, + "step": 430 + }, + { + "epoch": 0.48, + "grad_norm": 3.6625125192218624, + "learning_rate": 4.951715797098579e-07, + "logits/chosen": 4.394779682159424, + "logits/rejected": 4.818585395812988, + "logps/chosen": -543.4212036132812, + "logps/rejected": -413.21881103515625, + "loss": 0.6213, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.20388510823249817, + "rewards/margins": 0.2376679927110672, + "rewards/rejected": -0.03378290683031082, + "step": 440 + }, + { + "epoch": 0.49, + "grad_norm": 3.0417051890171325, + "learning_rate": 4.94884364974007e-07, + "logits/chosen": 4.4246063232421875, + "logits/rejected": 4.6781697273254395, + "logps/chosen": -488.7919921875, + "logps/rejected": -413.4225158691406, + "loss": 0.6171, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.18326595425605774, + "rewards/margins": 0.17521528899669647, + "rewards/rejected": 0.008050672709941864, + "step": 450 + }, + { + "epoch": 0.5, + "grad_norm": 2.2275571876330544, + "learning_rate": 4.945889410537577e-07, + "logits/chosen": 4.345341205596924, + "logits/rejected": 4.646592140197754, + "logps/chosen": -530.9051513671875, + "logps/rejected": -403.25579833984375, + "loss": 0.6132, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.24030213057994843, + "rewards/margins": 0.22080914676189423, + "rewards/rejected": 0.01949295774102211, + "step": 460 + }, + { + "epoch": 0.51, + "grad_norm": 2.06236619392903, + "learning_rate": 4.942853178525163e-07, + "logits/chosen": 4.217994689941406, + "logits/rejected": 4.637279510498047, + "logps/chosen": -551.31640625, + "logps/rejected": -434.1787109375, + "loss": 0.6159, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.19648627936840057, + "rewards/margins": 0.18652307987213135, + "rewards/rejected": 0.00996321253478527, + "step": 470 + }, + { + "epoch": 0.52, + "grad_norm": 2.135303619213813, + "learning_rate": 4.939735055485515e-07, + "logits/chosen": 4.297804832458496, + "logits/rejected": 4.6675238609313965, + "logps/chosen": -536.94287109375, + "logps/rejected": -443.29693603515625, + "loss": 0.602, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.1485282927751541, + "rewards/margins": 0.2334461659193039, + "rewards/rejected": -0.08491786569356918, + "step": 480 + }, + { + "epoch": 0.53, + "grad_norm": 1.9853533137189272, + "learning_rate": 4.936535145946528e-07, + "logits/chosen": 4.211465358734131, + "logits/rejected": 4.481369972229004, + "logps/chosen": -560.4998779296875, + "logps/rejected": -426.50897216796875, + "loss": 0.608, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.21016542613506317, + "rewards/margins": 0.27178627252578735, + "rewards/rejected": -0.061620790511369705, + "step": 490 + }, + { + "epoch": 0.54, + "grad_norm": 3.856906618729529, + "learning_rate": 4.933253557177799e-07, + "logits/chosen": 4.18944787979126, + "logits/rejected": 4.669934272766113, + "logps/chosen": -520.4953002929688, + "logps/rejected": -416.11590576171875, + "loss": 0.6052, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.15112890303134918, + "rewards/margins": 0.20565268397331238, + "rewards/rejected": -0.05452378839254379, + "step": 500 + }, + { + "epoch": 0.54, + "eval_logits/chosen": 4.3152852058410645, + "eval_logits/rejected": 4.5515546798706055, + "eval_logps/chosen": -529.0491333007812, + "eval_logps/rejected": -434.0544738769531, + "eval_loss": 0.6131876111030579, + "eval_rewards/accuracies": 0.7103174328804016, + "eval_rewards/chosen": 0.14287839829921722, + "eval_rewards/margins": 0.24267850816249847, + "eval_rewards/rejected": -0.09980012476444244, + "eval_runtime": 202.6353, + "eval_samples_per_second": 9.87, + "eval_steps_per_second": 0.311, + "step": 500 + }, + { + "epoch": 0.55, + "grad_norm": 1.8808869421536998, + "learning_rate": 4.929890399187035e-07, + "logits/chosen": 4.082136631011963, + "logits/rejected": 4.399838447570801, + "logps/chosen": -544.2486572265625, + "logps/rejected": -440.2666931152344, + "loss": 0.5926, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.10085509717464447, + "rewards/margins": 0.26802974939346313, + "rewards/rejected": -0.16717462241649628, + "step": 510 + }, + { + "epoch": 0.56, + "grad_norm": 2.251806935640601, + "learning_rate": 4.926445784716363e-07, + "logits/chosen": 4.27434778213501, + "logits/rejected": 4.4805097579956055, + "logps/chosen": -493.55059814453125, + "logps/rejected": -420.85205078125, + "loss": 0.6046, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05821802467107773, + "rewards/margins": 0.24736134707927704, + "rewards/rejected": -0.1891433149576187, + "step": 520 + }, + { + "epoch": 0.58, + "grad_norm": 1.8178650690469547, + "learning_rate": 4.922919829238551e-07, + "logits/chosen": 4.100694179534912, + "logits/rejected": 4.416205406188965, + "logps/chosen": -512.9978637695312, + "logps/rejected": -404.34527587890625, + "loss": 0.588, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.1162143126130104, + "rewards/margins": 0.31595414876937866, + "rewards/rejected": -0.19973981380462646, + "step": 530 + }, + { + "epoch": 0.59, + "grad_norm": 5.360920096472579, + "learning_rate": 4.919312650953137e-07, + "logits/chosen": 4.242461204528809, + "logits/rejected": 4.388332843780518, + "logps/chosen": -487.63592529296875, + "logps/rejected": -440.6812438964844, + "loss": 0.5947, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.026374664157629013, + "rewards/margins": 0.25695887207984924, + "rewards/rejected": -0.23058418929576874, + "step": 540 + }, + { + "epoch": 0.6, + "grad_norm": 2.9490637929554993, + "learning_rate": 4.915624370782462e-07, + "logits/chosen": 4.2471604347229, + "logits/rejected": 4.280003547668457, + "logps/chosen": -500.488525390625, + "logps/rejected": -468.49749755859375, + "loss": 0.6095, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.032730571925640106, + "rewards/margins": 0.19359466433525085, + "rewards/rejected": -0.16086408495903015, + "step": 550 + }, + { + "epoch": 0.61, + "grad_norm": 4.01165068460727, + "learning_rate": 4.911855112367632e-07, + "logits/chosen": 4.124515533447266, + "logits/rejected": 4.417056083679199, + "logps/chosen": -545.4638061523438, + "logps/rejected": -426.03253173828125, + "loss": 0.6051, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.07037229835987091, + "rewards/margins": 0.26376873254776, + "rewards/rejected": -0.19339647889137268, + "step": 560 + }, + { + "epoch": 0.62, + "grad_norm": 3.850942981354808, + "learning_rate": 4.908005002064349e-07, + "logits/chosen": 4.044139862060547, + "logits/rejected": 4.450469970703125, + "logps/chosen": -526.7841796875, + "logps/rejected": -394.8378601074219, + "loss": 0.5936, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.14681175351142883, + "rewards/margins": 0.3266211748123169, + "rewards/rejected": -0.17980939149856567, + "step": 570 + }, + { + "epoch": 0.63, + "grad_norm": 1.9639781340108824, + "learning_rate": 4.904074168938699e-07, + "logits/chosen": 4.0963239669799805, + "logits/rejected": 4.188741207122803, + "logps/chosen": -488.54803466796875, + "logps/rejected": -426.55731201171875, + "loss": 0.5948, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.03685665875673294, + "rewards/margins": 0.2861422598361969, + "rewards/rejected": -0.24928562343120575, + "step": 580 + }, + { + "epoch": 0.64, + "grad_norm": 1.9123674383013602, + "learning_rate": 4.900062744762808e-07, + "logits/chosen": 4.1099090576171875, + "logits/rejected": 4.312173366546631, + "logps/chosen": -595.0274658203125, + "logps/rejected": -468.94915771484375, + "loss": 0.5845, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.10104771703481674, + "rewards/margins": 0.3661288917064667, + "rewards/rejected": -0.2650812268257141, + "step": 590 + }, + { + "epoch": 0.65, + "grad_norm": 6.153053301101733, + "learning_rate": 4.895970864010433e-07, + "logits/chosen": 4.1564226150512695, + "logits/rejected": 4.437946319580078, + "logps/chosen": -579.667724609375, + "logps/rejected": -429.3572692871094, + "loss": 0.5923, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.0929703563451767, + "rewards/margins": 0.3057120740413666, + "rewards/rejected": -0.2127417027950287, + "step": 600 + }, + { + "epoch": 0.65, + "eval_logits/chosen": 4.228888511657715, + "eval_logits/rejected": 4.458848476409912, + "eval_logps/chosen": -529.0886840820312, + "eval_logps/rejected": -440.3538513183594, + "eval_loss": 0.6007751226425171, + "eval_rewards/accuracies": 0.7123016119003296, + "eval_rewards/chosen": 0.14248257875442505, + "eval_rewards/margins": 0.3052762746810913, + "eval_rewards/rejected": -0.16279369592666626, + "eval_runtime": 203.2506, + "eval_samples_per_second": 9.84, + "eval_steps_per_second": 0.31, + "step": 600 + }, + { + "epoch": 0.66, + "grad_norm": 2.546602524377636, + "learning_rate": 4.891798663852454e-07, + "logits/chosen": 4.237885475158691, + "logits/rejected": 4.455267906188965, + "logps/chosen": -548.642822265625, + "logps/rejected": -467.329833984375, + "loss": 0.5907, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.14880751073360443, + "rewards/margins": 0.31532496213912964, + "rewards/rejected": -0.1665174812078476, + "step": 610 + }, + { + "epoch": 0.67, + "grad_norm": 2.0210468870193523, + "learning_rate": 4.887546284152276e-07, + "logits/chosen": 4.149720191955566, + "logits/rejected": 4.5017805099487305, + "logps/chosen": -544.840576171875, + "logps/rejected": -441.256103515625, + "loss": 0.5975, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.13573914766311646, + "rewards/margins": 0.2920631468296051, + "rewards/rejected": -0.15632399916648865, + "step": 620 + }, + { + "epoch": 0.68, + "grad_norm": 2.158709999703385, + "learning_rate": 4.883213867461131e-07, + "logits/chosen": 4.091971397399902, + "logits/rejected": 4.204458236694336, + "logps/chosen": -531.9281005859375, + "logps/rejected": -437.74847412109375, + "loss": 0.5968, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.008038423955440521, + "rewards/margins": 0.20734302699565887, + "rewards/rejected": -0.19930461049079895, + "step": 630 + }, + { + "epoch": 0.69, + "grad_norm": 2.1425960327586475, + "learning_rate": 4.878801559013315e-07, + "logits/chosen": 4.021170616149902, + "logits/rejected": 4.375435829162598, + "logps/chosen": -581.045166015625, + "logps/rejected": -459.45709228515625, + "loss": 0.5813, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.11591031402349472, + "rewards/margins": 0.33884397149086, + "rewards/rejected": -0.22293367981910706, + "step": 640 + }, + { + "epoch": 0.71, + "grad_norm": 2.5470845859360636, + "learning_rate": 4.874309506721307e-07, + "logits/chosen": 4.09341287612915, + "logits/rejected": 4.369636535644531, + "logps/chosen": -503.78125, + "logps/rejected": -418.76080322265625, + "loss": 0.5808, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.08035546541213989, + "rewards/margins": 0.3203769624233246, + "rewards/rejected": -0.2400215119123459, + "step": 650 + }, + { + "epoch": 0.72, + "grad_norm": 2.833418539609218, + "learning_rate": 4.869737861170815e-07, + "logits/chosen": 4.144112586975098, + "logits/rejected": 4.382904529571533, + "logps/chosen": -480.66094970703125, + "logps/rejected": -407.4732971191406, + "loss": 0.5824, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.08363370597362518, + "rewards/margins": 0.3676373362541199, + "rewards/rejected": -0.28400367498397827, + "step": 660 + }, + { + "epoch": 0.73, + "grad_norm": 2.231705054067377, + "learning_rate": 4.865086775615727e-07, + "logits/chosen": 3.999843120574951, + "logits/rejected": 4.188136100769043, + "logps/chosen": -563.2714233398438, + "logps/rejected": -490.22186279296875, + "loss": 0.6089, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08467956632375717, + "rewards/margins": 0.2611342966556549, + "rewards/rejected": -0.3458138406276703, + "step": 670 + }, + { + "epoch": 0.74, + "grad_norm": 2.15719451802752, + "learning_rate": 4.860356405972979e-07, + "logits/chosen": 3.9899182319641113, + "logits/rejected": 4.266640663146973, + "logps/chosen": -571.6624145507812, + "logps/rejected": -439.17822265625, + "loss": 0.5675, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.06711284816265106, + "rewards/margins": 0.3917382061481476, + "rewards/rejected": -0.32462531328201294, + "step": 680 + }, + { + "epoch": 0.75, + "grad_norm": 3.8527902072521663, + "learning_rate": 4.855546910817316e-07, + "logits/chosen": 4.05373477935791, + "logits/rejected": 4.261914253234863, + "logps/chosen": -539.2098388671875, + "logps/rejected": -448.1529235839844, + "loss": 0.5773, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.051774777472019196, + "rewards/margins": 0.4131855070590973, + "rewards/rejected": -0.3614107072353363, + "step": 690 + }, + { + "epoch": 0.76, + "grad_norm": 2.712352840545992, + "learning_rate": 4.850658451375989e-07, + "logits/chosen": 3.863182544708252, + "logits/rejected": 4.379975318908691, + "logps/chosen": -548.5557861328125, + "logps/rejected": -405.55377197265625, + "loss": 0.5899, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.021846016868948936, + "rewards/margins": 0.35745707154273987, + "rewards/rejected": -0.3793030381202698, + "step": 700 + }, + { + "epoch": 0.76, + "eval_logits/chosen": 4.13485050201416, + "eval_logits/rejected": 4.344428539276123, + "eval_logps/chosen": -535.7857055664062, + "eval_logps/rejected": -453.22711181640625, + "eval_loss": 0.5879531502723694, + "eval_rewards/accuracies": 0.7083333134651184, + "eval_rewards/chosen": 0.07551173120737076, + "eval_rewards/margins": 0.36703822016716003, + "eval_rewards/rejected": -0.2915264964103699, + "eval_runtime": 203.5158, + "eval_samples_per_second": 9.827, + "eval_steps_per_second": 0.31, + "step": 700 + }, + { + "epoch": 0.77, + "grad_norm": 3.5270973534849777, + "learning_rate": 4.845691191523343e-07, + "logits/chosen": 4.097344875335693, + "logits/rejected": 4.327819347381592, + "logps/chosen": -538.5155639648438, + "logps/rejected": -463.36199951171875, + "loss": 0.587, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.003706134855747223, + "rewards/margins": 0.4366297125816345, + "rewards/rejected": -0.4329235553741455, + "step": 710 + }, + { + "epoch": 0.78, + "grad_norm": 2.340358336507431, + "learning_rate": 4.840645297775326e-07, + "logits/chosen": 4.059669494628906, + "logits/rejected": 4.2709221839904785, + "logps/chosen": -494.7140197753906, + "logps/rejected": -407.42889404296875, + "loss": 0.5868, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.12765507400035858, + "rewards/margins": 0.26888275146484375, + "rewards/rejected": -0.3965378403663635, + "step": 720 + }, + { + "epoch": 0.79, + "grad_norm": 5.452700718311131, + "learning_rate": 4.835520939283907e-07, + "logits/chosen": 3.9361884593963623, + "logits/rejected": 4.386366367340088, + "logps/chosen": -586.333740234375, + "logps/rejected": -431.07415771484375, + "loss": 0.5551, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.08558530360460281, + "rewards/margins": 0.3690303564071655, + "rewards/rejected": -0.45461565256118774, + "step": 730 + }, + { + "epoch": 0.8, + "grad_norm": 1.8567399586454065, + "learning_rate": 4.830318287831401e-07, + "logits/chosen": 4.077117919921875, + "logits/rejected": 4.315122604370117, + "logps/chosen": -569.4171142578125, + "logps/rejected": -495.67095947265625, + "loss": 0.5767, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.07002132385969162, + "rewards/margins": 0.32364723086357117, + "rewards/rejected": -0.393668532371521, + "step": 740 + }, + { + "epoch": 0.81, + "grad_norm": 1.9588211831618236, + "learning_rate": 4.82503751782472e-07, + "logits/chosen": 4.032423973083496, + "logits/rejected": 4.166065216064453, + "logps/chosen": -554.0896606445312, + "logps/rejected": -491.1756896972656, + "loss": 0.6032, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.0907275378704071, + "rewards/margins": 0.3419603407382965, + "rewards/rejected": -0.4326878488063812, + "step": 750 + }, + { + "epoch": 0.83, + "grad_norm": 3.107580787070029, + "learning_rate": 4.819678806289514e-07, + "logits/chosen": 4.053136825561523, + "logits/rejected": 4.109324932098389, + "logps/chosen": -527.7947998046875, + "logps/rejected": -492.25579833984375, + "loss": 0.572, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1407662332057953, + "rewards/margins": 0.31691235303878784, + "rewards/rejected": -0.4576786160469055, + "step": 760 + }, + { + "epoch": 0.84, + "grad_norm": 2.82290413965199, + "learning_rate": 4.814242332864249e-07, + "logits/chosen": 4.089312553405762, + "logits/rejected": 4.108882904052734, + "logps/chosen": -565.4785766601562, + "logps/rejected": -530.161376953125, + "loss": 0.5773, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10818660259246826, + "rewards/margins": 0.31342652440071106, + "rewards/rejected": -0.4216131269931793, + "step": 770 + }, + { + "epoch": 0.85, + "grad_norm": 3.0371041705697035, + "learning_rate": 4.808728279794178e-07, + "logits/chosen": 4.000454902648926, + "logits/rejected": 4.330757141113281, + "logps/chosen": -619.1341552734375, + "logps/rejected": -447.75372314453125, + "loss": 0.5774, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1276620477437973, + "rewards/margins": 0.40843528509140015, + "rewards/rejected": -0.5360973477363586, + "step": 780 + }, + { + "epoch": 0.86, + "grad_norm": 2.1196217775360657, + "learning_rate": 4.803136831925228e-07, + "logits/chosen": 4.038321495056152, + "logits/rejected": 4.138734817504883, + "logps/chosen": -558.1214599609375, + "logps/rejected": -492.8551330566406, + "loss": 0.5433, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.07264934480190277, + "rewards/margins": 0.48099589347839355, + "rewards/rejected": -0.5536452531814575, + "step": 790 + }, + { + "epoch": 0.87, + "grad_norm": 3.0153920473437883, + "learning_rate": 4.797468176697817e-07, + "logits/chosen": 4.080142021179199, + "logits/rejected": 4.077189922332764, + "logps/chosen": -550.8026123046875, + "logps/rejected": -462.605712890625, + "loss": 0.558, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.23426461219787598, + "rewards/margins": 0.3467225134372711, + "rewards/rejected": -0.5809870958328247, + "step": 800 + }, + { + "epoch": 0.87, + "eval_logits/chosen": 4.064155101776123, + "eval_logits/rejected": 4.270414352416992, + "eval_logps/chosen": -552.982177734375, + "eval_logps/rejected": -477.1144104003906, + "eval_loss": 0.5715343952178955, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -0.09645335376262665, + "eval_rewards/margins": 0.43394559621810913, + "eval_rewards/rejected": -0.530398964881897, + "eval_runtime": 202.7946, + "eval_samples_per_second": 9.862, + "eval_steps_per_second": 0.311, + "step": 800 + }, + { + "epoch": 0.88, + "grad_norm": 3.1242415701188437, + "learning_rate": 4.791722504140557e-07, + "logits/chosen": 3.9817912578582764, + "logits/rejected": 3.9899230003356934, + "logps/chosen": -539.9017944335938, + "logps/rejected": -485.11749267578125, + "loss": 0.5644, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.18518143892288208, + "rewards/margins": 0.39562076330184937, + "rewards/rejected": -0.5808022618293762, + "step": 810 + }, + { + "epoch": 0.89, + "grad_norm": 4.364703585921067, + "learning_rate": 4.785900006863886e-07, + "logits/chosen": 3.8450775146484375, + "logits/rejected": 3.8901209831237793, + "logps/chosen": -592.6529541015625, + "logps/rejected": -499.44488525390625, + "loss": 0.541, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.1347336769104004, + "rewards/margins": 0.4826177954673767, + "rewards/rejected": -0.6173514723777771, + "step": 820 + }, + { + "epoch": 0.9, + "grad_norm": 2.233604071674828, + "learning_rate": 4.780000880053617e-07, + "logits/chosen": 3.895159959793091, + "logits/rejected": 4.225598335266113, + "logps/chosen": -576.2763671875, + "logps/rejected": -453.64306640625, + "loss": 0.5504, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3578268885612488, + "rewards/margins": 0.4358748495578766, + "rewards/rejected": -0.793701708316803, + "step": 830 + }, + { + "epoch": 0.91, + "grad_norm": 2.8171329385172004, + "learning_rate": 4.774025321464393e-07, + "logits/chosen": 3.9733738899230957, + "logits/rejected": 4.090386390686035, + "logps/chosen": -548.554443359375, + "logps/rejected": -460.45391845703125, + "loss": 0.5436, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.22730925679206848, + "rewards/margins": 0.4536939263343811, + "rewards/rejected": -0.6810031533241272, + "step": 840 + }, + { + "epoch": 0.92, + "grad_norm": 2.597837995051063, + "learning_rate": 4.7679735314130554e-07, + "logits/chosen": 3.9798221588134766, + "logits/rejected": 4.143235683441162, + "logps/chosen": -604.2347412109375, + "logps/rejected": -508.7215881347656, + "loss": 0.5669, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15413324534893036, + "rewards/margins": 0.5176432728767395, + "rewards/rejected": -0.6717765927314758, + "step": 850 + }, + { + "epoch": 0.93, + "grad_norm": 2.8211401150825086, + "learning_rate": 4.761845712771928e-07, + "logits/chosen": 4.0112457275390625, + "logits/rejected": 3.9924988746643066, + "logps/chosen": -494.43121337890625, + "logps/rejected": -462.6639099121094, + "loss": 0.5487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3942197859287262, + "rewards/margins": 0.46237269043922424, + "rewards/rejected": -0.8565924763679504, + "step": 860 + }, + { + "epoch": 0.94, + "grad_norm": 2.726543592078615, + "learning_rate": 4.755642070962019e-07, + "logits/chosen": 4.057265281677246, + "logits/rejected": 4.079537868499756, + "logps/chosen": -638.6723022460938, + "logps/rejected": -520.6543579101562, + "loss": 0.5569, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.21904997527599335, + "rewards/margins": 0.5892314910888672, + "rewards/rejected": -0.8082815408706665, + "step": 870 + }, + { + "epoch": 0.96, + "grad_norm": 2.42602462885076, + "learning_rate": 4.749362813946134e-07, + "logits/chosen": 4.092899322509766, + "logits/rejected": 4.094930648803711, + "logps/chosen": -548.0364379882812, + "logps/rejected": -462.27374267578125, + "loss": 0.5519, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.1817770004272461, + "rewards/margins": 0.5349105000495911, + "rewards/rejected": -0.7166875004768372, + "step": 880 + }, + { + "epoch": 0.97, + "grad_norm": 3.503506476250686, + "learning_rate": 4.743008152221904e-07, + "logits/chosen": 3.9905002117156982, + "logits/rejected": 4.0423197746276855, + "logps/chosen": -543.5731201171875, + "logps/rejected": -484.5904846191406, + "loss": 0.5581, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.38738584518432617, + "rewards/margins": 0.41433319449424744, + "rewards/rejected": -0.801719069480896, + "step": 890 + }, + { + "epoch": 0.98, + "grad_norm": 2.6298968847563513, + "learning_rate": 4.7365782988147297e-07, + "logits/chosen": 3.9976742267608643, + "logits/rejected": 4.120612144470215, + "logps/chosen": -532.8512573242188, + "logps/rejected": -476.075927734375, + "loss": 0.5495, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3852420449256897, + "rewards/margins": 0.4661009907722473, + "rewards/rejected": -0.851343035697937, + "step": 900 + }, + { + "epoch": 0.98, + "eval_logits/chosen": 4.001543998718262, + "eval_logits/rejected": 4.197599411010742, + "eval_logps/chosen": -569.9209594726562, + "eval_logps/rejected": -500.8484191894531, + "eval_loss": 0.5551918745040894, + "eval_rewards/accuracies": 0.7341269850730896, + "eval_rewards/chosen": -0.26584070920944214, + "eval_rewards/margins": 0.5018988251686096, + "eval_rewards/rejected": -0.7677395343780518, + "eval_runtime": 202.8629, + "eval_samples_per_second": 9.859, + "eval_steps_per_second": 0.311, + "step": 900 + }, + { + "epoch": 0.99, + "grad_norm": 2.0947908923652254, + "learning_rate": 4.73007346927064e-07, + "logits/chosen": 3.878185749053955, + "logits/rejected": 4.080098628997803, + "logps/chosen": -557.4278564453125, + "logps/rejected": -478.7586975097656, + "loss": 0.5554, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.3504122793674469, + "rewards/margins": 0.43874025344848633, + "rewards/rejected": -0.7891525030136108, + "step": 910 + }, + { + "epoch": 1.0, + "grad_norm": 2.61481570805474, + "learning_rate": 4.7234938816490643e-07, + "logits/chosen": 3.984571933746338, + "logits/rejected": 4.107733249664307, + "logps/chosen": -548.0833740234375, + "logps/rejected": -495.656005859375, + "loss": 0.5533, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2844906747341156, + "rewards/margins": 0.4039286673069, + "rewards/rejected": -0.6884194016456604, + "step": 920 + }, + { + "epoch": 1.01, + "grad_norm": 2.639868915389026, + "learning_rate": 4.7168397565155264e-07, + "logits/chosen": 3.9218106269836426, + "logits/rejected": 4.216797828674316, + "logps/chosen": -566.1287231445312, + "logps/rejected": -468.50506591796875, + "loss": 0.5275, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.3953813314437866, + "rewards/margins": 0.6424225568771362, + "rewards/rejected": -1.0378040075302124, + "step": 930 + }, + { + "epoch": 1.02, + "grad_norm": 3.72502140934014, + "learning_rate": 4.710111316934248e-07, + "logits/chosen": 4.068375110626221, + "logits/rejected": 4.248932838439941, + "logps/chosen": -568.8611450195312, + "logps/rejected": -518.5789184570312, + "loss": 0.5293, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.41718488931655884, + "rewards/margins": 0.45708101987838745, + "rewards/rejected": -0.8742658495903015, + "step": 940 + }, + { + "epoch": 1.03, + "grad_norm": 2.3680192609419124, + "learning_rate": 4.7033087884606713e-07, + "logits/chosen": 3.8009941577911377, + "logits/rejected": 3.9223339557647705, + "logps/chosen": -525.62548828125, + "logps/rejected": -463.38397216796875, + "loss": 0.5481, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5579544305801392, + "rewards/margins": 0.34062978625297546, + "rewards/rejected": -0.898584246635437, + "step": 950 + }, + { + "epoch": 1.04, + "grad_norm": 2.1297712756488445, + "learning_rate": 4.6964323991338973e-07, + "logits/chosen": 3.92254900932312, + "logits/rejected": 4.1782660484313965, + "logps/chosen": -588.7681274414062, + "logps/rejected": -488.18597412109375, + "loss": 0.5315, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.37962308526039124, + "rewards/margins": 0.49557191133499146, + "rewards/rejected": -0.8751950263977051, + "step": 960 + }, + { + "epoch": 1.05, + "grad_norm": 3.11704708081821, + "learning_rate": 4.6894823794690436e-07, + "logits/chosen": 3.853797435760498, + "logits/rejected": 3.8071129322052, + "logps/chosen": -546.7620239257812, + "logps/rejected": -489.9036560058594, + "loss": 0.5353, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6293502449989319, + "rewards/margins": 0.4502403736114502, + "rewards/rejected": -1.0795905590057373, + "step": 970 + }, + { + "epoch": 1.06, + "grad_norm": 2.595678646905618, + "learning_rate": 4.6824589624495136e-07, + "logits/chosen": 3.8109824657440186, + "logits/rejected": 4.0372538566589355, + "logps/chosen": -553.0970458984375, + "logps/rejected": -498.0740661621094, + "loss": 0.5355, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4165034294128418, + "rewards/margins": 0.5991848111152649, + "rewards/rejected": -1.0156883001327515, + "step": 980 + }, + { + "epoch": 1.07, + "grad_norm": 2.2526945379801053, + "learning_rate": 4.6753623835191903e-07, + "logits/chosen": 3.806671142578125, + "logits/rejected": 3.963538408279419, + "logps/chosen": -536.47705078125, + "logps/rejected": -471.6568298339844, + "loss": 0.5257, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.30763205885887146, + "rewards/margins": 0.5957274436950684, + "rewards/rejected": -0.903359591960907, + "step": 990 + }, + { + "epoch": 1.09, + "grad_norm": 3.2721785743598715, + "learning_rate": 4.668192880574537e-07, + "logits/chosen": 3.9289004802703857, + "logits/rejected": 4.014190673828125, + "logps/chosen": -570.447265625, + "logps/rejected": -481.65228271484375, + "loss": 0.5124, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.318829745054245, + "rewards/margins": 0.5781766772270203, + "rewards/rejected": -0.8970065116882324, + "step": 1000 + }, + { + "epoch": 1.09, + "eval_logits/chosen": 3.9124913215637207, + "eval_logits/rejected": 4.095924377441406, + "eval_logps/chosen": -582.042724609375, + "eval_logps/rejected": -518.0128784179688, + "eval_loss": 0.5473096966743469, + "eval_rewards/accuracies": 0.7321428656578064, + "eval_rewards/chosen": -0.38705816864967346, + "eval_rewards/margins": 0.5523259043693542, + "eval_rewards/rejected": -0.9393841624259949, + "eval_runtime": 203.1299, + "eval_samples_per_second": 9.846, + "eval_steps_per_second": 0.31, + "step": 1000 + }, + { + "epoch": 1.1, + "grad_norm": 2.4735244526194125, + "learning_rate": 4.6609506939566336e-07, + "logits/chosen": 3.8594024181365967, + "logits/rejected": 3.8261044025421143, + "logps/chosen": -606.83251953125, + "logps/rejected": -555.4110717773438, + "loss": 0.5417, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.40992632508277893, + "rewards/margins": 0.664315938949585, + "rewards/rejected": -1.0742422342300415, + "step": 1010 + }, + { + "epoch": 1.11, + "grad_norm": 3.452879055757116, + "learning_rate": 4.653636066443105e-07, + "logits/chosen": 3.8114733695983887, + "logits/rejected": 3.984539747238159, + "logps/chosen": -678.3446044921875, + "logps/rejected": -560.6809692382812, + "loss": 0.5451, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.4688519835472107, + "rewards/margins": 0.5835325121879578, + "rewards/rejected": -1.0523844957351685, + "step": 1020 + }, + { + "epoch": 1.12, + "grad_norm": 2.3604158315865336, + "learning_rate": 4.646249243239996e-07, + "logits/chosen": 3.7811264991760254, + "logits/rejected": 3.891242504119873, + "logps/chosen": -549.7884521484375, + "logps/rejected": -494.5982971191406, + "loss": 0.5057, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.48677635192871094, + "rewards/margins": 0.5802093744277954, + "rewards/rejected": -1.066985845565796, + "step": 1030 + }, + { + "epoch": 1.13, + "grad_norm": 2.911695118266272, + "learning_rate": 4.6387904719735426e-07, + "logits/chosen": 3.8690688610076904, + "logits/rejected": 4.009054660797119, + "logps/chosen": -642.6327514648438, + "logps/rejected": -560.8517456054688, + "loss": 0.5572, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3320097327232361, + "rewards/margins": 0.535895824432373, + "rewards/rejected": -0.8679056167602539, + "step": 1040 + }, + { + "epoch": 1.14, + "grad_norm": 2.781369434499782, + "learning_rate": 4.631260002681876e-07, + "logits/chosen": 3.9193673133850098, + "logits/rejected": 3.9731242656707764, + "logps/chosen": -576.6608276367188, + "logps/rejected": -497.3514709472656, + "loss": 0.5372, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.45688796043395996, + "rewards/margins": 0.5704649686813354, + "rewards/rejected": -1.0273528099060059, + "step": 1050 + }, + { + "epoch": 1.15, + "grad_norm": 3.027185161911988, + "learning_rate": 4.6236580878066354e-07, + "logits/chosen": 3.7078070640563965, + "logits/rejected": 3.9149177074432373, + "logps/chosen": -577.9295043945312, + "logps/rejected": -520.2188110351562, + "loss": 0.5307, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.45761507749557495, + "rewards/margins": 0.569358766078949, + "rewards/rejected": -1.0269739627838135, + "step": 1060 + }, + { + "epoch": 1.16, + "grad_norm": 2.81530979384369, + "learning_rate": 4.6159849821845134e-07, + "logits/chosen": 3.8268768787384033, + "logits/rejected": 3.9492759704589844, + "logps/chosen": -558.7430419921875, + "logps/rejected": -495.2325134277344, + "loss": 0.5187, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4992932379245758, + "rewards/margins": 0.5600478053092957, + "rewards/rejected": -1.0593410730361938, + "step": 1070 + }, + { + "epoch": 1.17, + "grad_norm": 3.004411745900334, + "learning_rate": 4.6082409430387036e-07, + "logits/chosen": 3.7030282020568848, + "logits/rejected": 3.8176980018615723, + "logps/chosen": -637.4542846679688, + "logps/rejected": -537.9615478515625, + "loss": 0.5298, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.4201931357383728, + "rewards/margins": 0.6639485359191895, + "rewards/rejected": -1.084141731262207, + "step": 1080 + }, + { + "epoch": 1.18, + "grad_norm": 2.608455246691224, + "learning_rate": 4.600426229970287e-07, + "logits/chosen": 3.752833843231201, + "logits/rejected": 3.9438960552215576, + "logps/chosen": -566.6677856445312, + "logps/rejected": -518.1152954101562, + "loss": 0.5307, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.419950008392334, + "rewards/margins": 0.5896099805831909, + "rewards/rejected": -1.009559988975525, + "step": 1090 + }, + { + "epoch": 1.19, + "grad_norm": 3.071459738601912, + "learning_rate": 4.59254110494952e-07, + "logits/chosen": 3.917219638824463, + "logits/rejected": 4.227973937988281, + "logps/chosen": -599.9447021484375, + "logps/rejected": -491.8515625, + "loss": 0.5322, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.3520606756210327, + "rewards/margins": 0.5913389921188354, + "rewards/rejected": -0.9433996081352234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_logits/chosen": 3.871466636657715, + "eval_logits/rejected": 4.0435919761657715, + "eval_logps/chosen": -579.7517700195312, + "eval_logps/rejected": -518.7011108398438, + "eval_loss": 0.5400219559669495, + "eval_rewards/accuracies": 0.7579365372657776, + "eval_rewards/chosen": -0.3641493618488312, + "eval_rewards/margins": 0.5821177363395691, + "eval_rewards/rejected": -0.9462669491767883, + "eval_runtime": 202.8047, + "eval_samples_per_second": 9.862, + "eval_steps_per_second": 0.311, + "step": 1100 + }, + { + "epoch": 1.21, + "grad_norm": 2.3553195362992825, + "learning_rate": 4.5845858323070635e-07, + "logits/chosen": 3.76731538772583, + "logits/rejected": 3.8337559700012207, + "logps/chosen": -547.681884765625, + "logps/rejected": -493.3389587402344, + "loss": 0.5266, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.4557339549064636, + "rewards/margins": 0.5228798985481262, + "rewards/rejected": -0.9786139726638794, + "step": 1110 + }, + { + "epoch": 1.22, + "grad_norm": 2.3470086606185214, + "learning_rate": 4.5765606787251107e-07, + "logits/chosen": 3.7126567363739014, + "logits/rejected": 4.070954322814941, + "logps/chosen": -620.4693603515625, + "logps/rejected": -516.2987670898438, + "loss": 0.5152, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4025896489620209, + "rewards/margins": 0.788905918598175, + "rewards/rejected": -1.191495656967163, + "step": 1120 + }, + { + "epoch": 1.23, + "grad_norm": 4.949775862620086, + "learning_rate": 4.5684659132284564e-07, + "logits/chosen": 3.6738791465759277, + "logits/rejected": 3.7427947521209717, + "logps/chosen": -600.7318725585938, + "logps/rejected": -527.5272216796875, + "loss": 0.5198, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.552603006362915, + "rewards/margins": 0.5690030455589294, + "rewards/rejected": -1.1216061115264893, + "step": 1130 + }, + { + "epoch": 1.24, + "grad_norm": 3.206426056624094, + "learning_rate": 4.5603018071754713e-07, + "logits/chosen": 3.661280393600464, + "logits/rejected": 3.9159374237060547, + "logps/chosen": -653.1373291015625, + "logps/rejected": -562.4027709960938, + "loss": 0.5258, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4949869215488434, + "rewards/margins": 0.6851035356521606, + "rewards/rejected": -1.180090308189392, + "step": 1140 + }, + { + "epoch": 1.25, + "grad_norm": 2.2769394974170676, + "learning_rate": 4.55206863424901e-07, + "logits/chosen": 3.6856913566589355, + "logits/rejected": 4.0145063400268555, + "logps/chosen": -568.9828491210938, + "logps/rejected": -486.29656982421875, + "loss": 0.5186, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47184038162231445, + "rewards/margins": 0.7080105543136597, + "rewards/rejected": -1.1798509359359741, + "step": 1150 + }, + { + "epoch": 1.26, + "grad_norm": 2.359734545391228, + "learning_rate": 4.5437666704472355e-07, + "logits/chosen": 3.7498373985290527, + "logits/rejected": 3.9770877361297607, + "logps/chosen": -580.9403686523438, + "logps/rejected": -529.2962646484375, + "loss": 0.529, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.5413922071456909, + "rewards/margins": 0.5566983222961426, + "rewards/rejected": -1.0980905294418335, + "step": 1160 + }, + { + "epoch": 1.27, + "grad_norm": 2.89031946703898, + "learning_rate": 4.535396194074366e-07, + "logits/chosen": 3.727942943572998, + "logits/rejected": 3.9232964515686035, + "logps/chosen": -600.3663330078125, + "logps/rejected": -554.5781860351562, + "loss": 0.5045, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.3358447551727295, + "rewards/margins": 0.7004222869873047, + "rewards/rejected": -1.0362670421600342, + "step": 1170 + }, + { + "epoch": 1.28, + "grad_norm": 2.8945187135010166, + "learning_rate": 4.526957485731344e-07, + "logits/chosen": 3.6489124298095703, + "logits/rejected": 3.7614033222198486, + "logps/chosen": -664.4152221679688, + "logps/rejected": -542.4786987304688, + "loss": 0.5471, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.45210176706314087, + "rewards/margins": 0.5391023755073547, + "rewards/rejected": -0.9912041425704956, + "step": 1180 + }, + { + "epoch": 1.29, + "grad_norm": 2.67596171281522, + "learning_rate": 4.518450828306436e-07, + "logits/chosen": 3.7186882495880127, + "logits/rejected": 3.734210968017578, + "logps/chosen": -583.5777587890625, + "logps/rejected": -533.0770263671875, + "loss": 0.5204, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.579624354839325, + "rewards/margins": 0.6160343289375305, + "rewards/rejected": -1.1956586837768555, + "step": 1190 + }, + { + "epoch": 1.3, + "grad_norm": 2.4346554508723615, + "learning_rate": 4.509876506965742e-07, + "logits/chosen": 3.6612792015075684, + "logits/rejected": 3.8886547088623047, + "logps/chosen": -599.6268310546875, + "logps/rejected": -515.5652465820312, + "loss": 0.5281, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.605526864528656, + "rewards/margins": 0.708820104598999, + "rewards/rejected": -1.3143469095230103, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_logits/chosen": 3.7841575145721436, + "eval_logits/rejected": 3.9368276596069336, + "eval_logps/chosen": -596.7364501953125, + "eval_logps/rejected": -539.057861328125, + "eval_loss": 0.534388542175293, + "eval_rewards/accuracies": 0.7460317611694336, + "eval_rewards/chosen": -0.5339952707290649, + "eval_rewards/margins": 0.6158384680747986, + "eval_rewards/rejected": -1.1498335599899292, + "eval_runtime": 203.2584, + "eval_samples_per_second": 9.84, + "eval_steps_per_second": 0.31, + "step": 1200 + }, + { + "epoch": 1.31, + "grad_norm": 3.695970636776523, + "learning_rate": 4.501234809143637e-07, + "logits/chosen": 3.718877077102661, + "logits/rejected": 3.7873053550720215, + "logps/chosen": -593.6304321289062, + "logps/rejected": -524.5706787109375, + "loss": 0.5043, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5071628093719482, + "rewards/margins": 0.7255369424819946, + "rewards/rejected": -1.2326997518539429, + "step": 1210 + }, + { + "epoch": 1.32, + "grad_norm": 2.3954534729791033, + "learning_rate": 4.492526024533143e-07, + "logits/chosen": 3.745767116546631, + "logits/rejected": 3.882204532623291, + "logps/chosen": -592.2398681640625, + "logps/rejected": -523.0452270507812, + "loss": 0.5221, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.566536545753479, + "rewards/margins": 0.5222768783569336, + "rewards/rejected": -1.0888134241104126, + "step": 1220 + }, + { + "epoch": 1.34, + "grad_norm": 3.0519224807574847, + "learning_rate": 4.4837504450762067e-07, + "logits/chosen": 3.8677144050598145, + "logits/rejected": 3.9290356636047363, + "logps/chosen": -618.3121948242188, + "logps/rejected": -562.8489990234375, + "loss": 0.5103, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35622692108154297, + "rewards/margins": 0.6840597987174988, + "rewards/rejected": -1.0402867794036865, + "step": 1230 + }, + { + "epoch": 1.35, + "grad_norm": 2.761352538082151, + "learning_rate": 4.4749083649539204e-07, + "logits/chosen": 3.855912685394287, + "logits/rejected": 3.8187203407287598, + "logps/chosen": -574.2819213867188, + "logps/rejected": -547.544189453125, + "loss": 0.5093, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.48297685384750366, + "rewards/margins": 0.7096476554870605, + "rewards/rejected": -1.1926246881484985, + "step": 1240 + }, + { + "epoch": 1.36, + "grad_norm": 2.44118330758894, + "learning_rate": 4.466000080576659e-07, + "logits/chosen": 3.6384823322296143, + "logits/rejected": 3.8135483264923096, + "logps/chosen": -582.2689208984375, + "logps/rejected": -531.7244873046875, + "loss": 0.5101, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6522843241691589, + "rewards/margins": 0.6011296510696411, + "rewards/rejected": -1.2534139156341553, + "step": 1250 + }, + { + "epoch": 1.37, + "grad_norm": 3.0536070990003177, + "learning_rate": 4.4570258905741417e-07, + "logits/chosen": 3.7012696266174316, + "logits/rejected": 3.7779288291931152, + "logps/chosen": -582.983642578125, + "logps/rejected": -548.3180541992188, + "loss": 0.5103, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6485563516616821, + "rewards/margins": 0.5782519578933716, + "rewards/rejected": -1.2268083095550537, + "step": 1260 + }, + { + "epoch": 1.38, + "grad_norm": 2.4433145304612522, + "learning_rate": 4.447986095785421e-07, + "logits/chosen": 3.6619350910186768, + "logits/rejected": 3.8520569801330566, + "logps/chosen": -629.6549682617188, + "logps/rejected": -559.6665649414062, + "loss": 0.5246, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5843095183372498, + "rewards/margins": 0.6479975581169128, + "rewards/rejected": -1.2323070764541626, + "step": 1270 + }, + { + "epoch": 1.39, + "grad_norm": 2.7685908669959094, + "learning_rate": 4.4388809992487996e-07, + "logits/chosen": 3.6826343536376953, + "logits/rejected": 3.9680213928222656, + "logps/chosen": -596.3795776367188, + "logps/rejected": -489.682861328125, + "loss": 0.521, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5391322374343872, + "rewards/margins": 0.6415873765945435, + "rewards/rejected": -1.1807196140289307, + "step": 1280 + }, + { + "epoch": 1.4, + "grad_norm": 2.462386287502182, + "learning_rate": 4.4297109061916725e-07, + "logits/chosen": 3.736112117767334, + "logits/rejected": 3.9865105152130127, + "logps/chosen": -615.0040893554688, + "logps/rejected": -472.54864501953125, + "loss": 0.519, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4535880982875824, + "rewards/margins": 0.651738166809082, + "rewards/rejected": -1.1053262948989868, + "step": 1290 + }, + { + "epoch": 1.41, + "grad_norm": 2.618384827610224, + "learning_rate": 4.420476124020291e-07, + "logits/chosen": 3.7563376426696777, + "logits/rejected": 3.8841323852539062, + "logps/chosen": -584.8636474609375, + "logps/rejected": -542.523681640625, + "loss": 0.5063, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.47505125403404236, + "rewards/margins": 0.5914508104324341, + "rewards/rejected": -1.0665019750595093, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_logits/chosen": 3.8498694896698, + "eval_logits/rejected": 4.013454914093018, + "eval_logps/chosen": -580.8731079101562, + "eval_logps/rejected": -523.8220825195312, + "eval_loss": 0.5296512246131897, + "eval_rewards/accuracies": 0.7579365372657776, + "eval_rewards/chosen": -0.37536194920539856, + "eval_rewards/margins": 0.6221145391464233, + "eval_rewards/rejected": -0.9974763989448547, + "eval_runtime": 202.8973, + "eval_samples_per_second": 9.857, + "eval_steps_per_second": 0.311, + "step": 1300 + }, + { + "epoch": 1.42, + "grad_norm": 5.907008056312174, + "learning_rate": 4.411176962309461e-07, + "logits/chosen": 3.8201069831848145, + "logits/rejected": 3.972466230392456, + "logps/chosen": -662.3941650390625, + "logps/rejected": -575.9456176757812, + "loss": 0.4931, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.3811381459236145, + "rewards/margins": 0.6734236478805542, + "rewards/rejected": -1.0545618534088135, + "step": 1310 + }, + { + "epoch": 1.43, + "grad_norm": 3.6381197531260185, + "learning_rate": 4.4018137327921633e-07, + "logits/chosen": 3.693995714187622, + "logits/rejected": 3.833395004272461, + "logps/chosen": -566.2265625, + "logps/rejected": -505.2315979003906, + "loss": 0.5139, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.516391396522522, + "rewards/margins": 0.6587409973144531, + "rewards/rejected": -1.1751322746276855, + "step": 1320 + }, + { + "epoch": 1.44, + "grad_norm": 3.1695698040135647, + "learning_rate": 4.3923867493491057e-07, + "logits/chosen": 3.733046770095825, + "logits/rejected": 4.046479225158691, + "logps/chosen": -604.51513671875, + "logps/rejected": -487.60638427734375, + "loss": 0.5011, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.321003794670105, + "rewards/margins": 0.698340117931366, + "rewards/rejected": -1.0193439722061157, + "step": 1330 + }, + { + "epoch": 1.45, + "grad_norm": 2.8032158154033904, + "learning_rate": 4.3828963279981994e-07, + "logits/chosen": 3.8844008445739746, + "logits/rejected": 4.010292053222656, + "logps/chosen": -568.9833984375, + "logps/rejected": -535.6318969726562, + "loss": 0.5126, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.46742120385169983, + "rewards/margins": 0.6064526438713074, + "rewards/rejected": -1.0738738775253296, + "step": 1340 + }, + { + "epoch": 1.47, + "grad_norm": 2.5088126336091148, + "learning_rate": 4.3733427868839645e-07, + "logits/chosen": 3.8523497581481934, + "logits/rejected": 3.7402586936950684, + "logps/chosen": -551.6283569335938, + "logps/rejected": -521.6468505859375, + "loss": 0.5311, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.49471384286880493, + "rewards/margins": 0.6895469427108765, + "rewards/rejected": -1.184260606765747, + "step": 1350 + }, + { + "epoch": 1.48, + "grad_norm": 2.9949133381363153, + "learning_rate": 4.3637264462668664e-07, + "logits/chosen": 3.510672092437744, + "logits/rejected": 3.69635009765625, + "logps/chosen": -603.7630615234375, + "logps/rejected": -495.576904296875, + "loss": 0.5253, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.48227667808532715, + "rewards/margins": 0.6299175024032593, + "rewards/rejected": -1.112194299697876, + "step": 1360 + }, + { + "epoch": 1.49, + "grad_norm": 2.9594466747673804, + "learning_rate": 4.35404762851258e-07, + "logits/chosen": 3.5656349658966064, + "logits/rejected": 3.833620071411133, + "logps/chosen": -549.580078125, + "logps/rejected": -528.0352783203125, + "loss": 0.5199, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.5975288152694702, + "rewards/margins": 0.6607010364532471, + "rewards/rejected": -1.2582299709320068, + "step": 1370 + }, + { + "epoch": 1.5, + "grad_norm": 2.8641396810139015, + "learning_rate": 4.34430665808118e-07, + "logits/chosen": 3.7431271076202393, + "logits/rejected": 3.898761034011841, + "logps/chosen": -635.0924682617188, + "logps/rejected": -550.517578125, + "loss": 0.5135, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.41983968019485474, + "rewards/margins": 0.6735917329788208, + "rewards/rejected": -1.0934313535690308, + "step": 1380 + }, + { + "epoch": 1.51, + "grad_norm": 2.6138220257213933, + "learning_rate": 4.3345038615162687e-07, + "logits/chosen": 3.7801513671875, + "logits/rejected": 3.9459152221679688, + "logps/chosen": -618.5330810546875, + "logps/rejected": -526.7120361328125, + "loss": 0.5401, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.41865187883377075, + "rewards/margins": 0.6133507490158081, + "rewards/rejected": -1.0320026874542236, + "step": 1390 + }, + { + "epoch": 1.52, + "grad_norm": 2.3348133744227595, + "learning_rate": 4.324639567434026e-07, + "logits/chosen": 3.724447727203369, + "logits/rejected": 3.9371161460876465, + "logps/chosen": -607.0804443359375, + "logps/rejected": -523.1050415039062, + "loss": 0.5073, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.33897972106933594, + "rewards/margins": 0.7490689158439636, + "rewards/rejected": -1.0880486965179443, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_logits/chosen": 3.784623384475708, + "eval_logits/rejected": 3.9400875568389893, + "eval_logps/chosen": -581.5236206054688, + "eval_logps/rejected": -527.0737915039062, + "eval_loss": 0.5216101408004761, + "eval_rewards/accuracies": 0.7757936716079712, + "eval_rewards/chosen": -0.3818674385547638, + "eval_rewards/margins": 0.6481255888938904, + "eval_rewards/rejected": -1.029992938041687, + "eval_runtime": 202.8107, + "eval_samples_per_second": 9.861, + "eval_steps_per_second": 0.311, + "step": 1400 + }, + { + "epoch": 1.53, + "grad_norm": 3.5770222334742705, + "learning_rate": 4.314714106512195e-07, + "logits/chosen": 3.729435443878174, + "logits/rejected": 4.007909774780273, + "logps/chosen": -589.6553955078125, + "logps/rejected": -514.7840576171875, + "loss": 0.5088, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.4252251982688904, + "rewards/margins": 0.7190917134284973, + "rewards/rejected": -1.1443169116973877, + "step": 1410 + }, + { + "epoch": 1.54, + "grad_norm": 3.563305957531453, + "learning_rate": 4.304727811478995e-07, + "logits/chosen": 3.6431992053985596, + "logits/rejected": 3.8974602222442627, + "logps/chosen": -645.6729736328125, + "logps/rejected": -540.0687255859375, + "loss": 0.5065, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.3529585897922516, + "rewards/margins": 0.7727079391479492, + "rewards/rejected": -1.1256663799285889, + "step": 1420 + }, + { + "epoch": 1.55, + "grad_norm": 2.58757534266237, + "learning_rate": 4.294681017101972e-07, + "logits/chosen": 3.654217481613159, + "logits/rejected": 3.7179622650146484, + "logps/chosen": -589.9276123046875, + "logps/rejected": -514.4965209960938, + "loss": 0.4785, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5733104944229126, + "rewards/margins": 0.6313902139663696, + "rewards/rejected": -1.2047007083892822, + "step": 1430 + }, + { + "epoch": 1.56, + "grad_norm": 3.0022317826313167, + "learning_rate": 4.2845740601767697e-07, + "logits/chosen": 3.5225296020507812, + "logits/rejected": 3.619579315185547, + "logps/chosen": -579.6800537109375, + "logps/rejected": -494.28399658203125, + "loss": 0.496, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4841237962245941, + "rewards/margins": 0.6999977231025696, + "rewards/rejected": -1.1841213703155518, + "step": 1440 + }, + { + "epoch": 1.57, + "grad_norm": 3.2680642270849556, + "learning_rate": 4.2744072795158446e-07, + "logits/chosen": 3.601937770843506, + "logits/rejected": 3.6645870208740234, + "logps/chosen": -604.373046875, + "logps/rejected": -517.3341064453125, + "loss": 0.5007, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5202440023422241, + "rewards/margins": 0.5371121168136597, + "rewards/rejected": -1.0573561191558838, + "step": 1450 + }, + { + "epoch": 1.59, + "grad_norm": 2.7738096169935598, + "learning_rate": 4.264181015937105e-07, + "logits/chosen": 3.6634833812713623, + "logits/rejected": 3.7749035358428955, + "logps/chosen": -604.2745361328125, + "logps/rejected": -527.6934814453125, + "loss": 0.4764, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.5378643274307251, + "rewards/margins": 0.8417774438858032, + "rewards/rejected": -1.3796416521072388, + "step": 1460 + }, + { + "epoch": 1.6, + "grad_norm": 2.881528548527839, + "learning_rate": 4.2538956122524874e-07, + "logits/chosen": 3.3788399696350098, + "logits/rejected": 3.563056230545044, + "logps/chosen": -621.4329833984375, + "logps/rejected": -560.8185424804688, + "loss": 0.4908, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7095000147819519, + "rewards/margins": 0.7638419270515442, + "rewards/rejected": -1.473341941833496, + "step": 1470 + }, + { + "epoch": 1.61, + "grad_norm": 2.777025235666112, + "learning_rate": 4.2435514132564645e-07, + "logits/chosen": 3.482332944869995, + "logits/rejected": 3.697258472442627, + "logps/chosen": -608.84326171875, + "logps/rejected": -499.27606201171875, + "loss": 0.4907, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.724181592464447, + "rewards/margins": 0.7695054411888123, + "rewards/rejected": -1.4936869144439697, + "step": 1480 + }, + { + "epoch": 1.62, + "grad_norm": 2.935178744516972, + "learning_rate": 4.233148765714487e-07, + "logits/chosen": 3.664395570755005, + "logits/rejected": 3.7082161903381348, + "logps/chosen": -598.2871704101562, + "logps/rejected": -573.3458862304688, + "loss": 0.4949, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.6274911761283875, + "rewards/margins": 0.8205268979072571, + "rewards/rejected": -1.4480180740356445, + "step": 1490 + }, + { + "epoch": 1.63, + "grad_norm": 3.1100698844265766, + "learning_rate": 4.222688018351357e-07, + "logits/chosen": 3.55896258354187, + "logits/rejected": 3.7258670330047607, + "logps/chosen": -677.6727294921875, + "logps/rejected": -549.7400512695312, + "loss": 0.5156, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.5775496959686279, + "rewards/margins": 0.7107436060905457, + "rewards/rejected": -1.2882932424545288, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_logits/chosen": 3.6677606105804443, + "eval_logits/rejected": 3.78678560256958, + "eval_logps/chosen": -600.8123168945312, + "eval_logps/rejected": -552.3165893554688, + "eval_loss": 0.5176644325256348, + "eval_rewards/accuracies": 0.7559523582458496, + "eval_rewards/chosen": -0.5747539401054382, + "eval_rewards/margins": 0.7076672315597534, + "eval_rewards/rejected": -1.2824209928512573, + "eval_runtime": 203.422, + "eval_samples_per_second": 9.832, + "eval_steps_per_second": 0.31, + "step": 1500 + }, + { + "epoch": 1.64, + "grad_norm": 3.459724119197008, + "learning_rate": 4.212169521839541e-07, + "logits/chosen": 3.597620725631714, + "logits/rejected": 3.7718772888183594, + "logps/chosen": -646.1989135742188, + "logps/rejected": -560.6222534179688, + "loss": 0.4988, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.3835225999355316, + "rewards/margins": 0.8813455700874329, + "rewards/rejected": -1.2648680210113525, + "step": 1510 + }, + { + "epoch": 1.65, + "grad_norm": 2.539862474644881, + "learning_rate": 4.2015936287874103e-07, + "logits/chosen": 3.594446897506714, + "logits/rejected": 3.7397289276123047, + "logps/chosen": -585.3614501953125, + "logps/rejected": -507.7784729003906, + "loss": 0.5067, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.6375758051872253, + "rewards/margins": 0.5888264775276184, + "rewards/rejected": -1.2264022827148438, + "step": 1520 + }, + { + "epoch": 1.66, + "grad_norm": 3.688641112230014, + "learning_rate": 4.1909606937274253e-07, + "logits/chosen": 3.694026231765747, + "logits/rejected": 3.582556962966919, + "logps/chosen": -593.2966918945312, + "logps/rejected": -550.7011108398438, + "loss": 0.494, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.5636107325553894, + "rewards/margins": 0.7417998313903809, + "rewards/rejected": -1.3054105043411255, + "step": 1530 + }, + { + "epoch": 1.67, + "grad_norm": 3.517111037766342, + "learning_rate": 4.180271073104249e-07, + "logits/chosen": 3.6457290649414062, + "logits/rejected": 3.6459174156188965, + "logps/chosen": -600.4559326171875, + "logps/rejected": -556.584228515625, + "loss": 0.4925, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.6495497226715088, + "rewards/margins": 0.7601557970046997, + "rewards/rejected": -1.409705400466919, + "step": 1540 + }, + { + "epoch": 1.68, + "grad_norm": 3.2263768026848734, + "learning_rate": 4.169525125262794e-07, + "logits/chosen": 3.729513645172119, + "logits/rejected": 3.8889973163604736, + "logps/chosen": -621.041748046875, + "logps/rejected": -568.662841796875, + "loss": 0.5129, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5739201903343201, + "rewards/margins": 0.6804107427597046, + "rewards/rejected": -1.254331111907959, + "step": 1550 + }, + { + "epoch": 1.69, + "grad_norm": 2.9881390854389966, + "learning_rate": 4.158723210436216e-07, + "logits/chosen": 3.7290759086608887, + "logits/rejected": 3.7448055744171143, + "logps/chosen": -617.1058959960938, + "logps/rejected": -557.6389770507812, + "loss": 0.5065, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.5363360643386841, + "rewards/margins": 0.7953433394432068, + "rewards/rejected": -1.3316794633865356, + "step": 1560 + }, + { + "epoch": 1.7, + "grad_norm": 3.027069210987796, + "learning_rate": 4.147865690733834e-07, + "logits/chosen": 3.477890729904175, + "logits/rejected": 3.589928388595581, + "logps/chosen": -560.6275024414062, + "logps/rejected": -495.3143005371094, + "loss": 0.5049, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6722136735916138, + "rewards/margins": 0.5960069894790649, + "rewards/rejected": -1.2682207822799683, + "step": 1570 + }, + { + "epoch": 1.72, + "grad_norm": 3.4567442624175686, + "learning_rate": 4.1369529301289923e-07, + "logits/chosen": 3.4488158226013184, + "logits/rejected": 3.533224582672119, + "logps/chosen": -608.8463134765625, + "logps/rejected": -540.799560546875, + "loss": 0.4991, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.653535008430481, + "rewards/margins": 0.7377526164054871, + "rewards/rejected": -1.3912876844406128, + "step": 1580 + }, + { + "epoch": 1.73, + "grad_norm": 3.173424614233903, + "learning_rate": 4.12598529444686e-07, + "logits/chosen": 3.5706634521484375, + "logits/rejected": 3.687189817428589, + "logps/chosen": -646.652099609375, + "logps/rejected": -616.9104614257812, + "loss": 0.5081, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5312900543212891, + "rewards/margins": 0.8549784421920776, + "rewards/rejected": -1.3862684965133667, + "step": 1590 + }, + { + "epoch": 1.74, + "grad_norm": 2.9125334834126515, + "learning_rate": 4.114963151352166e-07, + "logits/chosen": 3.7563605308532715, + "logits/rejected": 3.8214287757873535, + "logps/chosen": -587.04833984375, + "logps/rejected": -572.2859497070312, + "loss": 0.5072, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.5536529421806335, + "rewards/margins": 0.6972201466560364, + "rewards/rejected": -1.2508732080459595, + "step": 1600 + }, + { + "epoch": 1.74, + "eval_logits/chosen": 3.661421775817871, + "eval_logits/rejected": 3.7790794372558594, + "eval_logps/chosen": -593.063720703125, + "eval_logps/rejected": -545.2913818359375, + "eval_loss": 0.5138276815414429, + "eval_rewards/accuracies": 0.7797619104385376, + "eval_rewards/chosen": -0.4972679913043976, + "eval_rewards/margins": 0.71490079164505, + "eval_rewards/rejected": -1.2121686935424805, + "eval_runtime": 203.2579, + "eval_samples_per_second": 9.84, + "eval_steps_per_second": 0.31, + "step": 1600 + }, + { + "epoch": 1.75, + "grad_norm": 2.8349589434640756, + "learning_rate": 4.103886870336875e-07, + "logits/chosen": 3.5567550659179688, + "logits/rejected": 3.544466733932495, + "logps/chosen": -576.1727905273438, + "logps/rejected": -553.1214599609375, + "loss": 0.5018, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6350575089454651, + "rewards/margins": 0.660717785358429, + "rewards/rejected": -1.2957751750946045, + "step": 1610 + }, + { + "epoch": 1.76, + "grad_norm": 4.003556107005901, + "learning_rate": 4.0927568227078016e-07, + "logits/chosen": 3.5722336769104004, + "logits/rejected": 3.902299404144287, + "logps/chosen": -658.9724731445312, + "logps/rejected": -556.3914794921875, + "loss": 0.501, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.4931603968143463, + "rewards/margins": 0.7726175785064697, + "rewards/rejected": -1.2657779455184937, + "step": 1620 + }, + { + "epoch": 1.77, + "grad_norm": 3.6688857060794633, + "learning_rate": 4.0815733815741594e-07, + "logits/chosen": 3.5408577919006348, + "logits/rejected": 3.70662260055542, + "logps/chosen": -575.60595703125, + "logps/rejected": -498.02947998046875, + "loss": 0.4935, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.607037365436554, + "rewards/margins": 0.7332677245140076, + "rewards/rejected": -1.3403050899505615, + "step": 1630 + }, + { + "epoch": 1.78, + "grad_norm": 3.335350540464027, + "learning_rate": 4.0703369218350605e-07, + "logits/chosen": 3.5745227336883545, + "logits/rejected": 3.697269916534424, + "logps/chosen": -579.6307983398438, + "logps/rejected": -512.6817016601562, + "loss": 0.4888, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.5564176440238953, + "rewards/margins": 0.793745219707489, + "rewards/rejected": -1.3501628637313843, + "step": 1640 + }, + { + "epoch": 1.79, + "grad_norm": 3.106413437721439, + "learning_rate": 4.0590478201669405e-07, + "logits/chosen": 3.532491683959961, + "logits/rejected": 3.675520658493042, + "logps/chosen": -570.85595703125, + "logps/rejected": -524.6241455078125, + "loss": 0.4793, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4330361485481262, + "rewards/margins": 0.7855597734451294, + "rewards/rejected": -1.2185958623886108, + "step": 1650 + }, + { + "epoch": 1.8, + "grad_norm": 3.2501242606500087, + "learning_rate": 4.047706455010936e-07, + "logits/chosen": 3.5161030292510986, + "logits/rejected": 3.4709973335266113, + "logps/chosen": -604.4154663085938, + "logps/rejected": -549.8175659179688, + "loss": 0.5079, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6928753852844238, + "rewards/margins": 0.7038476467132568, + "rewards/rejected": -1.3967231512069702, + "step": 1660 + }, + { + "epoch": 1.81, + "grad_norm": 2.3245973701477403, + "learning_rate": 4.0363132065601955e-07, + "logits/chosen": 3.5682873725891113, + "logits/rejected": 3.549381971359253, + "logps/chosen": -601.1841430664062, + "logps/rejected": -526.7806396484375, + "loss": 0.494, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6761364936828613, + "rewards/margins": 0.595120370388031, + "rewards/rejected": -1.271256685256958, + "step": 1670 + }, + { + "epoch": 1.82, + "grad_norm": 3.1404482260870914, + "learning_rate": 4.024868456747137e-07, + "logits/chosen": 3.565006732940674, + "logits/rejected": 3.710141658782959, + "logps/chosen": -600.9898681640625, + "logps/rejected": -561.8255004882812, + "loss": 0.484, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.6002925634384155, + "rewards/margins": 0.7557550668716431, + "rewards/rejected": -1.3560476303100586, + "step": 1680 + }, + { + "epoch": 1.83, + "grad_norm": 3.3290402883292605, + "learning_rate": 4.0133725892306413e-07, + "logits/chosen": 3.5321547985076904, + "logits/rejected": 3.633854389190674, + "logps/chosen": -612.2471923828125, + "logps/rejected": -570.8936767578125, + "loss": 0.4932, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.5366383790969849, + "rewards/margins": 0.9583670496940613, + "rewards/rejected": -1.4950053691864014, + "step": 1690 + }, + { + "epoch": 1.85, + "grad_norm": 3.219385863270907, + "learning_rate": 4.001825989383194e-07, + "logits/chosen": 3.6114490032196045, + "logits/rejected": 3.8093953132629395, + "logps/chosen": -582.7833862304688, + "logps/rejected": -533.4085083007812, + "loss": 0.4908, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7001829147338867, + "rewards/margins": 0.7178817987442017, + "rewards/rejected": -1.4180647134780884, + "step": 1700 + }, + { + "epoch": 1.85, + "eval_logits/chosen": 3.669623374938965, + "eval_logits/rejected": 3.789337396621704, + "eval_logps/chosen": -598.1292114257812, + "eval_logps/rejected": -553.7918090820312, + "eval_loss": 0.507692277431488, + "eval_rewards/accuracies": 0.7797619104385376, + "eval_rewards/chosen": -0.547922670841217, + "eval_rewards/margins": 0.7492501139640808, + "eval_rewards/rejected": -1.2971727848052979, + "eval_runtime": 203.2134, + "eval_samples_per_second": 9.842, + "eval_steps_per_second": 0.31, + "step": 1700 + }, + { + "epoch": 1.86, + "grad_norm": 2.9809501048356926, + "learning_rate": 3.990229044277964e-07, + "logits/chosen": 3.6091561317443848, + "logits/rejected": 3.6311869621276855, + "logps/chosen": -575.5233154296875, + "logps/rejected": -522.2789306640625, + "loss": 0.496, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6322982907295227, + "rewards/margins": 0.7211301922798157, + "rewards/rejected": -1.3534284830093384, + "step": 1710 + }, + { + "epoch": 1.87, + "grad_norm": 3.8178999530593045, + "learning_rate": 3.97858214267583e-07, + "logits/chosen": 3.6715168952941895, + "logits/rejected": 3.739348888397217, + "logps/chosen": -579.3275146484375, + "logps/rejected": -527.568359375, + "loss": 0.4925, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.538645327091217, + "rewards/margins": 0.7014168500900269, + "rewards/rejected": -1.2400623559951782, + "step": 1720 + }, + { + "epoch": 1.88, + "grad_norm": 2.88366749396568, + "learning_rate": 3.966885675012348e-07, + "logits/chosen": 3.6033108234405518, + "logits/rejected": 3.6658871173858643, + "logps/chosen": -597.9328002929688, + "logps/rejected": -576.0477294921875, + "loss": 0.4854, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.6976242661476135, + "rewards/margins": 0.7374922037124634, + "rewards/rejected": -1.4351164102554321, + "step": 1730 + }, + { + "epoch": 1.89, + "grad_norm": 3.579737238340416, + "learning_rate": 3.9551400333846594e-07, + "logits/chosen": 3.4998416900634766, + "logits/rejected": 3.552649736404419, + "logps/chosen": -589.7621459960938, + "logps/rejected": -503.41656494140625, + "loss": 0.5067, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.5329033136367798, + "rewards/margins": 0.613740861415863, + "rewards/rejected": -1.146644115447998, + "step": 1740 + }, + { + "epoch": 1.9, + "grad_norm": 2.8824881047105753, + "learning_rate": 3.943345611538352e-07, + "logits/chosen": 3.6485018730163574, + "logits/rejected": 3.7200427055358887, + "logps/chosen": -619.30615234375, + "logps/rejected": -504.87335205078125, + "loss": 0.5088, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6067255139350891, + "rewards/margins": 0.5986908674240112, + "rewards/rejected": -1.2054163217544556, + "step": 1750 + }, + { + "epoch": 1.91, + "grad_norm": 4.378653722783705, + "learning_rate": 3.9315028048542564e-07, + "logits/chosen": 3.754995346069336, + "logits/rejected": 3.7115840911865234, + "logps/chosen": -550.1331176757812, + "logps/rejected": -486.9710998535156, + "loss": 0.5035, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4969119131565094, + "rewards/margins": 0.6981099247932434, + "rewards/rejected": -1.1950218677520752, + "step": 1760 + }, + { + "epoch": 1.92, + "grad_norm": 4.133688506753646, + "learning_rate": 3.9196120103351946e-07, + "logits/chosen": 3.7322287559509277, + "logits/rejected": 3.8301990032196045, + "logps/chosen": -561.39404296875, + "logps/rejected": -551.4658203125, + "loss": 0.4846, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.49270591139793396, + "rewards/margins": 0.8821004629135132, + "rewards/rejected": -1.3748066425323486, + "step": 1770 + }, + { + "epoch": 1.93, + "grad_norm": 3.604820904113862, + "learning_rate": 3.9076736265926704e-07, + "logits/chosen": 3.5433831214904785, + "logits/rejected": 3.632005214691162, + "logps/chosen": -612.3604736328125, + "logps/rejected": -566.1151123046875, + "loss": 0.4828, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.6337317228317261, + "rewards/margins": 0.7398926615715027, + "rewards/rejected": -1.3736244440078735, + "step": 1780 + }, + { + "epoch": 1.94, + "grad_norm": 2.7111165769919032, + "learning_rate": 3.8956880538335046e-07, + "logits/chosen": 3.6864840984344482, + "logits/rejected": 3.859823226928711, + "logps/chosen": -645.093994140625, + "logps/rejected": -595.4602661132812, + "loss": 0.4817, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6062914729118347, + "rewards/margins": 0.8517727851867676, + "rewards/rejected": -1.458064317703247, + "step": 1790 + }, + { + "epoch": 1.95, + "grad_norm": 3.2623811449746842, + "learning_rate": 3.883655693846425e-07, + "logits/chosen": 3.588616132736206, + "logits/rejected": 3.5179474353790283, + "logps/chosen": -559.8778076171875, + "logps/rejected": -575.5091552734375, + "loss": 0.5109, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7374631762504578, + "rewards/margins": 0.7547104358673096, + "rewards/rejected": -1.4921735525131226, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_logits/chosen": 3.65556263923645, + "eval_logits/rejected": 3.767853260040283, + "eval_logps/chosen": -604.908935546875, + "eval_logps/rejected": -563.373291015625, + "eval_loss": 0.5067973732948303, + "eval_rewards/accuracies": 0.7757936716079712, + "eval_rewards/chosen": -0.6157205700874329, + "eval_rewards/margins": 0.7772676944732666, + "eval_rewards/rejected": -1.3929883241653442, + "eval_runtime": 202.9487, + "eval_samples_per_second": 9.855, + "eval_steps_per_second": 0.31, + "step": 1800 + }, + { + "epoch": 1.97, + "grad_norm": 3.523477549593873, + "learning_rate": 3.87157694998859e-07, + "logits/chosen": 3.5332775115966797, + "logits/rejected": 3.5374343395233154, + "logps/chosen": -559.4539794921875, + "logps/rejected": -560.0755615234375, + "loss": 0.4717, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7071051597595215, + "rewards/margins": 0.756817102432251, + "rewards/rejected": -1.463922142982483, + "step": 1810 + }, + { + "epoch": 1.98, + "grad_norm": 3.8792408245686443, + "learning_rate": 3.8594522271720706e-07, + "logits/chosen": 3.4940123558044434, + "logits/rejected": 3.6339797973632812, + "logps/chosen": -605.3863525390625, + "logps/rejected": -501.80224609375, + "loss": 0.4975, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7022968530654907, + "rewards/margins": 0.7580503225326538, + "rewards/rejected": -1.4603471755981445, + "step": 1820 + }, + { + "epoch": 1.99, + "grad_norm": 3.5373167381585247, + "learning_rate": 3.8472819318502804e-07, + "logits/chosen": 3.5649352073669434, + "logits/rejected": 3.7245540618896484, + "logps/chosen": -655.5385131835938, + "logps/rejected": -578.20947265625, + "loss": 0.4745, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.6181583404541016, + "rewards/margins": 0.8856579065322876, + "rewards/rejected": -1.5038163661956787, + "step": 1830 + }, + { + "epoch": 2.0, + "grad_norm": 3.368152737374401, + "learning_rate": 3.83506647200434e-07, + "logits/chosen": 3.662012815475464, + "logits/rejected": 3.7260677814483643, + "logps/chosen": -599.5283813476562, + "logps/rejected": -570.4051513671875, + "loss": 0.4967, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.6230210065841675, + "rewards/margins": 0.7484488487243652, + "rewards/rejected": -1.3714698553085327, + "step": 1840 + }, + { + "epoch": 2.01, + "grad_norm": 3.769913619495626, + "learning_rate": 3.822806257129413e-07, + "logits/chosen": 3.5493359565734863, + "logits/rejected": 3.633047580718994, + "logps/chosen": -642.0574951171875, + "logps/rejected": -601.875, + "loss": 0.486, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.689389705657959, + "rewards/margins": 0.8089310526847839, + "rewards/rejected": -1.4983208179473877, + "step": 1850 + }, + { + "epoch": 2.02, + "grad_norm": 4.12546180658684, + "learning_rate": 3.810501698220967e-07, + "logits/chosen": 3.6838059425354004, + "logits/rejected": 3.700486660003662, + "logps/chosen": -598.7171630859375, + "logps/rejected": -605.1309814453125, + "loss": 0.4507, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.7325755953788757, + "rewards/margins": 0.9967571496963501, + "rewards/rejected": -1.729332685470581, + "step": 1860 + }, + { + "epoch": 2.03, + "grad_norm": 3.245909512853775, + "learning_rate": 3.7981532077610054e-07, + "logits/chosen": 3.6005501747131348, + "logits/rejected": 3.594416856765747, + "logps/chosen": -607.3951416015625, + "logps/rejected": -601.7537841796875, + "loss": 0.4989, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.7089284658432007, + "rewards/margins": 0.8154546022415161, + "rewards/rejected": -1.5243830680847168, + "step": 1870 + }, + { + "epoch": 2.04, + "grad_norm": 3.5832542745935796, + "learning_rate": 3.785761199704233e-07, + "logits/chosen": 3.505204439163208, + "logits/rejected": 3.4447197914123535, + "logps/chosen": -558.6715698242188, + "logps/rejected": -525.9684448242188, + "loss": 0.4618, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.7238224148750305, + "rewards/margins": 0.8947445154190063, + "rewards/rejected": -1.618566870689392, + "step": 1880 + }, + { + "epoch": 2.05, + "grad_norm": 3.2568125918545268, + "learning_rate": 3.773326089464184e-07, + "logits/chosen": 3.495814800262451, + "logits/rejected": 3.660496473312378, + "logps/chosen": -644.8048095703125, + "logps/rejected": -547.1638793945312, + "loss": 0.4767, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5708962678909302, + "rewards/margins": 0.7958296537399292, + "rewards/rejected": -1.3667261600494385, + "step": 1890 + }, + { + "epoch": 2.06, + "grad_norm": 3.1677742126172275, + "learning_rate": 3.7608482938992903e-07, + "logits/chosen": 3.419759750366211, + "logits/rejected": 3.55739164352417, + "logps/chosen": -620.032470703125, + "logps/rejected": -574.774169921875, + "loss": 0.4779, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.697594404220581, + "rewards/margins": 0.7900949716567993, + "rewards/rejected": -1.48768949508667, + "step": 1900 + }, + { + "epoch": 2.06, + "eval_logits/chosen": 3.606205940246582, + "eval_logits/rejected": 3.711778402328491, + "eval_logps/chosen": -605.808837890625, + "eval_logps/rejected": -565.767333984375, + "eval_loss": 0.50054931640625, + "eval_rewards/accuracies": 0.773809552192688, + "eval_rewards/chosen": -0.6247199773788452, + "eval_rewards/margins": 0.7922087907791138, + "eval_rewards/rejected": -1.416928768157959, + "eval_runtime": 202.8417, + "eval_samples_per_second": 9.86, + "eval_steps_per_second": 0.311, + "step": 1900 + }, + { + "epoch": 2.07, + "grad_norm": 3.2130345045394435, + "learning_rate": 3.7483282312989155e-07, + "logits/chosen": 3.6094250679016113, + "logits/rejected": 3.8729541301727295, + "logps/chosen": -580.1021728515625, + "logps/rejected": -531.3250732421875, + "loss": 0.4582, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.7259335517883301, + "rewards/margins": 0.8915154337882996, + "rewards/rejected": -1.617449164390564, + "step": 1910 + }, + { + "epoch": 2.08, + "grad_norm": 3.05589801382563, + "learning_rate": 3.735766321369325e-07, + "logits/chosen": 3.646768569946289, + "logits/rejected": 3.6266236305236816, + "logps/chosen": -636.1376953125, + "logps/rejected": -559.4783325195312, + "loss": 0.4532, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.670791506767273, + "rewards/margins": 0.8766641616821289, + "rewards/rejected": -1.5474556684494019, + "step": 1920 + }, + { + "epoch": 2.1, + "grad_norm": 3.335600613112421, + "learning_rate": 3.7231629852196214e-07, + "logits/chosen": 3.4616265296936035, + "logits/rejected": 3.530566692352295, + "logps/chosen": -620.8572998046875, + "logps/rejected": -556.129150390625, + "loss": 0.482, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.7699755430221558, + "rewards/margins": 0.9273629188537598, + "rewards/rejected": -1.697338342666626, + "step": 1930 + }, + { + "epoch": 2.11, + "grad_norm": 4.404120199762387, + "learning_rate": 3.710518645347626e-07, + "logits/chosen": 3.587388515472412, + "logits/rejected": 3.6373343467712402, + "logps/chosen": -658.3599853515625, + "logps/rejected": -621.992431640625, + "loss": 0.4645, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6640130877494812, + "rewards/margins": 0.8948653340339661, + "rewards/rejected": -1.5588784217834473, + "step": 1940 + }, + { + "epoch": 2.12, + "grad_norm": 3.1491212381401272, + "learning_rate": 3.697833725625713e-07, + "logits/chosen": 3.4838454723358154, + "logits/rejected": 3.38386869430542, + "logps/chosen": -590.5750122070312, + "logps/rejected": -557.3521118164062, + "loss": 0.4744, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7175976037979126, + "rewards/margins": 0.7821000218391418, + "rewards/rejected": -1.4996975660324097, + "step": 1950 + }, + { + "epoch": 2.13, + "grad_norm": 3.6511312470131876, + "learning_rate": 3.685108651286605e-07, + "logits/chosen": 3.554466962814331, + "logits/rejected": 3.6436378955841064, + "logps/chosen": -611.2662353515625, + "logps/rejected": -538.0736083984375, + "loss": 0.4749, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5578540563583374, + "rewards/margins": 0.7443768382072449, + "rewards/rejected": -1.3022308349609375, + "step": 1960 + }, + { + "epoch": 2.14, + "grad_norm": 3.830642631650065, + "learning_rate": 3.672343848909116e-07, + "logits/chosen": 3.5016560554504395, + "logits/rejected": 3.5824344158172607, + "logps/chosen": -616.0037231445312, + "logps/rejected": -570.7191162109375, + "loss": 0.4756, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.7442587614059448, + "rewards/margins": 0.8685930371284485, + "rewards/rejected": -1.6128517389297485, + "step": 1970 + }, + { + "epoch": 2.15, + "grad_norm": 3.4247382055549207, + "learning_rate": 3.6595397464038484e-07, + "logits/chosen": 3.4326834678649902, + "logits/rejected": 3.503385066986084, + "logps/chosen": -631.0132446289062, + "logps/rejected": -558.8316650390625, + "loss": 0.4611, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5838363170623779, + "rewards/margins": 1.001306414604187, + "rewards/rejected": -1.585142731666565, + "step": 1980 + }, + { + "epoch": 2.16, + "grad_norm": 3.7535681296567898, + "learning_rate": 3.646696772998854e-07, + "logits/chosen": 3.455758571624756, + "logits/rejected": 3.6762351989746094, + "logps/chosen": -591.5477294921875, + "logps/rejected": -526.0713500976562, + "loss": 0.4858, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.8906404376029968, + "rewards/margins": 0.9278383255004883, + "rewards/rejected": -1.8184788227081299, + "step": 1990 + }, + { + "epoch": 2.17, + "grad_norm": 4.778420887307365, + "learning_rate": 3.6338153592252394e-07, + "logits/chosen": 3.4087131023406982, + "logits/rejected": 3.445772886276245, + "logps/chosen": -578.9254760742188, + "logps/rejected": -527.9382934570312, + "loss": 0.4833, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.7731282711029053, + "rewards/margins": 0.8957611918449402, + "rewards/rejected": -1.6688896417617798, + "step": 2000 + }, + { + "epoch": 2.17, + "eval_logits/chosen": 3.584850549697876, + "eval_logits/rejected": 3.67386531829834, + "eval_logps/chosen": -611.7432250976562, + "eval_logps/rejected": -574.3334350585938, + "eval_loss": 0.49917730689048767, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": -0.6840633749961853, + "eval_rewards/margins": 0.8185263872146606, + "eval_rewards/rejected": -1.5025897026062012, + "eval_runtime": 202.8219, + "eval_samples_per_second": 9.861, + "eval_steps_per_second": 0.311, + "step": 2000 + }, + { + "epoch": 2.18, + "grad_norm": 3.6987584600229515, + "learning_rate": 3.6208959369027377e-07, + "logits/chosen": 3.4759509563446045, + "logits/rejected": 3.583934783935547, + "logps/chosen": -629.9797973632812, + "logps/rejected": -546.8692016601562, + "loss": 0.4603, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7248755693435669, + "rewards/margins": 0.8174535632133484, + "rewards/rejected": -1.5423290729522705, + "step": 2010 + }, + { + "epoch": 2.19, + "grad_norm": 3.487846785466506, + "learning_rate": 3.60793893912523e-07, + "logits/chosen": 3.503138780593872, + "logits/rejected": 3.5944199562072754, + "logps/chosen": -598.5306396484375, + "logps/rejected": -539.2348022460938, + "loss": 0.4666, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7628095746040344, + "rewards/margins": 0.8675826191902161, + "rewards/rejected": -1.630392074584961, + "step": 2020 + }, + { + "epoch": 2.2, + "grad_norm": 3.651409060327047, + "learning_rate": 3.5949448002462293e-07, + "logits/chosen": 3.439924716949463, + "logits/rejected": 3.5722365379333496, + "logps/chosen": -631.615966796875, + "logps/rejected": -557.09326171875, + "loss": 0.4754, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.7125946283340454, + "rewards/margins": 0.9252546429634094, + "rewards/rejected": -1.63784921169281, + "step": 2030 + }, + { + "epoch": 2.21, + "grad_norm": 3.260618270999553, + "learning_rate": 3.581913955864317e-07, + "logits/chosen": 3.434553861618042, + "logits/rejected": 3.4933903217315674, + "logps/chosen": -606.2720947265625, + "logps/rejected": -597.4681396484375, + "loss": 0.4817, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.6869601607322693, + "rewards/margins": 1.0263012647628784, + "rewards/rejected": -1.713261365890503, + "step": 2040 + }, + { + "epoch": 2.23, + "grad_norm": 3.423883637886533, + "learning_rate": 3.5688468428085426e-07, + "logits/chosen": 3.5992112159729004, + "logits/rejected": 3.513864040374756, + "logps/chosen": -611.1129150390625, + "logps/rejected": -611.6151123046875, + "loss": 0.4575, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.754952073097229, + "rewards/margins": 0.8556115031242371, + "rewards/rejected": -1.6105636358261108, + "step": 2050 + }, + { + "epoch": 2.24, + "grad_norm": 3.5077824772903674, + "learning_rate": 3.555743899123779e-07, + "logits/chosen": 3.5471534729003906, + "logits/rejected": 3.6697468757629395, + "logps/chosen": -642.9610595703125, + "logps/rejected": -615.0989379882812, + "loss": 0.4732, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.7250288128852844, + "rewards/margins": 0.9433430433273315, + "rewards/rejected": -1.6683717966079712, + "step": 2060 + }, + { + "epoch": 2.25, + "grad_norm": 3.7861732801731605, + "learning_rate": 3.542605564056041e-07, + "logits/chosen": 3.5742199420928955, + "logits/rejected": 3.5410943031311035, + "logps/chosen": -655.3723754882812, + "logps/rejected": -642.4591674804688, + "loss": 0.4655, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.776167631149292, + "rewards/margins": 0.889909565448761, + "rewards/rejected": -1.6660772562026978, + "step": 2070 + }, + { + "epoch": 2.26, + "grad_norm": 5.129957847568381, + "learning_rate": 3.529432278037753e-07, + "logits/chosen": 3.491333484649658, + "logits/rejected": 3.507310152053833, + "logps/chosen": -594.2889404296875, + "logps/rejected": -602.4393310546875, + "loss": 0.4889, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9160470962524414, + "rewards/margins": 0.891708254814148, + "rewards/rejected": -1.807755470275879, + "step": 2080 + }, + { + "epoch": 2.27, + "grad_norm": 3.846187409152628, + "learning_rate": 3.5162244826729947e-07, + "logits/chosen": 3.289222002029419, + "logits/rejected": 3.3087544441223145, + "logps/chosen": -578.4583129882812, + "logps/rejected": -576.5350341796875, + "loss": 0.4732, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9932931065559387, + "rewards/margins": 0.8519379496574402, + "rewards/rejected": -1.845231056213379, + "step": 2090 + }, + { + "epoch": 2.28, + "grad_norm": 4.357246431190218, + "learning_rate": 3.502982620722688e-07, + "logits/chosen": 3.3593249320983887, + "logits/rejected": 3.3827052116394043, + "logps/chosen": -671.4129028320312, + "logps/rejected": -633.4642333984375, + "loss": 0.4879, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1302413940429688, + "rewards/margins": 0.95428466796875, + "rewards/rejected": -2.0845260620117188, + "step": 2100 + }, + { + "epoch": 2.28, + "eval_logits/chosen": 3.5029900074005127, + "eval_logits/rejected": 3.5692081451416016, + "eval_logps/chosen": -624.6126708984375, + "eval_logps/rejected": -590.6145629882812, + "eval_loss": 0.49671605229377747, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": -0.812757670879364, + "eval_rewards/margins": 0.8526439070701599, + "eval_rewards/rejected": -1.6654013395309448, + "eval_runtime": 203.9786, + "eval_samples_per_second": 9.805, + "eval_steps_per_second": 0.309, + "step": 2100 + }, + { + "epoch": 2.29, + "grad_norm": 3.471910343991828, + "learning_rate": 3.489707136089762e-07, + "logits/chosen": 3.3395347595214844, + "logits/rejected": 3.3745200634002686, + "logps/chosen": -571.3670654296875, + "logps/rejected": -526.6174926757812, + "loss": 0.4372, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9002982974052429, + "rewards/margins": 0.8082368969917297, + "rewards/rejected": -1.7085349559783936, + "step": 2110 + }, + { + "epoch": 2.3, + "grad_norm": 3.5446869217682573, + "learning_rate": 3.4763984738042667e-07, + "logits/chosen": 3.3679275512695312, + "logits/rejected": 3.5208847522735596, + "logps/chosen": -645.79052734375, + "logps/rejected": -559.8326416015625, + "loss": 0.4769, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.9663489460945129, + "rewards/margins": 0.8492861986160278, + "rewards/rejected": -1.815635323524475, + "step": 2120 + }, + { + "epoch": 2.31, + "grad_norm": 3.6584533129832693, + "learning_rate": 3.4630570800084563e-07, + "logits/chosen": 3.5914466381073, + "logits/rejected": 3.5731983184814453, + "logps/chosen": -645.3328857421875, + "logps/rejected": -612.6683349609375, + "loss": 0.481, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9558261632919312, + "rewards/margins": 0.834132969379425, + "rewards/rejected": -1.7899593114852905, + "step": 2130 + }, + { + "epoch": 2.32, + "grad_norm": 3.2014609575217823, + "learning_rate": 3.449683401941836e-07, + "logits/chosen": 3.53631329536438, + "logits/rejected": 3.6271331310272217, + "logps/chosen": -709.5948486328125, + "logps/rejected": -633.9876708984375, + "loss": 0.4593, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5863145589828491, + "rewards/margins": 0.8807814717292786, + "rewards/rejected": -1.467095971107483, + "step": 2140 + }, + { + "epoch": 2.33, + "grad_norm": 3.488531245410909, + "learning_rate": 3.4362778879261636e-07, + "logits/chosen": 3.443824291229248, + "logits/rejected": 3.513853073120117, + "logps/chosen": -635.8612670898438, + "logps/rejected": -578.5512084960938, + "loss": 0.4394, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8079124689102173, + "rewards/margins": 0.9066284894943237, + "rewards/rejected": -1.7145410776138306, + "step": 2150 + }, + { + "epoch": 2.35, + "grad_norm": 4.1324974690904135, + "learning_rate": 3.422840987350426e-07, + "logits/chosen": 3.401771068572998, + "logits/rejected": 3.5432567596435547, + "logps/chosen": -624.2410888671875, + "logps/rejected": -544.371337890625, + "loss": 0.4629, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.8615863919258118, + "rewards/margins": 0.9290858507156372, + "rewards/rejected": -1.7906723022460938, + "step": 2160 + }, + { + "epoch": 2.36, + "grad_norm": 3.4186991421500146, + "learning_rate": 3.409373150655771e-07, + "logits/chosen": 3.565824508666992, + "logits/rejected": 3.754411220550537, + "logps/chosen": -633.1618041992188, + "logps/rejected": -544.0753173828125, + "loss": 0.4783, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.7366576790809631, + "rewards/margins": 0.8337681889533997, + "rewards/rejected": -1.5704257488250732, + "step": 2170 + }, + { + "epoch": 2.37, + "grad_norm": 3.844335021845772, + "learning_rate": 3.39587482932041e-07, + "logits/chosen": 3.404428482055664, + "logits/rejected": 3.5384204387664795, + "logps/chosen": -638.5438842773438, + "logps/rejected": -581.6624755859375, + "loss": 0.4639, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.7160848379135132, + "rewards/margins": 0.9811609387397766, + "rewards/rejected": -1.6972458362579346, + "step": 2180 + }, + { + "epoch": 2.38, + "grad_norm": 4.581285458578208, + "learning_rate": 3.38234647584448e-07, + "logits/chosen": 3.472095012664795, + "logits/rejected": 3.5105583667755127, + "logps/chosen": -580.3779296875, + "logps/rejected": -542.3770141601562, + "loss": 0.4723, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8190044164657593, + "rewards/margins": 0.8088991045951843, + "rewards/rejected": -1.627903699874878, + "step": 2190 + }, + { + "epoch": 2.39, + "grad_norm": 3.414699724215474, + "learning_rate": 3.3687885437348786e-07, + "logits/chosen": 3.4263916015625, + "logits/rejected": 3.499289035797119, + "logps/chosen": -593.12646484375, + "logps/rejected": -599.123779296875, + "loss": 0.4645, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.7351945638656616, + "rewards/margins": 0.959613025188446, + "rewards/rejected": -1.6948076486587524, + "step": 2200 + }, + { + "epoch": 2.39, + "eval_logits/chosen": 3.577197790145874, + "eval_logits/rejected": 3.6647300720214844, + "eval_logps/chosen": -613.0288696289062, + "eval_logps/rejected": -577.7230224609375, + "eval_loss": 0.4926875829696655, + "eval_rewards/accuracies": 0.7857142686843872, + "eval_rewards/chosen": -0.6969201564788818, + "eval_rewards/margins": 0.8395654559135437, + "eval_rewards/rejected": -1.5364856719970703, + "eval_runtime": 202.831, + "eval_samples_per_second": 9.86, + "eval_steps_per_second": 0.311, + "step": 2200 + }, + { + "epoch": 2.4, + "grad_norm": 3.8557727493229077, + "learning_rate": 3.355201487490056e-07, + "logits/chosen": 3.489133834838867, + "logits/rejected": 3.4847042560577393, + "logps/chosen": -631.4229736328125, + "logps/rejected": -570.99365234375, + "loss": 0.46, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7371636629104614, + "rewards/margins": 0.8246687650680542, + "rewards/rejected": -1.5618324279785156, + "step": 2210 + }, + { + "epoch": 2.41, + "grad_norm": 3.582563306354732, + "learning_rate": 3.3415857625847834e-07, + "logits/chosen": 3.494647264480591, + "logits/rejected": 3.5302734375, + "logps/chosen": -571.177978515625, + "logps/rejected": -532.7431640625, + "loss": 0.4726, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7189729809761047, + "rewards/margins": 0.9511632919311523, + "rewards/rejected": -1.6701362133026123, + "step": 2220 + }, + { + "epoch": 2.42, + "grad_norm": 3.500411233095585, + "learning_rate": 3.327941825454884e-07, + "logits/chosen": 3.5814578533172607, + "logits/rejected": 3.530216932296753, + "logps/chosen": -650.6365356445312, + "logps/rejected": -653.82958984375, + "loss": 0.4643, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.5866572856903076, + "rewards/margins": 1.002760648727417, + "rewards/rejected": -1.5894181728363037, + "step": 2230 + }, + { + "epoch": 2.43, + "grad_norm": 4.330062226180967, + "learning_rate": 3.31427013348193e-07, + "logits/chosen": 3.399160385131836, + "logits/rejected": 3.422670364379883, + "logps/chosen": -609.5322875976562, + "logps/rejected": -614.65087890625, + "loss": 0.4885, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6684264540672302, + "rewards/margins": 0.9222012758255005, + "rewards/rejected": -1.5906277894973755, + "step": 2240 + }, + { + "epoch": 2.44, + "grad_norm": 3.2509909424020416, + "learning_rate": 3.3005711449779104e-07, + "logits/chosen": 3.4792771339416504, + "logits/rejected": 3.65201997756958, + "logps/chosen": -677.3572387695312, + "logps/rejected": -588.9779052734375, + "loss": 0.4344, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6901603937149048, + "rewards/margins": 0.8743443489074707, + "rewards/rejected": -1.564504861831665, + "step": 2250 + }, + { + "epoch": 2.45, + "grad_norm": 3.5718986172017377, + "learning_rate": 3.2868453191698667e-07, + "logits/chosen": 3.412238359451294, + "logits/rejected": 3.496849775314331, + "logps/chosen": -640.611083984375, + "logps/rejected": -567.2330322265625, + "loss": 0.4783, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8837859034538269, + "rewards/margins": 0.8499320149421692, + "rewards/rejected": -1.733717918395996, + "step": 2260 + }, + { + "epoch": 2.46, + "grad_norm": 2.8011325615379365, + "learning_rate": 3.2730931161845023e-07, + "logits/chosen": 3.430738925933838, + "logits/rejected": 3.5010883808135986, + "logps/chosen": -609.8501586914062, + "logps/rejected": -563.63037109375, + "loss": 0.4533, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.8577713966369629, + "rewards/margins": 0.9224198460578918, + "rewards/rejected": -1.7801910638809204, + "step": 2270 + }, + { + "epoch": 2.48, + "grad_norm": 3.209654315982721, + "learning_rate": 3.2593149970327514e-07, + "logits/chosen": 3.2371573448181152, + "logits/rejected": 3.2815635204315186, + "logps/chosen": -643.8148193359375, + "logps/rejected": -635.4369506835938, + "loss": 0.4541, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.0217785835266113, + "rewards/margins": 0.9184284210205078, + "rewards/rejected": -1.9402072429656982, + "step": 2280 + }, + { + "epoch": 2.49, + "grad_norm": 3.5920981527061016, + "learning_rate": 3.245511423594329e-07, + "logits/chosen": 3.521888017654419, + "logits/rejected": 3.4993069171905518, + "logps/chosen": -667.6837768554688, + "logps/rejected": -641.0929565429688, + "loss": 0.483, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.7637149095535278, + "rewards/margins": 0.8890408277511597, + "rewards/rejected": -1.6527557373046875, + "step": 2290 + }, + { + "epoch": 2.5, + "grad_norm": 3.7088867137960184, + "learning_rate": 3.231682858602249e-07, + "logits/chosen": 3.401064395904541, + "logits/rejected": 3.3754830360412598, + "logps/chosen": -633.728515625, + "logps/rejected": -592.6007080078125, + "loss": 0.4587, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.7475706338882446, + "rewards/margins": 1.0294914245605469, + "rewards/rejected": -1.7770618200302124, + "step": 2300 + }, + { + "epoch": 2.5, + "eval_logits/chosen": 3.5790069103240967, + "eval_logits/rejected": 3.6614677906036377, + "eval_logps/chosen": -603.5742797851562, + "eval_logps/rejected": -569.4067993164062, + "eval_loss": 0.4936090111732483, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -0.6023736596107483, + "eval_rewards/margins": 0.8509496450424194, + "eval_rewards/rejected": -1.4533233642578125, + "eval_runtime": 202.8172, + "eval_samples_per_second": 9.861, + "eval_steps_per_second": 0.311, + "step": 2300 + }, + { + "epoch": 2.51, + "grad_norm": 3.48598402062857, + "learning_rate": 3.217829765627304e-07, + "logits/chosen": 3.4837124347686768, + "logits/rejected": 3.496872663497925, + "logps/chosen": -587.3186645507812, + "logps/rejected": -550.0616455078125, + "loss": 0.4546, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.8458267450332642, + "rewards/margins": 0.837451159954071, + "rewards/rejected": -1.6832778453826904, + "step": 2310 + }, + { + "epoch": 2.52, + "grad_norm": 3.3428599640744388, + "learning_rate": 3.203952609062537e-07, + "logits/chosen": 3.560152769088745, + "logits/rejected": 3.5849337577819824, + "logps/chosen": -558.6005249023438, + "logps/rejected": -551.5780029296875, + "loss": 0.4638, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.5677013993263245, + "rewards/margins": 0.9342495799064636, + "rewards/rejected": -1.5019508600234985, + "step": 2320 + }, + { + "epoch": 2.53, + "grad_norm": 3.0701836045771067, + "learning_rate": 3.1900518541076625e-07, + "logits/chosen": 3.492565870285034, + "logits/rejected": 3.5176196098327637, + "logps/chosen": -588.5086669921875, + "logps/rejected": -570.9232177734375, + "loss": 0.4737, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6482713222503662, + "rewards/margins": 0.9163777232170105, + "rewards/rejected": -1.564648985862732, + "step": 2330 + }, + { + "epoch": 2.54, + "grad_norm": 4.270506129920829, + "learning_rate": 3.17612796675348e-07, + "logits/chosen": 3.344141721725464, + "logits/rejected": 3.400237560272217, + "logps/chosen": -656.4193725585938, + "logps/rejected": -600.7810668945312, + "loss": 0.4652, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.793533444404602, + "rewards/margins": 0.9670297503471375, + "rewards/rejected": -1.7605631351470947, + "step": 2340 + }, + { + "epoch": 2.55, + "grad_norm": 5.4624457929603745, + "learning_rate": 3.1621814137662477e-07, + "logits/chosen": 3.4089431762695312, + "logits/rejected": 3.561499834060669, + "logps/chosen": -656.7151489257812, + "logps/rejected": -571.5653686523438, + "loss": 0.4781, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.7137613296508789, + "rewards/margins": 1.0217043161392212, + "rewards/rejected": -1.7354657649993896, + "step": 2350 + }, + { + "epoch": 2.56, + "grad_norm": 3.8692615319841246, + "learning_rate": 3.148212662672038e-07, + "logits/chosen": 3.565295457839966, + "logits/rejected": 3.513939619064331, + "logps/chosen": -589.4000244140625, + "logps/rejected": -578.9471435546875, + "loss": 0.4655, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.663978636264801, + "rewards/margins": 1.0130146741867065, + "rewards/rejected": -1.6769931316375732, + "step": 2360 + }, + { + "epoch": 2.57, + "grad_norm": 3.745757139606091, + "learning_rate": 3.1342221817410615e-07, + "logits/chosen": 3.4632372856140137, + "logits/rejected": 3.605541944503784, + "logps/chosen": -666.013671875, + "logps/rejected": -610.5642700195312, + "loss": 0.4481, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7232716083526611, + "rewards/margins": 0.9818176031112671, + "rewards/rejected": -1.7050892114639282, + "step": 2370 + }, + { + "epoch": 2.58, + "grad_norm": 4.735659043960875, + "learning_rate": 3.120210439971974e-07, + "logits/chosen": 3.443312168121338, + "logits/rejected": 3.4171016216278076, + "logps/chosen": -605.1135864257812, + "logps/rejected": -626.380615234375, + "loss": 0.4638, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0124810934066772, + "rewards/margins": 0.91313636302948, + "rewards/rejected": -1.9256175756454468, + "step": 2380 + }, + { + "epoch": 2.6, + "grad_norm": 4.523496777396923, + "learning_rate": 3.1061779070761523e-07, + "logits/chosen": 3.4363415241241455, + "logits/rejected": 3.4582061767578125, + "logps/chosen": -649.8582763671875, + "logps/rejected": -642.29833984375, + "loss": 0.4333, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.7597558498382568, + "rewards/margins": 1.084705114364624, + "rewards/rejected": -1.8444608449935913, + "step": 2390 + }, + { + "epoch": 2.61, + "grad_norm": 4.200268270134274, + "learning_rate": 3.0921250534619447e-07, + "logits/chosen": 3.296509265899658, + "logits/rejected": 3.377551317214966, + "logps/chosen": -683.1427001953125, + "logps/rejected": -585.7415161132812, + "loss": 0.437, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7686322331428528, + "rewards/margins": 1.0582411289215088, + "rewards/rejected": -1.8268734216690063, + "step": 2400 + }, + { + "epoch": 2.61, + "eval_logits/chosen": 3.4342610836029053, + "eval_logits/rejected": 3.4902870655059814, + "eval_logps/chosen": -631.598388671875, + "eval_logps/rejected": -601.3099365234375, + "eval_loss": 0.4921112656593323, + "eval_rewards/accuracies": 0.773809552192688, + "eval_rewards/chosen": -0.8826150298118591, + "eval_rewards/margins": 0.8897396922111511, + "eval_rewards/rejected": -1.7723547220230103, + "eval_runtime": 203.1843, + "eval_samples_per_second": 9.843, + "eval_steps_per_second": 0.31, + "step": 2400 + }, + { + "epoch": 2.62, + "grad_norm": 6.363573204461412, + "learning_rate": 3.0780523502189075e-07, + "logits/chosen": 3.3436951637268066, + "logits/rejected": 3.3255093097686768, + "logps/chosen": -615.0836181640625, + "logps/rejected": -604.822265625, + "loss": 0.4602, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9606714248657227, + "rewards/margins": 1.0685679912567139, + "rewards/rejected": -2.0292391777038574, + "step": 2410 + }, + { + "epoch": 2.63, + "grad_norm": 3.203187461754593, + "learning_rate": 3.0639602691020093e-07, + "logits/chosen": 3.455970287322998, + "logits/rejected": 3.450157642364502, + "logps/chosen": -680.4078369140625, + "logps/rejected": -633.8014526367188, + "loss": 0.46, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.7696986794471741, + "rewards/margins": 1.0905230045318604, + "rewards/rejected": -1.8602216243743896, + "step": 2420 + }, + { + "epoch": 2.64, + "grad_norm": 4.14139711172661, + "learning_rate": 3.0498492825158176e-07, + "logits/chosen": 3.320502519607544, + "logits/rejected": 3.3242201805114746, + "logps/chosen": -566.0848388671875, + "logps/rejected": -572.5762329101562, + "loss": 0.4654, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9355268478393555, + "rewards/margins": 0.9019051790237427, + "rewards/rejected": -1.8374321460723877, + "step": 2430 + }, + { + "epoch": 2.65, + "grad_norm": 2.8171823269826555, + "learning_rate": 3.0357198634986613e-07, + "logits/chosen": 3.4914822578430176, + "logits/rejected": 3.622373580932617, + "logps/chosen": -582.17578125, + "logps/rejected": -543.8931274414062, + "loss": 0.4891, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6903704404830933, + "rewards/margins": 0.7755452990531921, + "rewards/rejected": -1.4659157991409302, + "step": 2440 + }, + { + "epoch": 2.66, + "grad_norm": 3.50063011012773, + "learning_rate": 3.0215724857067757e-07, + "logits/chosen": 3.379225492477417, + "logits/rejected": 3.545624256134033, + "logps/chosen": -651.8958129882812, + "logps/rejected": -535.3997802734375, + "loss": 0.4437, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5937511324882507, + "rewards/margins": 1.0226200819015503, + "rewards/rejected": -1.6163713932037354, + "step": 2450 + }, + { + "epoch": 2.67, + "grad_norm": 3.430383815773576, + "learning_rate": 3.007407623398421e-07, + "logits/chosen": 3.4755587577819824, + "logits/rejected": 3.6219284534454346, + "logps/chosen": -599.4044189453125, + "logps/rejected": -561.2335205078125, + "loss": 0.4602, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.6869359016418457, + "rewards/margins": 0.9378687143325806, + "rewards/rejected": -1.6248044967651367, + "step": 2460 + }, + { + "epoch": 2.68, + "grad_norm": 3.419580465406847, + "learning_rate": 2.9932257514179854e-07, + "logits/chosen": 3.4234156608581543, + "logits/rejected": 3.3968899250030518, + "logps/chosen": -549.8673095703125, + "logps/rejected": -551.0244750976562, + "loss": 0.4478, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8541863560676575, + "rewards/margins": 0.8430056571960449, + "rewards/rejected": -1.6971919536590576, + "step": 2470 + }, + { + "epoch": 2.69, + "grad_norm": 3.4971290852379857, + "learning_rate": 2.97902734518007e-07, + "logits/chosen": 3.5135090351104736, + "logits/rejected": 3.6605193614959717, + "logps/chosen": -658.0453491210938, + "logps/rejected": -585.8455200195312, + "loss": 0.4364, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6410279273986816, + "rewards/margins": 1.001507043838501, + "rewards/rejected": -1.642534852027893, + "step": 2480 + }, + { + "epoch": 2.7, + "grad_norm": 3.4235010827410073, + "learning_rate": 2.9648128806535445e-07, + "logits/chosen": 3.4521877765655518, + "logits/rejected": 3.495452880859375, + "logps/chosen": -621.9888916015625, + "logps/rejected": -625.4398193359375, + "loss": 0.4703, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.6590943336486816, + "rewards/margins": 1.0803091526031494, + "rewards/rejected": -1.739403486251831, + "step": 2490 + }, + { + "epoch": 2.71, + "grad_norm": 5.415917141603628, + "learning_rate": 2.9505828343456005e-07, + "logits/chosen": 3.4482123851776123, + "logits/rejected": 3.561087131500244, + "logps/chosen": -622.19482421875, + "logps/rejected": -625.0249633789062, + "loss": 0.4204, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -0.8436199426651001, + "rewards/margins": 1.1915690898895264, + "rewards/rejected": -2.035189151763916, + "step": 2500 + }, + { + "epoch": 2.71, + "eval_logits/chosen": 3.4803626537323, + "eval_logits/rejected": 3.5446834564208984, + "eval_logps/chosen": -626.717529296875, + "eval_logps/rejected": -597.4497680664062, + "eval_loss": 0.4889599084854126, + "eval_rewards/accuracies": 0.7757936716079712, + "eval_rewards/chosen": -0.8338061571121216, + "eval_rewards/margins": 0.8999470472335815, + "eval_rewards/rejected": -1.7337533235549927, + "eval_runtime": 203.1467, + "eval_samples_per_second": 9.845, + "eval_steps_per_second": 0.31, + "step": 2500 + }, + { + "epoch": 2.73, + "grad_norm": 3.435022858379588, + "learning_rate": 2.936337683285768e-07, + "logits/chosen": 3.328322649002075, + "logits/rejected": 3.357412338256836, + "logps/chosen": -610.0457763671875, + "logps/rejected": -569.2310180664062, + "loss": 0.4554, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9784911870956421, + "rewards/margins": 0.7509005069732666, + "rewards/rejected": -1.7293916940689087, + "step": 2510 + }, + { + "epoch": 2.74, + "grad_norm": 3.5359479752810667, + "learning_rate": 2.9220779050099344e-07, + "logits/chosen": 3.3794853687286377, + "logits/rejected": 3.358579158782959, + "logps/chosen": -581.7650146484375, + "logps/rejected": -571.266357421875, + "loss": 0.4446, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9221858978271484, + "rewards/margins": 1.0501763820648193, + "rewards/rejected": -1.9723621606826782, + "step": 2520 + }, + { + "epoch": 2.75, + "grad_norm": 3.5952662863841818, + "learning_rate": 2.9078039775443247e-07, + "logits/chosen": 3.4976468086242676, + "logits/rejected": 3.4828574657440186, + "logps/chosen": -619.1820678710938, + "logps/rejected": -594.2859497070312, + "loss": 0.4535, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.9900520443916321, + "rewards/margins": 0.9431132078170776, + "rewards/rejected": -1.9331653118133545, + "step": 2530 + }, + { + "epoch": 2.76, + "grad_norm": 4.430475128483214, + "learning_rate": 2.893516379389489e-07, + "logits/chosen": 3.391080141067505, + "logits/rejected": 3.426509380340576, + "logps/chosen": -693.9683837890625, + "logps/rejected": -606.825927734375, + "loss": 0.4668, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.917786717414856, + "rewards/margins": 1.123294711112976, + "rewards/rejected": -2.041081666946411, + "step": 2540 + }, + { + "epoch": 2.77, + "grad_norm": 4.42604200822975, + "learning_rate": 2.879215589504252e-07, + "logits/chosen": 3.4469542503356934, + "logits/rejected": 3.3772597312927246, + "logps/chosen": -626.1776123046875, + "logps/rejected": -585.4291381835938, + "loss": 0.4767, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.6976214647293091, + "rewards/margins": 0.9503000974655151, + "rewards/rejected": -1.6479215621948242, + "step": 2550 + }, + { + "epoch": 2.78, + "grad_norm": 3.492132180536463, + "learning_rate": 2.8649020872896606e-07, + "logits/chosen": 3.2876811027526855, + "logits/rejected": 3.3829731941223145, + "logps/chosen": -637.89892578125, + "logps/rejected": -559.44580078125, + "loss": 0.4547, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7019818425178528, + "rewards/margins": 0.9280465245246887, + "rewards/rejected": -1.6300283670425415, + "step": 2560 + }, + { + "epoch": 2.79, + "grad_norm": 5.386088412615246, + "learning_rate": 2.850576352572916e-07, + "logits/chosen": 3.428292751312256, + "logits/rejected": 3.427203416824341, + "logps/chosen": -631.5196533203125, + "logps/rejected": -579.7611694335938, + "loss": 0.4626, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.8275982737541199, + "rewards/margins": 0.9573984146118164, + "rewards/rejected": -1.784996747970581, + "step": 2570 + }, + { + "epoch": 2.8, + "grad_norm": 3.619396695108091, + "learning_rate": 2.8362388655912826e-07, + "logits/chosen": 3.3158061504364014, + "logits/rejected": 3.344578981399536, + "logps/chosen": -585.06689453125, + "logps/rejected": -551.7364501953125, + "loss": 0.4544, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7403179407119751, + "rewards/margins": 0.9482278823852539, + "rewards/rejected": -1.688545823097229, + "step": 2580 + }, + { + "epoch": 2.81, + "grad_norm": 3.43659878268593, + "learning_rate": 2.821890106975996e-07, + "logits/chosen": 3.4369442462921143, + "logits/rejected": 3.549891710281372, + "logps/chosen": -687.2131958007812, + "logps/rejected": -632.4449462890625, + "loss": 0.4885, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.4967229962348938, + "rewards/margins": 0.9954892992973328, + "rewards/rejected": -1.492212176322937, + "step": 2590 + }, + { + "epoch": 2.82, + "grad_norm": 3.268971458217506, + "learning_rate": 2.807530557736144e-07, + "logits/chosen": 3.5161678791046143, + "logits/rejected": 3.4077048301696777, + "logps/chosen": -585.3170166015625, + "logps/rejected": -609.959716796875, + "loss": 0.467, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.6477356553077698, + "rewards/margins": 0.7576956748962402, + "rewards/rejected": -1.4054313898086548, + "step": 2600 + }, + { + "epoch": 2.82, + "eval_logits/chosen": 3.499997854232788, + "eval_logits/rejected": 3.569040060043335, + "eval_logps/chosen": -602.4325561523438, + "eval_logps/rejected": -569.2332763671875, + "eval_loss": 0.4865441918373108, + "eval_rewards/accuracies": 0.7876983880996704, + "eval_rewards/chosen": -0.590956449508667, + "eval_rewards/margins": 0.8606314659118652, + "eval_rewards/rejected": -1.4515879154205322, + "eval_runtime": 203.3268, + "eval_samples_per_second": 9.836, + "eval_steps_per_second": 0.31, + "step": 2600 + }, + { + "epoch": 2.83, + "grad_norm": 4.278479470624274, + "learning_rate": 2.793160699242548e-07, + "logits/chosen": 3.505661725997925, + "logits/rejected": 3.3195648193359375, + "logps/chosen": -605.200927734375, + "logps/rejected": -576.4964599609375, + "loss": 0.4454, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.6035189628601074, + "rewards/margins": 0.8821620941162109, + "rewards/rejected": -1.4856809377670288, + "step": 2610 + }, + { + "epoch": 2.84, + "grad_norm": 4.114384313390409, + "learning_rate": 2.7787810132116196e-07, + "logits/chosen": 3.464247465133667, + "logits/rejected": 3.4501852989196777, + "logps/chosen": -594.6990356445312, + "logps/rejected": -574.1302490234375, + "loss": 0.4631, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7224303483963013, + "rewards/margins": 0.9456484913825989, + "rewards/rejected": -1.6680786609649658, + "step": 2620 + }, + { + "epoch": 2.86, + "grad_norm": 4.081478700217825, + "learning_rate": 2.7643919816892215e-07, + "logits/chosen": 3.2647666931152344, + "logits/rejected": 3.3138420581817627, + "logps/chosen": -562.5584106445312, + "logps/rejected": -558.0347900390625, + "loss": 0.4537, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.8194862604141235, + "rewards/margins": 0.8074823617935181, + "rewards/rejected": -1.6269683837890625, + "step": 2630 + }, + { + "epoch": 2.87, + "grad_norm": 4.121806949453411, + "learning_rate": 2.749994087034498e-07, + "logits/chosen": 3.2870640754699707, + "logits/rejected": 3.3868587017059326, + "logps/chosen": -646.748779296875, + "logps/rejected": -587.2466430664062, + "loss": 0.4233, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.8164499998092651, + "rewards/margins": 1.0539710521697998, + "rewards/rejected": -1.870421051979065, + "step": 2640 + }, + { + "epoch": 2.88, + "grad_norm": 3.75911473750284, + "learning_rate": 2.7355878119037097e-07, + "logits/chosen": 3.459559679031372, + "logits/rejected": 3.4844887256622314, + "logps/chosen": -644.4876098632812, + "logps/rejected": -624.3375854492188, + "loss": 0.435, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9554063677787781, + "rewards/margins": 1.0268898010253906, + "rewards/rejected": -1.9822959899902344, + "step": 2650 + }, + { + "epoch": 2.89, + "grad_norm": 5.672731475848124, + "learning_rate": 2.7211736392340567e-07, + "logits/chosen": 3.3097336292266846, + "logits/rejected": 3.4127914905548096, + "logps/chosen": -670.7672119140625, + "logps/rejected": -658.6070556640625, + "loss": 0.4542, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9260674715042114, + "rewards/margins": 1.0306978225708008, + "rewards/rejected": -1.9567651748657227, + "step": 2660 + }, + { + "epoch": 2.9, + "grad_norm": 3.944618363860535, + "learning_rate": 2.706752052227483e-07, + "logits/chosen": 3.387530565261841, + "logits/rejected": 3.305820941925049, + "logps/chosen": -629.259521484375, + "logps/rejected": -633.9971923828125, + "loss": 0.4659, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.833823025226593, + "rewards/margins": 0.9765887260437012, + "rewards/rejected": -1.810411810874939, + "step": 2670 + }, + { + "epoch": 2.91, + "grad_norm": 5.4320141900216115, + "learning_rate": 2.692323534334481e-07, + "logits/chosen": 3.3067660331726074, + "logits/rejected": 3.3124756813049316, + "logps/chosen": -656.865234375, + "logps/rejected": -581.8089599609375, + "loss": 0.4874, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8518417477607727, + "rewards/margins": 1.005443811416626, + "rewards/rejected": -1.857285737991333, + "step": 2680 + }, + { + "epoch": 2.92, + "grad_norm": 3.7610827920428194, + "learning_rate": 2.6778885692378866e-07, + "logits/chosen": 3.406977415084839, + "logits/rejected": 3.33898663520813, + "logps/chosen": -633.6414794921875, + "logps/rejected": -593.6680297851562, + "loss": 0.433, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.656938374042511, + "rewards/margins": 1.116265058517456, + "rewards/rejected": -1.7732034921646118, + "step": 2690 + }, + { + "epoch": 2.93, + "grad_norm": 3.432377080662618, + "learning_rate": 2.663447640836663e-07, + "logits/chosen": 3.429065704345703, + "logits/rejected": 3.451131820678711, + "logps/chosen": -617.0606689453125, + "logps/rejected": -597.2569580078125, + "loss": 0.458, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.8843738436698914, + "rewards/margins": 0.9283639788627625, + "rewards/rejected": -1.812738060951233, + "step": 2700 + }, + { + "epoch": 2.93, + "eval_logits/chosen": 3.457927942276001, + "eval_logits/rejected": 3.5208330154418945, + "eval_logps/chosen": -620.0014038085938, + "eval_logps/rejected": -591.329833984375, + "eval_loss": 0.48605242371559143, + "eval_rewards/accuracies": 0.783730149269104, + "eval_rewards/chosen": -0.7666451334953308, + "eval_rewards/margins": 0.9059080481529236, + "eval_rewards/rejected": -1.672553300857544, + "eval_runtime": 203.0125, + "eval_samples_per_second": 9.852, + "eval_steps_per_second": 0.31, + "step": 2700 + }, + { + "epoch": 2.94, + "grad_norm": 3.5814744568515438, + "learning_rate": 2.6490012332296796e-07, + "logits/chosen": 3.4098687171936035, + "logits/rejected": 3.5238165855407715, + "logps/chosen": -650.0216674804688, + "logps/rejected": -568.7059326171875, + "loss": 0.4647, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.8275938034057617, + "rewards/margins": 0.9160548448562622, + "rewards/rejected": -1.7436487674713135, + "step": 2710 + }, + { + "epoch": 2.95, + "grad_norm": 3.7058563105254945, + "learning_rate": 2.634549830699483e-07, + "logits/chosen": 3.4752769470214844, + "logits/rejected": 3.4913477897644043, + "logps/chosen": -596.112548828125, + "logps/rejected": -571.9755859375, + "loss": 0.4499, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.6368006467819214, + "rewards/margins": 0.9434793591499329, + "rewards/rejected": -1.580280065536499, + "step": 2720 + }, + { + "epoch": 2.96, + "grad_norm": 4.263349011982545, + "learning_rate": 2.620093917696063e-07, + "logits/chosen": 3.3458282947540283, + "logits/rejected": 3.513139247894287, + "logps/chosen": -624.7337646484375, + "logps/rejected": -580.9771728515625, + "loss": 0.4574, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8148934245109558, + "rewards/margins": 0.9292934536933899, + "rewards/rejected": -1.7441869974136353, + "step": 2730 + }, + { + "epoch": 2.98, + "grad_norm": 3.9498017642080914, + "learning_rate": 2.605633978820613e-07, + "logits/chosen": 3.3828988075256348, + "logits/rejected": 3.3466384410858154, + "logps/chosen": -639.5057373046875, + "logps/rejected": -615.6836547851562, + "loss": 0.4581, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8807100057601929, + "rewards/margins": 1.0138176679611206, + "rewards/rejected": -1.894527792930603, + "step": 2740 + }, + { + "epoch": 2.99, + "grad_norm": 3.788532661853087, + "learning_rate": 2.591170498809284e-07, + "logits/chosen": 3.449664354324341, + "logits/rejected": 3.54020357131958, + "logps/chosen": -686.0653076171875, + "logps/rejected": -632.3584594726562, + "loss": 0.4519, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7908682823181152, + "rewards/margins": 0.9529739618301392, + "rewards/rejected": -1.7438421249389648, + "step": 2750 + }, + { + "epoch": 3.0, + "grad_norm": 4.184260963488442, + "learning_rate": 2.576703962516937e-07, + "logits/chosen": 3.3226122856140137, + "logits/rejected": 3.37495493888855, + "logps/chosen": -627.0601196289062, + "logps/rejected": -629.4278564453125, + "loss": 0.4632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8550036549568176, + "rewards/margins": 0.9417628049850464, + "rewards/rejected": -1.7967665195465088, + "step": 2760 + }, + { + "epoch": 3.01, + "grad_norm": 3.8352731309247283, + "learning_rate": 2.5622348549008854e-07, + "logits/chosen": 3.4446792602539062, + "logits/rejected": 3.4306163787841797, + "logps/chosen": -592.9773559570312, + "logps/rejected": -580.4371337890625, + "loss": 0.4588, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.8938300013542175, + "rewards/margins": 1.0764323472976685, + "rewards/rejected": -1.9702622890472412, + "step": 2770 + }, + { + "epoch": 3.02, + "grad_norm": 4.151038171191166, + "learning_rate": 2.547763661004642e-07, + "logits/chosen": 3.352771759033203, + "logits/rejected": 3.3830676078796387, + "logps/chosen": -684.36279296875, + "logps/rejected": -622.1414794921875, + "loss": 0.4183, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.8378265500068665, + "rewards/margins": 0.9874498248100281, + "rewards/rejected": -1.8252766132354736, + "step": 2780 + }, + { + "epoch": 3.03, + "grad_norm": 4.584896186132231, + "learning_rate": 2.533290865941658e-07, + "logits/chosen": 3.3703866004943848, + "logits/rejected": 3.248610258102417, + "logps/chosen": -632.7425537109375, + "logps/rejected": -617.5534057617188, + "loss": 0.4505, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.9305559396743774, + "rewards/margins": 1.0056320428848267, + "rewards/rejected": -1.9361881017684937, + "step": 2790 + }, + { + "epoch": 3.04, + "grad_norm": 3.9983708065161148, + "learning_rate": 2.518816954879057e-07, + "logits/chosen": 3.4359169006347656, + "logits/rejected": 3.474580764770508, + "logps/chosen": -594.47900390625, + "logps/rejected": -558.6627197265625, + "loss": 0.462, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8003416061401367, + "rewards/margins": 0.8039595484733582, + "rewards/rejected": -1.6043012142181396, + "step": 2800 + }, + { + "epoch": 3.04, + "eval_logits/chosen": 3.49540376663208, + "eval_logits/rejected": 3.5552937984466553, + "eval_logps/chosen": -614.4227294921875, + "eval_logps/rejected": -585.52685546875, + "eval_loss": 0.4844111204147339, + "eval_rewards/accuracies": 0.7916666865348816, + "eval_rewards/chosen": -0.7108585834503174, + "eval_rewards/margins": 0.9036649465560913, + "eval_rewards/rejected": -1.6145235300064087, + "eval_runtime": 203.0036, + "eval_samples_per_second": 9.852, + "eval_steps_per_second": 0.31, + "step": 2800 + }, + { + "epoch": 3.05, + "grad_norm": 4.375685459289675, + "learning_rate": 2.504342413021377e-07, + "logits/chosen": 3.48063588142395, + "logits/rejected": 3.4624500274658203, + "logps/chosen": -573.4635009765625, + "logps/rejected": -599.910888671875, + "loss": 0.4279, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.8659281730651855, + "rewards/margins": 0.9196036458015442, + "rewards/rejected": -1.785531759262085, + "step": 2810 + }, + { + "epoch": 3.06, + "grad_norm": 3.448289578171233, + "learning_rate": 2.4898677255943006e-07, + "logits/chosen": 3.5123367309570312, + "logits/rejected": 3.425743579864502, + "logps/chosen": -659.736572265625, + "logps/rejected": -652.95458984375, + "loss": 0.4334, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.8693497776985168, + "rewards/margins": 0.971311092376709, + "rewards/rejected": -1.8406610488891602, + "step": 2820 + }, + { + "epoch": 3.07, + "grad_norm": 4.581154617393512, + "learning_rate": 2.47539337782839e-07, + "logits/chosen": 3.2755444049835205, + "logits/rejected": 3.3720130920410156, + "logps/chosen": -644.5631103515625, + "logps/rejected": -581.09228515625, + "loss": 0.4364, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9709026217460632, + "rewards/margins": 1.0490353107452393, + "rewards/rejected": -2.0199379920959473, + "step": 2830 + }, + { + "epoch": 3.08, + "grad_norm": 3.7613899547216763, + "learning_rate": 2.460919854942822e-07, + "logits/chosen": 3.4118473529815674, + "logits/rejected": 3.4696223735809326, + "logps/chosen": -647.2174072265625, + "logps/rejected": -637.8060302734375, + "loss": 0.4491, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9573391675949097, + "rewards/margins": 0.8948485255241394, + "rewards/rejected": -1.8521878719329834, + "step": 2840 + }, + { + "epoch": 3.09, + "grad_norm": 3.2320273947918983, + "learning_rate": 2.44644764212912e-07, + "logits/chosen": 3.3305840492248535, + "logits/rejected": 3.2426304817199707, + "logps/chosen": -624.9818115234375, + "logps/rejected": -609.2310791015625, + "loss": 0.4306, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7739254832267761, + "rewards/margins": 0.9810514450073242, + "rewards/rejected": -1.7549769878387451, + "step": 2850 + }, + { + "epoch": 3.11, + "grad_norm": 3.7286268455772755, + "learning_rate": 2.4319772245348927e-07, + "logits/chosen": 3.474033832550049, + "logits/rejected": 3.4578864574432373, + "logps/chosen": -619.8948974609375, + "logps/rejected": -615.1239013671875, + "loss": 0.4383, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.7849314212799072, + "rewards/margins": 1.0505647659301758, + "rewards/rejected": -1.835496187210083, + "step": 2860 + }, + { + "epoch": 3.12, + "grad_norm": 3.9099335431162823, + "learning_rate": 2.4175090872475645e-07, + "logits/chosen": 3.5397415161132812, + "logits/rejected": 3.4615206718444824, + "logps/chosen": -599.0786743164062, + "logps/rejected": -655.3748779296875, + "loss": 0.4652, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.7803353071212769, + "rewards/margins": 0.8833073377609253, + "rewards/rejected": -1.6636425256729126, + "step": 2870 + }, + { + "epoch": 3.13, + "grad_norm": 3.9574565601831293, + "learning_rate": 2.40304371527812e-07, + "logits/chosen": 3.4028964042663574, + "logits/rejected": 3.4590251445770264, + "logps/chosen": -649.6727294921875, + "logps/rejected": -638.2109985351562, + "loss": 0.4611, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.9411758184432983, + "rewards/margins": 0.9360073804855347, + "rewards/rejected": -1.8771835565567017, + "step": 2880 + }, + { + "epoch": 3.14, + "grad_norm": 3.395616713371064, + "learning_rate": 2.3885815935448435e-07, + "logits/chosen": 3.278578519821167, + "logits/rejected": 3.398362636566162, + "logps/chosen": -619.1382446289062, + "logps/rejected": -610.914306640625, + "loss": 0.4205, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0494105815887451, + "rewards/margins": 1.2443387508392334, + "rewards/rejected": -2.2937493324279785, + "step": 2890 + }, + { + "epoch": 3.15, + "grad_norm": 4.10371958008165, + "learning_rate": 2.3741232068570605e-07, + "logits/chosen": 3.4090774059295654, + "logits/rejected": 3.3995368480682373, + "logps/chosen": -701.1258544921875, + "logps/rejected": -666.7429809570312, + "loss": 0.4258, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.9114104509353638, + "rewards/margins": 1.1804288625717163, + "rewards/rejected": -2.09183931350708, + "step": 2900 + }, + { + "epoch": 3.15, + "eval_logits/chosen": 3.4227328300476074, + "eval_logits/rejected": 3.476104259490967, + "eval_logps/chosen": -641.4771728515625, + "eval_logps/rejected": -618.2141723632812, + "eval_loss": 0.48884913325309753, + "eval_rewards/accuracies": 0.7817460298538208, + "eval_rewards/chosen": -0.9814031720161438, + "eval_rewards/margins": 0.9599937200546265, + "eval_rewards/rejected": -1.9413968324661255, + "eval_runtime": 203.217, + "eval_samples_per_second": 9.842, + "eval_steps_per_second": 0.31, + "step": 2900 + }, + { + "epoch": 3.16, + "grad_norm": 4.071596946207353, + "learning_rate": 2.3596690398988903e-07, + "logits/chosen": 3.367732286453247, + "logits/rejected": 3.335725784301758, + "logps/chosen": -563.6433715820312, + "logps/rejected": -563.1458740234375, + "loss": 0.4532, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1280577182769775, + "rewards/margins": 1.1104881763458252, + "rewards/rejected": -2.2385458946228027, + "step": 2910 + }, + { + "epoch": 3.17, + "grad_norm": 3.3245284364131638, + "learning_rate": 2.3452195772129937e-07, + "logits/chosen": 3.4156277179718018, + "logits/rejected": 3.4742846488952637, + "logps/chosen": -667.6085205078125, + "logps/rejected": -634.3673095703125, + "loss": 0.4439, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9609830975532532, + "rewards/margins": 1.044752597808838, + "rewards/rejected": -2.0057358741760254, + "step": 2920 + }, + { + "epoch": 3.18, + "grad_norm": 3.6395486327587605, + "learning_rate": 2.3307753031843312e-07, + "logits/chosen": 3.436938524246216, + "logits/rejected": 3.4605090618133545, + "logps/chosen": -603.6176147460938, + "logps/rejected": -600.2156982421875, + "loss": 0.4564, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8700806498527527, + "rewards/margins": 1.0620027780532837, + "rewards/rejected": -1.9320834875106812, + "step": 2930 + }, + { + "epoch": 3.19, + "grad_norm": 4.305925067636892, + "learning_rate": 2.3163367020239264e-07, + "logits/chosen": 3.4125266075134277, + "logits/rejected": 3.426459550857544, + "logps/chosen": -641.9744262695312, + "logps/rejected": -601.0375366210938, + "loss": 0.4413, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.8042587041854858, + "rewards/margins": 1.035605788230896, + "rewards/rejected": -1.8398644924163818, + "step": 2940 + }, + { + "epoch": 3.2, + "grad_norm": 3.7325297948726406, + "learning_rate": 2.3019042577526337e-07, + "logits/chosen": 3.386017322540283, + "logits/rejected": 3.4424185752868652, + "logps/chosen": -706.9832763671875, + "logps/rejected": -649.3670654296875, + "loss": 0.4625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8057816624641418, + "rewards/margins": 0.9790631532669067, + "rewards/rejected": -1.7848447561264038, + "step": 2950 + }, + { + "epoch": 3.21, + "grad_norm": 5.9773377993199475, + "learning_rate": 2.2874784541849105e-07, + "logits/chosen": 3.3580546379089355, + "logits/rejected": 3.3814473152160645, + "logps/chosen": -612.360595703125, + "logps/rejected": -565.7884521484375, + "loss": 0.4429, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.806524395942688, + "rewards/margins": 0.977981448173523, + "rewards/rejected": -1.78450608253479, + "step": 2960 + }, + { + "epoch": 3.22, + "grad_norm": 4.99474748571327, + "learning_rate": 2.2730597749126014e-07, + "logits/chosen": 3.396486759185791, + "logits/rejected": 3.502777099609375, + "logps/chosen": -637.4274291992188, + "logps/rejected": -604.4832763671875, + "loss": 0.4255, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8480658531188965, + "rewards/margins": 1.0878616571426392, + "rewards/rejected": -1.9359276294708252, + "step": 2970 + }, + { + "epoch": 3.24, + "grad_norm": 4.333271910357194, + "learning_rate": 2.2586487032887237e-07, + "logits/chosen": 3.3672001361846924, + "logits/rejected": 3.4402644634246826, + "logps/chosen": -658.7831420898438, + "logps/rejected": -598.9805297851562, + "loss": 0.4417, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0384938716888428, + "rewards/margins": 0.9810712933540344, + "rewards/rejected": -2.0195651054382324, + "step": 2980 + }, + { + "epoch": 3.25, + "grad_norm": 6.272194848676277, + "learning_rate": 2.2442457224112676e-07, + "logits/chosen": 3.4500479698181152, + "logits/rejected": 3.4857699871063232, + "logps/chosen": -698.882568359375, + "logps/rejected": -676.0650634765625, + "loss": 0.4398, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.004777193069458, + "rewards/margins": 1.0942362546920776, + "rewards/rejected": -2.099013566970825, + "step": 2990 + }, + { + "epoch": 3.26, + "grad_norm": 4.330090833439255, + "learning_rate": 2.229851315106999e-07, + "logits/chosen": 3.323119640350342, + "logits/rejected": 3.391847610473633, + "logps/chosen": -649.1282348632812, + "logps/rejected": -610.1156616210938, + "loss": 0.4219, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.028066873550415, + "rewards/margins": 1.0071299076080322, + "rewards/rejected": -2.0351967811584473, + "step": 3000 + }, + { + "epoch": 3.26, + "eval_logits/chosen": 3.4361824989318848, + "eval_logits/rejected": 3.4894895553588867, + "eval_logps/chosen": -631.9180908203125, + "eval_logps/rejected": -607.30712890625, + "eval_loss": 0.4856274127960205, + "eval_rewards/accuracies": 0.7936508059501648, + "eval_rewards/chosen": -0.885812520980835, + "eval_rewards/margins": 0.9465143084526062, + "eval_rewards/rejected": -1.8323270082473755, + "eval_runtime": 203.2097, + "eval_samples_per_second": 9.842, + "eval_steps_per_second": 0.31, + "step": 3000 + }, + { + "epoch": 3.27, + "grad_norm": 3.9494363632047027, + "learning_rate": 2.2154659639152728e-07, + "logits/chosen": 3.4298388957977295, + "logits/rejected": 3.5834994316101074, + "logps/chosen": -623.3873291015625, + "logps/rejected": -590.0366821289062, + "loss": 0.4373, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.8894636034965515, + "rewards/margins": 0.947036862373352, + "rewards/rejected": -1.8365005254745483, + "step": 3010 + }, + { + "epoch": 3.28, + "grad_norm": 4.37004923030797, + "learning_rate": 2.2010901510718623e-07, + "logits/chosen": 3.4085888862609863, + "logits/rejected": 3.4778428077697754, + "logps/chosen": -661.8770141601562, + "logps/rejected": -620.317138671875, + "loss": 0.4165, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7813987135887146, + "rewards/margins": 1.2368358373641968, + "rewards/rejected": -2.0182344913482666, + "step": 3020 + }, + { + "epoch": 3.29, + "grad_norm": 4.000055891899185, + "learning_rate": 2.186724358492785e-07, + "logits/chosen": 3.410771131515503, + "logits/rejected": 3.4011940956115723, + "logps/chosen": -627.5743408203125, + "logps/rejected": -614.0305786132812, + "loss": 0.4333, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.7949288487434387, + "rewards/margins": 1.1482188701629639, + "rewards/rejected": -1.943147897720337, + "step": 3030 + }, + { + "epoch": 3.3, + "grad_norm": 3.8870438933659472, + "learning_rate": 2.1723690677581567e-07, + "logits/chosen": 3.366727352142334, + "logits/rejected": 3.4489874839782715, + "logps/chosen": -683.2049560546875, + "logps/rejected": -597.3983154296875, + "loss": 0.4399, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9306663274765015, + "rewards/margins": 0.9225971102714539, + "rewards/rejected": -1.8532634973526, + "step": 3040 + }, + { + "epoch": 3.31, + "grad_norm": 3.621627142820652, + "learning_rate": 2.1580247600960392e-07, + "logits/chosen": 3.2974860668182373, + "logits/rejected": 3.3342857360839844, + "logps/chosen": -685.1504516601562, + "logps/rejected": -596.3431396484375, + "loss": 0.4464, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.7522670030593872, + "rewards/margins": 1.0248292684555054, + "rewards/rejected": -1.7770963907241821, + "step": 3050 + }, + { + "epoch": 3.32, + "grad_norm": 3.5254386902793557, + "learning_rate": 2.1436919163663153e-07, + "logits/chosen": 3.359710693359375, + "logits/rejected": 3.3922533988952637, + "logps/chosen": -635.1802978515625, + "logps/rejected": -596.8336181640625, + "loss": 0.4298, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.8741143345832825, + "rewards/margins": 1.2291319370269775, + "rewards/rejected": -2.1032462120056152, + "step": 3060 + }, + { + "epoch": 3.33, + "grad_norm": 3.475210735676778, + "learning_rate": 2.1293710170445633e-07, + "logits/chosen": 3.355104446411133, + "logits/rejected": 3.3470497131347656, + "logps/chosen": -659.673095703125, + "logps/rejected": -600.4511108398438, + "loss": 0.4592, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9288552403450012, + "rewards/margins": 0.7822314500808716, + "rewards/rejected": -1.711086630821228, + "step": 3070 + }, + { + "epoch": 3.34, + "grad_norm": 3.963907942625747, + "learning_rate": 2.1150625422059537e-07, + "logits/chosen": 3.4125232696533203, + "logits/rejected": 3.4644877910614014, + "logps/chosen": -651.6043090820312, + "logps/rejected": -637.5367431640625, + "loss": 0.4183, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.7479305267333984, + "rewards/margins": 1.0954493284225464, + "rewards/rejected": -1.8433797359466553, + "step": 3080 + }, + { + "epoch": 3.36, + "grad_norm": 4.79735214093586, + "learning_rate": 2.100766971509156e-07, + "logits/chosen": 3.3415913581848145, + "logits/rejected": 3.482771396636963, + "logps/chosen": -642.8465576171875, + "logps/rejected": -587.8502197265625, + "loss": 0.4427, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8390420079231262, + "rewards/margins": 1.0515780448913574, + "rewards/rejected": -1.8906199932098389, + "step": 3090 + }, + { + "epoch": 3.37, + "grad_norm": 3.9755901299610765, + "learning_rate": 2.0864847841802555e-07, + "logits/chosen": 3.313795566558838, + "logits/rejected": 3.35662841796875, + "logps/chosen": -625.80615234375, + "logps/rejected": -577.7479248046875, + "loss": 0.4295, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9515978097915649, + "rewards/margins": 1.0637649297714233, + "rewards/rejected": -2.0153627395629883, + "step": 3100 + }, + { + "epoch": 3.37, + "eval_logits/chosen": 3.4356937408447266, + "eval_logits/rejected": 3.4879915714263916, + "eval_logps/chosen": -624.732666015625, + "eval_logps/rejected": -600.5797119140625, + "eval_loss": 0.48225274682044983, + "eval_rewards/accuracies": 0.7976190447807312, + "eval_rewards/chosen": -0.8139576315879822, + "eval_rewards/margins": 0.9510951042175293, + "eval_rewards/rejected": -1.7650526762008667, + "eval_runtime": 203.1657, + "eval_samples_per_second": 9.844, + "eval_steps_per_second": 0.31, + "step": 3100 + }, + { + "epoch": 3.38, + "grad_norm": 3.5189161775928937, + "learning_rate": 2.0722164589966936e-07, + "logits/chosen": 3.437371015548706, + "logits/rejected": 3.6356558799743652, + "logps/chosen": -656.8878173828125, + "logps/rejected": -565.6344604492188, + "loss": 0.4507, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.805374026298523, + "rewards/margins": 1.0437090396881104, + "rewards/rejected": -1.8490829467773438, + "step": 3110 + }, + { + "epoch": 3.39, + "grad_norm": 4.645784250708397, + "learning_rate": 2.0579624742712128e-07, + "logits/chosen": 3.2445125579833984, + "logits/rejected": 3.2976136207580566, + "logps/chosen": -600.2672729492188, + "logps/rejected": -565.233642578125, + "loss": 0.4238, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8405818939208984, + "rewards/margins": 1.0852935314178467, + "rewards/rejected": -1.9258753061294556, + "step": 3120 + }, + { + "epoch": 3.4, + "grad_norm": 3.405528009644704, + "learning_rate": 2.0437233078358275e-07, + "logits/chosen": 3.4257736206054688, + "logits/rejected": 3.4833385944366455, + "logps/chosen": -643.4204711914062, + "logps/rejected": -617.6407470703125, + "loss": 0.4252, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -0.658096194267273, + "rewards/margins": 0.9798544645309448, + "rewards/rejected": -1.6379506587982178, + "step": 3130 + }, + { + "epoch": 3.41, + "grad_norm": 3.8784222204050627, + "learning_rate": 2.0294994370258e-07, + "logits/chosen": 3.4538047313690186, + "logits/rejected": 3.4483654499053955, + "logps/chosen": -607.20654296875, + "logps/rejected": -605.8489990234375, + "loss": 0.439, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9570397138595581, + "rewards/margins": 1.0782394409179688, + "rewards/rejected": -2.0352795124053955, + "step": 3140 + }, + { + "epoch": 3.42, + "grad_norm": 3.5584730989638045, + "learning_rate": 2.015291338663644e-07, + "logits/chosen": 3.2920451164245605, + "logits/rejected": 3.3430659770965576, + "logps/chosen": -635.3215942382812, + "logps/rejected": -594.5906372070312, + "loss": 0.4207, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8983471989631653, + "rewards/margins": 1.1466476917266846, + "rewards/rejected": -2.044994831085205, + "step": 3150 + }, + { + "epoch": 3.43, + "grad_norm": 5.678551225331759, + "learning_rate": 2.001099489043138e-07, + "logits/chosen": 3.4382717609405518, + "logits/rejected": 3.3808326721191406, + "logps/chosen": -643.2792358398438, + "logps/rejected": -634.2994384765625, + "loss": 0.4408, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.7990803718566895, + "rewards/margins": 0.9194300770759583, + "rewards/rejected": -1.718510389328003, + "step": 3160 + }, + { + "epoch": 3.44, + "grad_norm": 5.085374543246204, + "learning_rate": 1.9869243639133577e-07, + "logits/chosen": 3.4107460975646973, + "logits/rejected": 3.396080732345581, + "logps/chosen": -594.4137573242188, + "logps/rejected": -613.5217895507812, + "loss": 0.4556, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.7500702142715454, + "rewards/margins": 1.1137340068817139, + "rewards/rejected": -1.8638041019439697, + "step": 3170 + }, + { + "epoch": 3.45, + "grad_norm": 4.250629396797435, + "learning_rate": 1.9727664384627306e-07, + "logits/chosen": 3.4325294494628906, + "logits/rejected": 3.4035377502441406, + "logps/chosen": -596.2446899414062, + "logps/rejected": -564.5435791015625, + "loss": 0.4234, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.822806179523468, + "rewards/margins": 1.038006067276001, + "rewards/rejected": -1.8608121871948242, + "step": 3180 + }, + { + "epoch": 3.46, + "grad_norm": 4.019039940993413, + "learning_rate": 1.9586261873031025e-07, + "logits/chosen": 3.2630324363708496, + "logits/rejected": 3.3803462982177734, + "logps/chosen": -616.1365966796875, + "logps/rejected": -603.97216796875, + "loss": 0.4576, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8340195417404175, + "rewards/margins": 1.0018597841262817, + "rewards/rejected": -1.8358793258666992, + "step": 3190 + }, + { + "epoch": 3.47, + "grad_norm": 4.141701500597555, + "learning_rate": 1.9445040844538313e-07, + "logits/chosen": 3.283325672149658, + "logits/rejected": 3.3204421997070312, + "logps/chosen": -604.6246948242188, + "logps/rejected": -588.3277587890625, + "loss": 0.4268, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.9514004588127136, + "rewards/margins": 0.8670446276664734, + "rewards/rejected": -1.8184449672698975, + "step": 3200 + }, + { + "epoch": 3.47, + "eval_logits/chosen": 3.4126436710357666, + "eval_logits/rejected": 3.453640937805176, + "eval_logps/chosen": -629.2567138671875, + "eval_logps/rejected": -606.8929443359375, + "eval_loss": 0.480047345161438, + "eval_rewards/accuracies": 0.7976190447807312, + "eval_rewards/chosen": -0.8591986298561096, + "eval_rewards/margins": 0.9689861536026001, + "eval_rewards/rejected": -1.8281848430633545, + "eval_runtime": 203.0923, + "eval_samples_per_second": 9.848, + "eval_steps_per_second": 0.31, + "step": 3200 + }, + { + "epoch": 3.49, + "grad_norm": 6.599030236425171, + "learning_rate": 1.930400603325893e-07, + "logits/chosen": 3.302114963531494, + "logits/rejected": 3.29858136177063, + "logps/chosen": -666.8406982421875, + "logps/rejected": -597.25927734375, + "loss": 0.4352, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0344960689544678, + "rewards/margins": 1.0878442525863647, + "rewards/rejected": -2.122340679168701, + "step": 3210 + }, + { + "epoch": 3.5, + "grad_norm": 5.78140760080788, + "learning_rate": 1.9163162167060144e-07, + "logits/chosen": 3.367105484008789, + "logits/rejected": 3.3133113384246826, + "logps/chosen": -611.474365234375, + "logps/rejected": -590.4954833984375, + "loss": 0.4498, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9751068353652954, + "rewards/margins": 0.9375247955322266, + "rewards/rejected": -1.912631630897522, + "step": 3220 + }, + { + "epoch": 3.51, + "grad_norm": 4.831797084110146, + "learning_rate": 1.9022513967408227e-07, + "logits/chosen": 3.2902634143829346, + "logits/rejected": 3.23241925239563, + "logps/chosen": -622.2492065429688, + "logps/rejected": -596.3099365234375, + "loss": 0.4465, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.8274933695793152, + "rewards/margins": 1.0548157691955566, + "rewards/rejected": -1.8823089599609375, + "step": 3230 + }, + { + "epoch": 3.52, + "grad_norm": 3.8203582598938897, + "learning_rate": 1.8882066149210164e-07, + "logits/chosen": 3.298330307006836, + "logits/rejected": 3.444213390350342, + "logps/chosen": -691.8465576171875, + "logps/rejected": -577.5040283203125, + "loss": 0.433, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8670648336410522, + "rewards/margins": 1.0957075357437134, + "rewards/rejected": -1.9627723693847656, + "step": 3240 + }, + { + "epoch": 3.53, + "grad_norm": 4.931291520321011, + "learning_rate": 1.8741823420655642e-07, + "logits/chosen": 3.3172030448913574, + "logits/rejected": 3.2955126762390137, + "logps/chosen": -661.6648559570312, + "logps/rejected": -650.8112182617188, + "loss": 0.4275, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.8657820820808411, + "rewards/margins": 1.2847552299499512, + "rewards/rejected": -2.1505374908447266, + "step": 3250 + }, + { + "epoch": 3.54, + "grad_norm": 4.223378935572041, + "learning_rate": 1.8601790483059165e-07, + "logits/chosen": 3.3851325511932373, + "logits/rejected": 3.3895251750946045, + "logps/chosen": -630.2635498046875, + "logps/rejected": -625.9153442382812, + "loss": 0.3994, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -0.9566653370857239, + "rewards/margins": 1.256395697593689, + "rewards/rejected": -2.2130610942840576, + "step": 3260 + }, + { + "epoch": 3.55, + "grad_norm": 4.152887166652273, + "learning_rate": 1.846197203070249e-07, + "logits/chosen": 3.180065393447876, + "logits/rejected": 3.228161573410034, + "logps/chosen": -635.3770751953125, + "logps/rejected": -615.8402099609375, + "loss": 0.4244, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.1644928455352783, + "rewards/margins": 1.269607663154602, + "rewards/rejected": -2.43410062789917, + "step": 3270 + }, + { + "epoch": 3.56, + "grad_norm": 5.405068734416844, + "learning_rate": 1.8322372750677247e-07, + "logits/chosen": 3.346701145172119, + "logits/rejected": 3.2889461517333984, + "logps/chosen": -687.7290649414062, + "logps/rejected": -670.6973876953125, + "loss": 0.4369, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.013751745223999, + "rewards/margins": 1.1379069089889526, + "rewards/rejected": -2.151658535003662, + "step": 3280 + }, + { + "epoch": 3.57, + "grad_norm": 4.239245009187299, + "learning_rate": 1.8182997322727828e-07, + "logits/chosen": 3.3153834342956543, + "logits/rejected": 3.370880603790283, + "logps/chosen": -688.4138793945312, + "logps/rejected": -635.6961669921875, + "loss": 0.4318, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.9035437703132629, + "rewards/margins": 1.1481924057006836, + "rewards/rejected": -2.0517361164093018, + "step": 3290 + }, + { + "epoch": 3.58, + "grad_norm": 4.219791148370747, + "learning_rate": 1.8043850419094478e-07, + "logits/chosen": 3.151858329772949, + "logits/rejected": 3.2416319847106934, + "logps/chosen": -656.4683227539062, + "logps/rejected": -635.33740234375, + "loss": 0.4338, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9841516613960266, + "rewards/margins": 1.0485494136810303, + "rewards/rejected": -2.032701015472412, + "step": 3300 + }, + { + "epoch": 3.58, + "eval_logits/chosen": 3.409619092941284, + "eval_logits/rejected": 3.447129249572754, + "eval_logps/chosen": -631.173095703125, + "eval_logps/rejected": -608.6550903320312, + "eval_loss": 0.47853177785873413, + "eval_rewards/accuracies": 0.795634925365448, + "eval_rewards/chosen": -0.878362238407135, + "eval_rewards/margins": 0.967444121837616, + "eval_rewards/rejected": -1.8458064794540405, + "eval_runtime": 203.0003, + "eval_samples_per_second": 9.852, + "eval_steps_per_second": 0.31, + "step": 3300 + }, + { + "epoch": 3.59, + "grad_norm": 3.502741163655858, + "learning_rate": 1.7904936704356715e-07, + "logits/chosen": 3.304037570953369, + "logits/rejected": 3.3648738861083984, + "logps/chosen": -668.6808471679688, + "logps/rejected": -606.8001098632812, + "loss": 0.4399, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.7760350108146667, + "rewards/margins": 0.9891276359558105, + "rewards/rejected": -1.765162706375122, + "step": 3310 + }, + { + "epoch": 3.6, + "grad_norm": 3.2909726228421965, + "learning_rate": 1.7766260835276919e-07, + "logits/chosen": 3.3433239459991455, + "logits/rejected": 3.241948366165161, + "logps/chosen": -656.866943359375, + "logps/rejected": -676.34765625, + "loss": 0.4217, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8042502403259277, + "rewards/margins": 1.105515956878662, + "rewards/rejected": -1.909766435623169, + "step": 3320 + }, + { + "epoch": 3.62, + "grad_norm": 4.470996733143754, + "learning_rate": 1.7627827460644256e-07, + "logits/chosen": 3.405937671661377, + "logits/rejected": 3.56890869140625, + "logps/chosen": -654.4332885742188, + "logps/rejected": -613.1698608398438, + "loss": 0.4156, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8114805221557617, + "rewards/margins": 1.1845698356628418, + "rewards/rejected": -1.996050238609314, + "step": 3330 + }, + { + "epoch": 3.63, + "grad_norm": 4.0251845812339155, + "learning_rate": 1.7489641221118807e-07, + "logits/chosen": 3.2184112071990967, + "logits/rejected": 3.236548900604248, + "logps/chosen": -621.3856201171875, + "logps/rejected": -585.7027587890625, + "loss": 0.4408, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9119928479194641, + "rewards/margins": 1.1261918544769287, + "rewards/rejected": -2.038184642791748, + "step": 3340 + }, + { + "epoch": 3.64, + "grad_norm": 3.0931365871130216, + "learning_rate": 1.7351706749076034e-07, + "logits/chosen": 3.437945604324341, + "logits/rejected": 3.340913772583008, + "logps/chosen": -591.0047607421875, + "logps/rejected": -596.6959228515625, + "loss": 0.4213, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9432751536369324, + "rewards/margins": 0.9771154522895813, + "rewards/rejected": -1.9203904867172241, + "step": 3350 + }, + { + "epoch": 3.65, + "grad_norm": 4.426085674307634, + "learning_rate": 1.7214028668451463e-07, + "logits/chosen": 3.311800003051758, + "logits/rejected": 3.3910815715789795, + "logps/chosen": -671.0889892578125, + "logps/rejected": -613.1412353515625, + "loss": 0.4266, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.7964714169502258, + "rewards/margins": 1.3165581226348877, + "rewards/rejected": -2.1130294799804688, + "step": 3360 + }, + { + "epoch": 3.66, + "grad_norm": 5.41069681121707, + "learning_rate": 1.707661159458569e-07, + "logits/chosen": 3.3232052326202393, + "logits/rejected": 3.3433849811553955, + "logps/chosen": -629.1781005859375, + "logps/rejected": -612.620849609375, + "loss": 0.45, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8367223739624023, + "rewards/margins": 0.9473745226860046, + "rewards/rejected": -1.7840969562530518, + "step": 3370 + }, + { + "epoch": 3.67, + "grad_norm": 3.743840034344002, + "learning_rate": 1.693946013406967e-07, + "logits/chosen": 3.320786714553833, + "logits/rejected": 3.3713016510009766, + "logps/chosen": -645.5264892578125, + "logps/rejected": -607.998291015625, + "loss": 0.421, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8214617967605591, + "rewards/margins": 1.0396496057510376, + "rewards/rejected": -1.8611112833023071, + "step": 3380 + }, + { + "epoch": 3.68, + "grad_norm": 3.799964746211055, + "learning_rate": 1.6802578884590266e-07, + "logits/chosen": 3.4401779174804688, + "logits/rejected": 3.4008584022521973, + "logps/chosen": -610.3036499023438, + "logps/rejected": -604.7216796875, + "loss": 0.4236, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7632851004600525, + "rewards/margins": 1.1178282499313354, + "rewards/rejected": -1.8811134099960327, + "step": 3390 + }, + { + "epoch": 3.69, + "grad_norm": 4.5967188360385585, + "learning_rate": 1.6665972434776154e-07, + "logits/chosen": 3.250286817550659, + "logits/rejected": 3.3413288593292236, + "logps/chosen": -614.2950439453125, + "logps/rejected": -629.7451782226562, + "loss": 0.4297, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9370313882827759, + "rewards/margins": 1.132611870765686, + "rewards/rejected": -2.069643020629883, + "step": 3400 + }, + { + "epoch": 3.69, + "eval_logits/chosen": 3.432600259780884, + "eval_logits/rejected": 3.471015691757202, + "eval_logps/chosen": -633.59619140625, + "eval_logps/rejected": -613.3634033203125, + "eval_loss": 0.477384477853775, + "eval_rewards/accuracies": 0.795634925365448, + "eval_rewards/chosen": -0.9025925397872925, + "eval_rewards/margins": 0.990296483039856, + "eval_rewards/rejected": -1.8928890228271484, + "eval_runtime": 202.9773, + "eval_samples_per_second": 9.853, + "eval_steps_per_second": 0.31, + "step": 3400 + }, + { + "epoch": 3.7, + "grad_norm": 3.5030890328455397, + "learning_rate": 1.652964536404397e-07, + "logits/chosen": 3.3207294940948486, + "logits/rejected": 3.187941312789917, + "logps/chosen": -633.8218994140625, + "logps/rejected": -643.2042236328125, + "loss": 0.4357, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.082877516746521, + "rewards/margins": 0.9908887147903442, + "rewards/rejected": -2.0737662315368652, + "step": 3410 + }, + { + "epoch": 3.71, + "grad_norm": 4.5600807867940265, + "learning_rate": 1.6393602242444826e-07, + "logits/chosen": 3.4069080352783203, + "logits/rejected": 3.401517868041992, + "logps/chosen": -687.3406982421875, + "logps/rejected": -671.100830078125, + "loss": 0.4146, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -0.9297691583633423, + "rewards/margins": 1.207360863685608, + "rewards/rejected": -2.1371302604675293, + "step": 3420 + }, + { + "epoch": 3.72, + "grad_norm": 3.7194151093988306, + "learning_rate": 1.625784763051108e-07, + "logits/chosen": 3.2494473457336426, + "logits/rejected": 3.287376880645752, + "logps/chosen": -664.7005615234375, + "logps/rejected": -613.7723999023438, + "loss": 0.4432, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0459792613983154, + "rewards/margins": 1.1180553436279297, + "rewards/rejected": -2.164034605026245, + "step": 3430 + }, + { + "epoch": 3.74, + "grad_norm": 3.819169373247336, + "learning_rate": 1.6122386079103466e-07, + "logits/chosen": 3.338268280029297, + "logits/rejected": 3.398308515548706, + "logps/chosen": -608.5794067382812, + "logps/rejected": -587.7130737304688, + "loss": 0.4226, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9657732844352722, + "rewards/margins": 0.9586571455001831, + "rewards/rejected": -1.9244304895401, + "step": 3440 + }, + { + "epoch": 3.75, + "grad_norm": 3.7443775545990237, + "learning_rate": 1.5987222129258548e-07, + "logits/chosen": 3.3651328086853027, + "logits/rejected": 3.421379804611206, + "logps/chosen": -688.8221435546875, + "logps/rejected": -598.705322265625, + "loss": 0.4318, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8808202743530273, + "rewards/margins": 1.1631778478622437, + "rewards/rejected": -2.0439980030059814, + "step": 3450 + }, + { + "epoch": 3.76, + "grad_norm": 3.375910739290284, + "learning_rate": 1.585236031203648e-07, + "logits/chosen": 3.4626574516296387, + "logits/rejected": 3.4364490509033203, + "logps/chosen": -666.2822265625, + "logps/rejected": -651.7742919921875, + "loss": 0.4027, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.9553236961364746, + "rewards/margins": 1.0626842975616455, + "rewards/rejected": -2.018007755279541, + "step": 3460 + }, + { + "epoch": 3.77, + "grad_norm": 5.077748851404105, + "learning_rate": 1.571780514836912e-07, + "logits/chosen": 3.360103130340576, + "logits/rejected": 3.424436569213867, + "logps/chosen": -623.270751953125, + "logps/rejected": -584.3328857421875, + "loss": 0.4326, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0589148998260498, + "rewards/margins": 0.9808281660079956, + "rewards/rejected": -2.039742946624756, + "step": 3470 + }, + { + "epoch": 3.78, + "grad_norm": 4.37910107687616, + "learning_rate": 1.5583561148908456e-07, + "logits/chosen": 3.3990330696105957, + "logits/rejected": 3.3405818939208984, + "logps/chosen": -691.7584228515625, + "logps/rejected": -686.2445068359375, + "loss": 0.4209, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9749807119369507, + "rewards/margins": 1.1193406581878662, + "rewards/rejected": -2.0943212509155273, + "step": 3480 + }, + { + "epoch": 3.79, + "grad_norm": 5.237860985497682, + "learning_rate": 1.5449632813875435e-07, + "logits/chosen": 3.300654172897339, + "logits/rejected": 3.2150654792785645, + "logps/chosen": -606.1732788085938, + "logps/rejected": -579.9727783203125, + "loss": 0.4591, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0850937366485596, + "rewards/margins": 0.8942669630050659, + "rewards/rejected": -1.9793609380722046, + "step": 3490 + }, + { + "epoch": 3.8, + "grad_norm": 3.343823323897354, + "learning_rate": 1.531602463290906e-07, + "logits/chosen": 3.3689708709716797, + "logits/rejected": 3.375143051147461, + "logps/chosen": -665.4837036132812, + "logps/rejected": -624.3529052734375, + "loss": 0.4133, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0091758966445923, + "rewards/margins": 1.250361442565918, + "rewards/rejected": -2.2595372200012207, + "step": 3500 + }, + { + "epoch": 3.8, + "eval_logits/chosen": 3.4232242107391357, + "eval_logits/rejected": 3.460996627807617, + "eval_logps/chosen": -635.0674438476562, + "eval_logps/rejected": -614.7964477539062, + "eval_loss": 0.478471577167511, + "eval_rewards/accuracies": 0.7936508059501648, + "eval_rewards/chosen": -0.9173057079315186, + "eval_rewards/margins": 0.9899141788482666, + "eval_rewards/rejected": -1.9072200059890747, + "eval_runtime": 202.9263, + "eval_samples_per_second": 9.856, + "eval_steps_per_second": 0.31, + "step": 3500 + }, + { + "epoch": 3.81, + "grad_norm": 3.4095309561138643, + "learning_rate": 1.5182741084915916e-07, + "logits/chosen": 3.3521697521209717, + "logits/rejected": 3.3369452953338623, + "logps/chosen": -695.1788330078125, + "logps/rejected": -645.53125, + "loss": 0.4328, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.7928581833839417, + "rewards/margins": 0.9856538772583008, + "rewards/rejected": -1.7785122394561768, + "step": 3510 + }, + { + "epoch": 3.82, + "grad_norm": 3.8735531647630554, + "learning_rate": 1.5049786637920023e-07, + "logits/chosen": 3.3590404987335205, + "logits/rejected": 3.385971784591675, + "logps/chosen": -658.2880249023438, + "logps/rejected": -650.6922607421875, + "loss": 0.4226, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9198244214057922, + "rewards/margins": 1.0969880819320679, + "rewards/rejected": -2.016812562942505, + "step": 3520 + }, + { + "epoch": 3.83, + "grad_norm": 3.9959550332302243, + "learning_rate": 1.4917165748913027e-07, + "logits/chosen": 3.255180835723877, + "logits/rejected": 3.1931753158569336, + "logps/chosen": -608.6309204101562, + "logps/rejected": -671.03076171875, + "loss": 0.4369, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8883172869682312, + "rewards/margins": 1.0490520000457764, + "rewards/rejected": -1.9373695850372314, + "step": 3530 + }, + { + "epoch": 3.84, + "grad_norm": 5.368006978349376, + "learning_rate": 1.4784882863704837e-07, + "logits/chosen": 3.223203182220459, + "logits/rejected": 3.1401946544647217, + "logps/chosen": -638.2977905273438, + "logps/rejected": -630.3391723632812, + "loss": 0.4292, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0007436275482178, + "rewards/margins": 1.2180492877960205, + "rewards/rejected": -2.2187929153442383, + "step": 3540 + }, + { + "epoch": 3.85, + "grad_norm": 5.818751422041991, + "learning_rate": 1.4652942416774538e-07, + "logits/chosen": 3.483919858932495, + "logits/rejected": 3.515242338180542, + "logps/chosen": -657.88818359375, + "logps/rejected": -634.6720581054688, + "loss": 0.4236, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.857334315776825, + "rewards/margins": 1.2372469902038574, + "rewards/rejected": -2.094581127166748, + "step": 3550 + }, + { + "epoch": 3.87, + "grad_norm": 4.515749178887459, + "learning_rate": 1.452134883112178e-07, + "logits/chosen": 3.3508925437927246, + "logits/rejected": 3.3791816234588623, + "logps/chosen": -668.8388671875, + "logps/rejected": -632.3463134765625, + "loss": 0.4399, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.8480029106140137, + "rewards/margins": 1.2185410261154175, + "rewards/rejected": -2.0665438175201416, + "step": 3560 + }, + { + "epoch": 3.88, + "grad_norm": 4.067413866713914, + "learning_rate": 1.4390106518118473e-07, + "logits/chosen": 3.379002094268799, + "logits/rejected": 3.3554329872131348, + "logps/chosen": -599.6312255859375, + "logps/rejected": -620.1608276367188, + "loss": 0.439, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9548500180244446, + "rewards/margins": 1.1053146123886108, + "rewards/rejected": -2.0601646900177, + "step": 3570 + }, + { + "epoch": 3.89, + "grad_norm": 5.089219361186417, + "learning_rate": 1.4259219877360934e-07, + "logits/chosen": 3.4198906421661377, + "logits/rejected": 3.3556511402130127, + "logps/chosen": -652.1272583007812, + "logps/rejected": -668.5350952148438, + "loss": 0.4417, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8929985761642456, + "rewards/margins": 1.2331557273864746, + "rewards/rejected": -2.1261544227600098, + "step": 3580 + }, + { + "epoch": 3.9, + "grad_norm": 4.432906259186991, + "learning_rate": 1.4128693296522364e-07, + "logits/chosen": 3.2374885082244873, + "logits/rejected": 3.130519390106201, + "logps/chosen": -585.5250854492188, + "logps/rejected": -623.3826293945312, + "loss": 0.4117, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0717624425888062, + "rewards/margins": 1.0361723899841309, + "rewards/rejected": -2.1079349517822266, + "step": 3590 + }, + { + "epoch": 3.91, + "grad_norm": 3.636654655309098, + "learning_rate": 1.3998531151205805e-07, + "logits/chosen": 3.287585496902466, + "logits/rejected": 3.278582811355591, + "logps/chosen": -686.5152587890625, + "logps/rejected": -646.55908203125, + "loss": 0.4275, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.1729423999786377, + "rewards/margins": 1.310024619102478, + "rewards/rejected": -2.482966899871826, + "step": 3600 + }, + { + "epoch": 3.91, + "eval_logits/chosen": 3.422714948654175, + "eval_logits/rejected": 3.4634690284729004, + "eval_logps/chosen": -645.4227294921875, + "eval_logps/rejected": -627.8748168945312, + "eval_loss": 0.47942695021629333, + "eval_rewards/accuracies": 0.783730149269104, + "eval_rewards/chosen": -1.020858645439148, + "eval_rewards/margins": 1.0171442031860352, + "eval_rewards/rejected": -2.0380029678344727, + "eval_runtime": 202.8836, + "eval_samples_per_second": 9.858, + "eval_steps_per_second": 0.311, + "step": 3600 + }, + { + "epoch": 3.92, + "grad_norm": 4.160209491669761, + "learning_rate": 1.3868737804797454e-07, + "logits/chosen": 3.408064603805542, + "logits/rejected": 3.399200439453125, + "logps/chosen": -596.4379272460938, + "logps/rejected": -607.2189331054688, + "loss": 0.4449, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0274633169174194, + "rewards/margins": 1.101192593574524, + "rewards/rejected": -2.1286559104919434, + "step": 3610 + }, + { + "epoch": 3.93, + "grad_norm": 3.7399365896773533, + "learning_rate": 1.3739317608320317e-07, + "logits/chosen": 3.3484718799591064, + "logits/rejected": 3.4230563640594482, + "logps/chosen": -610.9447021484375, + "logps/rejected": -623.60986328125, + "loss": 0.4218, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.981761634349823, + "rewards/margins": 0.9789684414863586, + "rewards/rejected": -1.9607301950454712, + "step": 3620 + }, + { + "epoch": 3.94, + "grad_norm": 4.638442822469067, + "learning_rate": 1.3610274900288465e-07, + "logits/chosen": 3.3025145530700684, + "logits/rejected": 3.3533711433410645, + "logps/chosen": -610.4231567382812, + "logps/rejected": -615.0877685546875, + "loss": 0.4159, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0207935571670532, + "rewards/margins": 1.1630699634552002, + "rewards/rejected": -2.183863401412964, + "step": 3630 + }, + { + "epoch": 3.95, + "grad_norm": 5.474794030062789, + "learning_rate": 1.3481614006561518e-07, + "logits/chosen": 3.3728854656219482, + "logits/rejected": 3.40693736076355, + "logps/chosen": -727.2763671875, + "logps/rejected": -657.0306396484375, + "loss": 0.4278, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9185264706611633, + "rewards/margins": 1.1274440288543701, + "rewards/rejected": -2.0459704399108887, + "step": 3640 + }, + { + "epoch": 3.96, + "grad_norm": 6.250079676188916, + "learning_rate": 1.3353339240199633e-07, + "logits/chosen": 3.205505847930908, + "logits/rejected": 3.2889225482940674, + "logps/chosen": -590.033447265625, + "logps/rejected": -564.714111328125, + "loss": 0.4495, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1220542192459106, + "rewards/margins": 0.9148964881896973, + "rewards/rejected": -2.0369505882263184, + "step": 3650 + }, + { + "epoch": 3.97, + "grad_norm": 4.8571334532279655, + "learning_rate": 1.322545490131896e-07, + "logits/chosen": 3.418250322341919, + "logits/rejected": 3.4689507484436035, + "logps/chosen": -655.3072509765625, + "logps/rejected": -578.1636962890625, + "loss": 0.4413, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8048042058944702, + "rewards/margins": 1.078054428100586, + "rewards/rejected": -1.8828586339950562, + "step": 3660 + }, + { + "epoch": 3.98, + "grad_norm": 3.7546128361643647, + "learning_rate": 1.309796527694746e-07, + "logits/chosen": 3.2979984283447266, + "logits/rejected": 3.276188611984253, + "logps/chosen": -575.6425170898438, + "logps/rejected": -578.5830078125, + "loss": 0.4413, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.0049642324447632, + "rewards/margins": 1.0463021993637085, + "rewards/rejected": -2.0512664318084717, + "step": 3670 + }, + { + "epoch": 4.0, + "grad_norm": 4.927909946213724, + "learning_rate": 1.2970874640881205e-07, + "logits/chosen": 3.393319606781006, + "logits/rejected": 3.380615234375, + "logps/chosen": -633.817626953125, + "logps/rejected": -645.5632934570312, + "loss": 0.4437, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.8656120300292969, + "rewards/margins": 1.1373414993286133, + "rewards/rejected": -2.00295352935791, + "step": 3680 + }, + { + "epoch": 4.01, + "grad_norm": 3.635090670963502, + "learning_rate": 1.2844187253541081e-07, + "logits/chosen": 3.307116985321045, + "logits/rejected": 3.3995635509490967, + "logps/chosen": -668.4981689453125, + "logps/rejected": -595.920166015625, + "loss": 0.4335, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8938275575637817, + "rewards/margins": 1.0600321292877197, + "rewards/rejected": -1.9538596868515015, + "step": 3690 + }, + { + "epoch": 4.02, + "grad_norm": 4.379048669567367, + "learning_rate": 1.271790736183001e-07, + "logits/chosen": 3.482682704925537, + "logits/rejected": 3.450831174850464, + "logps/chosen": -650.8020629882812, + "logps/rejected": -644.2947998046875, + "loss": 0.4224, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.8331559896469116, + "rewards/margins": 1.0969077348709106, + "rewards/rejected": -1.9300638437271118, + "step": 3700 + }, + { + "epoch": 4.02, + "eval_logits/chosen": 3.4400007724761963, + "eval_logits/rejected": 3.481160879135132, + "eval_logps/chosen": -634.6395874023438, + "eval_logps/rejected": -614.9320068359375, + "eval_loss": 0.47838032245635986, + "eval_rewards/accuracies": 0.7936508059501648, + "eval_rewards/chosen": -0.9130271673202515, + "eval_rewards/margins": 0.9955475926399231, + "eval_rewards/rejected": -1.9085748195648193, + "eval_runtime": 202.7333, + "eval_samples_per_second": 9.865, + "eval_steps_per_second": 0.311, + "step": 3700 + }, + { + "epoch": 4.03, + "grad_norm": 4.744919850350896, + "learning_rate": 1.2592039198990567e-07, + "logits/chosen": 3.4300265312194824, + "logits/rejected": 3.4588325023651123, + "logps/chosen": -582.3802490234375, + "logps/rejected": -574.881103515625, + "loss": 0.4068, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.9523553848266602, + "rewards/margins": 1.1304038763046265, + "rewards/rejected": -2.082759380340576, + "step": 3710 + }, + { + "epoch": 4.04, + "grad_norm": 4.2781980433834175, + "learning_rate": 1.2466586984463033e-07, + "logits/chosen": 3.283041477203369, + "logits/rejected": 3.3292555809020996, + "logps/chosen": -655.4451904296875, + "logps/rejected": -611.529296875, + "loss": 0.4244, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -0.9574171304702759, + "rewards/margins": 1.0868561267852783, + "rewards/rejected": -2.0442731380462646, + "step": 3720 + }, + { + "epoch": 4.05, + "grad_norm": 4.884052600454236, + "learning_rate": 1.2341554923744007e-07, + "logits/chosen": 3.3003993034362793, + "logits/rejected": 3.383582353591919, + "logps/chosen": -637.0278930664062, + "logps/rejected": -624.9862060546875, + "loss": 0.4397, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.011597990989685, + "rewards/margins": 1.0633455514907837, + "rewards/rejected": -2.074943780899048, + "step": 3730 + }, + { + "epoch": 4.06, + "grad_norm": 3.652258541540437, + "learning_rate": 1.2216947208245395e-07, + "logits/chosen": 3.4180221557617188, + "logits/rejected": 3.314668655395508, + "logps/chosen": -626.7178955078125, + "logps/rejected": -600.8445434570312, + "loss": 0.4228, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.0129839181900024, + "rewards/margins": 1.0503250360488892, + "rewards/rejected": -2.0633087158203125, + "step": 3740 + }, + { + "epoch": 4.07, + "grad_norm": 4.5189078365069255, + "learning_rate": 1.2092768015153913e-07, + "logits/chosen": 3.302248477935791, + "logits/rejected": 3.437277317047119, + "logps/chosen": -627.9656372070312, + "logps/rejected": -564.1021728515625, + "loss": 0.4134, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.9830034375190735, + "rewards/margins": 1.0382428169250488, + "rewards/rejected": -2.0212459564208984, + "step": 3750 + }, + { + "epoch": 4.08, + "grad_norm": 4.651438848297526, + "learning_rate": 1.1969021507291018e-07, + "logits/chosen": 3.34623384475708, + "logits/rejected": 3.4822335243225098, + "logps/chosen": -670.3914794921875, + "logps/rejected": -595.1871948242188, + "loss": 0.4345, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.9710467457771301, + "rewards/margins": 1.094674825668335, + "rewards/rejected": -2.0657215118408203, + "step": 3760 + }, + { + "epoch": 4.09, + "grad_norm": 4.831404139992892, + "learning_rate": 1.1845711832973429e-07, + "logits/chosen": 3.3657729625701904, + "logits/rejected": 3.376382827758789, + "logps/chosen": -630.7979736328125, + "logps/rejected": -632.58154296875, + "loss": 0.4056, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.9402076005935669, + "rewards/margins": 1.0973269939422607, + "rewards/rejected": -2.037534713745117, + "step": 3770 + }, + { + "epoch": 4.1, + "grad_norm": 5.832474258246264, + "learning_rate": 1.1722843125874016e-07, + "logits/chosen": 3.3085269927978516, + "logits/rejected": 3.3649120330810547, + "logps/chosen": -656.54150390625, + "logps/rejected": -626.4422607421875, + "loss": 0.4198, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.949749767780304, + "rewards/margins": 1.1984117031097412, + "rewards/rejected": -2.1481614112854004, + "step": 3780 + }, + { + "epoch": 4.12, + "grad_norm": 4.985892937566476, + "learning_rate": 1.1600419504883215e-07, + "logits/chosen": 3.4230434894561768, + "logits/rejected": 3.4386258125305176, + "logps/chosen": -670.7752685546875, + "logps/rejected": -629.9710693359375, + "loss": 0.4387, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.013933777809143, + "rewards/margins": 1.1262916326522827, + "rewards/rejected": -2.140225410461426, + "step": 3790 + }, + { + "epoch": 4.13, + "grad_norm": 4.608938175733721, + "learning_rate": 1.1478445073971007e-07, + "logits/chosen": 3.133939743041992, + "logits/rejected": 3.20147967338562, + "logps/chosen": -592.5680541992188, + "logps/rejected": -570.9287109375, + "loss": 0.4101, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0050890445709229, + "rewards/margins": 1.0093024969100952, + "rewards/rejected": -2.0143914222717285, + "step": 3800 + }, + { + "epoch": 4.13, + "eval_logits/chosen": 3.422454833984375, + "eval_logits/rejected": 3.4569199085235596, + "eval_logps/chosen": -638.0772094726562, + "eval_logps/rejected": -619.7818603515625, + "eval_loss": 0.47731995582580566, + "eval_rewards/accuracies": 0.7876983880996704, + "eval_rewards/chosen": -0.9474031925201416, + "eval_rewards/margins": 1.0096713304519653, + "eval_rewards/rejected": -1.957074522972107, + "eval_runtime": 202.9426, + "eval_samples_per_second": 9.855, + "eval_steps_per_second": 0.31, + "step": 3800 + }, + { + "epoch": 4.14, + "grad_norm": 4.266967058847925, + "learning_rate": 1.1356923922049297e-07, + "logits/chosen": 3.4540011882781982, + "logits/rejected": 3.403491258621216, + "logps/chosen": -628.2205810546875, + "logps/rejected": -639.0848999023438, + "loss": 0.4002, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9759773015975952, + "rewards/margins": 1.1415761709213257, + "rewards/rejected": -2.117553234100342, + "step": 3810 + }, + { + "epoch": 4.15, + "grad_norm": 4.27011854566667, + "learning_rate": 1.1235860122834858e-07, + "logits/chosen": 3.3374085426330566, + "logits/rejected": 3.3360908031463623, + "logps/chosen": -704.3816528320312, + "logps/rejected": -671.1971435546875, + "loss": 0.4087, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0669662952423096, + "rewards/margins": 1.1149975061416626, + "rewards/rejected": -2.1819636821746826, + "step": 3820 + }, + { + "epoch": 4.16, + "grad_norm": 4.340973016665936, + "learning_rate": 1.1115257734712755e-07, + "logits/chosen": 3.3176560401916504, + "logits/rejected": 3.362886428833008, + "logps/chosen": -719.64208984375, + "logps/rejected": -633.7008056640625, + "loss": 0.4215, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.0610309839248657, + "rewards/margins": 1.3459486961364746, + "rewards/rejected": -2.4069790840148926, + "step": 3830 + }, + { + "epoch": 4.17, + "grad_norm": 3.5539401102691444, + "learning_rate": 1.0995120800600322e-07, + "logits/chosen": 3.295255661010742, + "logits/rejected": 3.3017280101776123, + "logps/chosen": -630.6492919921875, + "logps/rejected": -632.7191772460938, + "loss": 0.41, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0907437801361084, + "rewards/margins": 1.179003357887268, + "rewards/rejected": -2.269747257232666, + "step": 3840 + }, + { + "epoch": 4.18, + "grad_norm": 5.480402411765085, + "learning_rate": 1.0875453347811623e-07, + "logits/chosen": 3.3341212272644043, + "logits/rejected": 3.3940231800079346, + "logps/chosen": -640.7610473632812, + "logps/rejected": -615.4697265625, + "loss": 0.4057, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9333788752555847, + "rewards/margins": 1.2855052947998047, + "rewards/rejected": -2.218884229660034, + "step": 3850 + }, + { + "epoch": 4.19, + "grad_norm": 4.0247985991808335, + "learning_rate": 1.0756259387922417e-07, + "logits/chosen": 3.4757308959960938, + "logits/rejected": 3.3736164569854736, + "logps/chosen": -592.9967651367188, + "logps/rejected": -604.2167358398438, + "loss": 0.4186, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -0.9449540376663208, + "rewards/margins": 1.1130799055099487, + "rewards/rejected": -2.0580339431762695, + "step": 3860 + }, + { + "epoch": 4.2, + "grad_norm": 3.808674396015461, + "learning_rate": 1.0637542916635733e-07, + "logits/chosen": 3.385357618331909, + "logits/rejected": 3.3909316062927246, + "logps/chosen": -614.0596313476562, + "logps/rejected": -613.41015625, + "loss": 0.4293, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7040565609931946, + "rewards/margins": 1.097634196281433, + "rewards/rejected": -1.801690697669983, + "step": 3870 + }, + { + "epoch": 4.21, + "grad_norm": 3.7696858880428796, + "learning_rate": 1.051930791364788e-07, + "logits/chosen": 3.235701084136963, + "logits/rejected": 3.214096784591675, + "logps/chosen": -655.4987182617188, + "logps/rejected": -564.7158813476562, + "loss": 0.4096, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9135665893554688, + "rewards/margins": 1.1304066181182861, + "rewards/rejected": -2.043973445892334, + "step": 3880 + }, + { + "epoch": 4.22, + "grad_norm": 6.8837413169779955, + "learning_rate": 1.0401558342515063e-07, + "logits/chosen": 3.31986927986145, + "logits/rejected": 3.338214874267578, + "logps/chosen": -690.4530029296875, + "logps/rejected": -661.3316650390625, + "loss": 0.4307, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -0.9339920282363892, + "rewards/margins": 1.0488418340682983, + "rewards/rejected": -1.9828341007232666, + "step": 3890 + }, + { + "epoch": 4.23, + "grad_norm": 4.231560477523635, + "learning_rate": 1.028429815052047e-07, + "logits/chosen": 3.285670518875122, + "logits/rejected": 3.300931215286255, + "logps/chosen": -687.8453369140625, + "logps/rejected": -601.5693969726562, + "loss": 0.4295, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9780045747756958, + "rewards/margins": 1.2776046991348267, + "rewards/rejected": -2.2556090354919434, + "step": 3900 + }, + { + "epoch": 4.23, + "eval_logits/chosen": 3.3998217582702637, + "eval_logits/rejected": 3.428964376449585, + "eval_logps/chosen": -642.2666015625, + "eval_logps/rejected": -625.0360717773438, + "eval_loss": 0.47901660203933716, + "eval_rewards/accuracies": 0.795634925365448, + "eval_rewards/chosen": -0.9892975091934204, + "eval_rewards/margins": 1.020318865776062, + "eval_rewards/rejected": -2.0096163749694824, + "eval_runtime": 202.8405, + "eval_samples_per_second": 9.86, + "eval_steps_per_second": 0.311, + "step": 3900 + }, + { + "epoch": 4.25, + "grad_norm": 4.84330652049246, + "learning_rate": 1.0167531268542026e-07, + "logits/chosen": 3.4341633319854736, + "logits/rejected": 3.386991024017334, + "logps/chosen": -623.751708984375, + "logps/rejected": -648.4140625, + "loss": 0.4191, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9651447534561157, + "rewards/margins": 1.2494183778762817, + "rewards/rejected": -2.2145628929138184, + "step": 3910 + }, + { + "epoch": 4.26, + "grad_norm": 4.525568517684105, + "learning_rate": 1.005126161092053e-07, + "logits/chosen": 3.2907967567443848, + "logits/rejected": 3.3596577644348145, + "logps/chosen": -630.5189208984375, + "logps/rejected": -607.6754150390625, + "loss": 0.4117, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0380223989486694, + "rewards/margins": 1.147351622581482, + "rewards/rejected": -2.1853740215301514, + "step": 3920 + }, + { + "epoch": 4.27, + "grad_norm": 4.723979298330365, + "learning_rate": 9.935493075328518e-08, + "logits/chosen": 3.127185821533203, + "logits/rejected": 3.149927854537964, + "logps/chosen": -591.57177734375, + "logps/rejected": -588.5655517578125, + "loss": 0.4143, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.0499693155288696, + "rewards/margins": 1.2105190753936768, + "rewards/rejected": -2.260488510131836, + "step": 3930 + }, + { + "epoch": 4.28, + "grad_norm": 4.2884489136695185, + "learning_rate": 9.820229542639529e-08, + "logits/chosen": 3.2507805824279785, + "logits/rejected": 3.0884203910827637, + "logps/chosen": -626.3121337890625, + "logps/rejected": -596.10595703125, + "loss": 0.4207, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.01030695438385, + "rewards/margins": 1.1719428300857544, + "rewards/rejected": -2.1822495460510254, + "step": 3940 + }, + { + "epoch": 4.29, + "grad_norm": 3.9822564622794254, + "learning_rate": 9.705474876798068e-08, + "logits/chosen": 3.262510299682617, + "logits/rejected": 3.3105883598327637, + "logps/chosen": -607.6744384765625, + "logps/rejected": -613.1331787109375, + "loss": 0.4231, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.054054856300354, + "rewards/margins": 1.0771335363388062, + "rewards/rejected": -2.131188154220581, + "step": 3950 + }, + { + "epoch": 4.3, + "grad_norm": 4.49091860837814, + "learning_rate": 9.591232924690037e-08, + "logits/chosen": 3.3651633262634277, + "logits/rejected": 3.3389458656311035, + "logps/chosen": -629.3226318359375, + "logps/rejected": -607.5804443359375, + "loss": 0.4161, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6730826497077942, + "rewards/margins": 1.1576800346374512, + "rewards/rejected": -1.8307626247406006, + "step": 3960 + }, + { + "epoch": 4.31, + "grad_norm": 5.515041237259014, + "learning_rate": 9.477507516013811e-08, + "logits/chosen": 3.480274200439453, + "logits/rejected": 3.4106287956237793, + "logps/chosen": -641.4927978515625, + "logps/rejected": -647.2362060546875, + "loss": 0.4005, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.8588997721672058, + "rewards/margins": 1.1452442407608032, + "rewards/rejected": -2.0041441917419434, + "step": 3970 + }, + { + "epoch": 4.32, + "grad_norm": 5.163689090047744, + "learning_rate": 9.3643024631518e-08, + "logits/chosen": 3.230050563812256, + "logits/rejected": 3.2616755962371826, + "logps/chosen": -614.7962646484375, + "logps/rejected": -592.8779907226562, + "loss": 0.4113, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.949069619178772, + "rewards/margins": 1.0197830200195312, + "rewards/rejected": -1.9688526391983032, + "step": 3980 + }, + { + "epoch": 4.33, + "grad_norm": 4.902733937586537, + "learning_rate": 9.251621561042716e-08, + "logits/chosen": 3.240537643432617, + "logits/rejected": 3.270631790161133, + "logps/chosen": -628.7385864257812, + "logps/rejected": -608.5638427734375, + "loss": 0.4175, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.9540501832962036, + "rewards/margins": 1.1350681781768799, + "rewards/rejected": -2.089118480682373, + "step": 3990 + }, + { + "epoch": 4.34, + "grad_norm": 4.454837215674531, + "learning_rate": 9.139468587054317e-08, + "logits/chosen": 3.2771542072296143, + "logits/rejected": 3.2822394371032715, + "logps/chosen": -646.6663818359375, + "logps/rejected": -622.8614501953125, + "loss": 0.4162, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.061971664428711, + "rewards/margins": 1.0516541004180908, + "rewards/rejected": -2.1136257648468018, + "step": 4000 + }, + { + "epoch": 4.34, + "eval_logits/chosen": 3.404010772705078, + "eval_logits/rejected": 3.4341719150543213, + "eval_logps/chosen": -640.1561889648438, + "eval_logps/rejected": -623.0465087890625, + "eval_loss": 0.47693389654159546, + "eval_rewards/accuracies": 0.795634925365448, + "eval_rewards/chosen": -0.9681926369667053, + "eval_rewards/margins": 1.021527886390686, + "eval_rewards/rejected": -1.989720344543457, + "eval_runtime": 203.2457, + "eval_samples_per_second": 9.84, + "eval_steps_per_second": 0.31, + "step": 4000 + }, + { + "epoch": 4.35, + "grad_norm": 4.998365579065085, + "learning_rate": 9.027847300856769e-08, + "logits/chosen": 3.236513137817383, + "logits/rejected": 3.22514009475708, + "logps/chosen": -609.8795776367188, + "logps/rejected": -629.8914184570312, + "loss": 0.4246, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.075141429901123, + "rewards/margins": 1.0727479457855225, + "rewards/rejected": -2.1478893756866455, + "step": 4010 + }, + { + "epoch": 4.36, + "grad_norm": 5.078456843813655, + "learning_rate": 8.91676144429665e-08, + "logits/chosen": 3.372511386871338, + "logits/rejected": 3.374516725540161, + "logps/chosen": -650.2095336914062, + "logps/rejected": -650.6064453125, + "loss": 0.4155, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.0647146701812744, + "rewards/margins": 1.0481324195861816, + "rewards/rejected": -2.112846851348877, + "step": 4020 + }, + { + "epoch": 4.38, + "grad_norm": 4.3609569043300045, + "learning_rate": 8.806214741271483e-08, + "logits/chosen": 3.3630530834198, + "logits/rejected": 3.356684923171997, + "logps/chosen": -674.0843505859375, + "logps/rejected": -627.49169921875, + "loss": 0.4247, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.8967161178588867, + "rewards/margins": 1.1224383115768433, + "rewards/rejected": -2.0191545486450195, + "step": 4030 + }, + { + "epoch": 4.39, + "grad_norm": 3.7584393944277377, + "learning_rate": 8.696210897604922e-08, + "logits/chosen": 3.265223741531372, + "logits/rejected": 3.300980806350708, + "logps/chosen": -633.5123291015625, + "logps/rejected": -619.6534423828125, + "loss": 0.4159, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.03200364112854, + "rewards/margins": 1.0300449132919312, + "rewards/rejected": -2.0620484352111816, + "step": 4040 + }, + { + "epoch": 4.4, + "grad_norm": 3.265323883185431, + "learning_rate": 8.586753600922486e-08, + "logits/chosen": 3.3313965797424316, + "logits/rejected": 3.3457539081573486, + "logps/chosen": -576.4733276367188, + "logps/rejected": -584.6893310546875, + "loss": 0.4309, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.8862001299858093, + "rewards/margins": 1.171441912651062, + "rewards/rejected": -2.0576419830322266, + "step": 4050 + }, + { + "epoch": 4.41, + "grad_norm": 3.761267636922022, + "learning_rate": 8.477846520527984e-08, + "logits/chosen": 3.411687135696411, + "logits/rejected": 3.3630423545837402, + "logps/chosen": -665.51123046875, + "logps/rejected": -620.060791015625, + "loss": 0.4089, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.8109720945358276, + "rewards/margins": 1.247515082359314, + "rewards/rejected": -2.0584874153137207, + "step": 4060 + }, + { + "epoch": 4.42, + "grad_norm": 3.778177699748287, + "learning_rate": 8.3694933072805e-08, + "logits/chosen": 3.2529239654541016, + "logits/rejected": 3.324174165725708, + "logps/chosen": -619.845703125, + "logps/rejected": -602.2738037109375, + "loss": 0.4301, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.932303249835968, + "rewards/margins": 1.0312929153442383, + "rewards/rejected": -1.963595986366272, + "step": 4070 + }, + { + "epoch": 4.43, + "grad_norm": 5.015487993856013, + "learning_rate": 8.261697593471967e-08, + "logits/chosen": 3.2463810443878174, + "logits/rejected": 3.315427303314209, + "logps/chosen": -588.807373046875, + "logps/rejected": -591.37890625, + "loss": 0.4299, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9638729095458984, + "rewards/margins": 1.1048758029937744, + "rewards/rejected": -2.068748950958252, + "step": 4080 + }, + { + "epoch": 4.44, + "grad_norm": 4.276649164182125, + "learning_rate": 8.154462992705454e-08, + "logits/chosen": 3.343524932861328, + "logits/rejected": 3.326464891433716, + "logps/chosen": -631.861572265625, + "logps/rejected": -643.6549072265625, + "loss": 0.4306, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.9578490257263184, + "rewards/margins": 1.0083853006362915, + "rewards/rejected": -1.9662344455718994, + "step": 4090 + }, + { + "epoch": 4.45, + "grad_norm": 4.593391665388458, + "learning_rate": 8.047793099774014e-08, + "logits/chosen": 3.2465949058532715, + "logits/rejected": 3.3000450134277344, + "logps/chosen": -645.3646850585938, + "logps/rejected": -608.7596435546875, + "loss": 0.425, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9739478826522827, + "rewards/margins": 1.23960280418396, + "rewards/rejected": -2.213550567626953, + "step": 4100 + }, + { + "epoch": 4.45, + "eval_logits/chosen": 3.4237234592437744, + "eval_logits/rejected": 3.4579787254333496, + "eval_logps/chosen": -638.862060546875, + "eval_logps/rejected": -621.9555053710938, + "eval_loss": 0.47585567831993103, + "eval_rewards/accuracies": 0.7916666865348816, + "eval_rewards/chosen": -0.9552515745162964, + "eval_rewards/margins": 1.0235581398010254, + "eval_rewards/rejected": -1.9788098335266113, + "eval_runtime": 202.8726, + "eval_samples_per_second": 9.858, + "eval_steps_per_second": 0.311, + "step": 4100 + }, + { + "epoch": 4.46, + "grad_norm": 5.088982868472457, + "learning_rate": 7.941691490540161e-08, + "logits/chosen": 3.4403469562530518, + "logits/rejected": 3.5343425273895264, + "logps/chosen": -696.0791625976562, + "logps/rejected": -612.7171630859375, + "loss": 0.4457, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.076757550239563, + "rewards/margins": 1.2494875192642212, + "rewards/rejected": -2.326245069503784, + "step": 4110 + }, + { + "epoch": 4.47, + "grad_norm": 3.843403335802077, + "learning_rate": 7.836161721815992e-08, + "logits/chosen": 3.3702099323272705, + "logits/rejected": 3.3862807750701904, + "logps/chosen": -700.8905029296875, + "logps/rejected": -730.05126953125, + "loss": 0.4042, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.902929961681366, + "rewards/margins": 1.358778715133667, + "rewards/rejected": -2.2617084980010986, + "step": 4120 + }, + { + "epoch": 4.48, + "grad_norm": 4.310462247780051, + "learning_rate": 7.731207331243992e-08, + "logits/chosen": 3.2865593433380127, + "logits/rejected": 3.242449998855591, + "logps/chosen": -678.3230590820312, + "logps/rejected": -682.6590576171875, + "loss": 0.4111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9124001264572144, + "rewards/margins": 1.4731018543243408, + "rewards/rejected": -2.3855018615722656, + "step": 4130 + }, + { + "epoch": 4.5, + "grad_norm": 3.435121047348075, + "learning_rate": 7.626831837178413e-08, + "logits/chosen": 3.4061710834503174, + "logits/rejected": 3.3974239826202393, + "logps/chosen": -679.5182495117188, + "logps/rejected": -665.0508422851562, + "loss": 0.3845, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.8448168039321899, + "rewards/margins": 1.3219852447509766, + "rewards/rejected": -2.166801929473877, + "step": 4140 + }, + { + "epoch": 4.51, + "grad_norm": 4.65878306914775, + "learning_rate": 7.523038738567317e-08, + "logits/chosen": 3.3584542274475098, + "logits/rejected": 3.4139747619628906, + "logps/chosen": -634.37255859375, + "logps/rejected": -603.0714111328125, + "loss": 0.4122, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.0318065881729126, + "rewards/margins": 1.1386983394622803, + "rewards/rejected": -2.1705050468444824, + "step": 4150 + }, + { + "epoch": 4.52, + "grad_norm": 4.581289609864568, + "learning_rate": 7.419831514835318e-08, + "logits/chosen": 3.4580376148223877, + "logits/rejected": 3.436858654022217, + "logps/chosen": -698.56494140625, + "logps/rejected": -683.8887939453125, + "loss": 0.417, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9824358224868774, + "rewards/margins": 1.047573208808899, + "rewards/rejected": -2.0300090312957764, + "step": 4160 + }, + { + "epoch": 4.53, + "grad_norm": 4.181278989327639, + "learning_rate": 7.317213625766921e-08, + "logits/chosen": 3.445160388946533, + "logits/rejected": 3.3971214294433594, + "logps/chosen": -610.2749633789062, + "logps/rejected": -648.83984375, + "loss": 0.4002, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9260603785514832, + "rewards/margins": 1.2716095447540283, + "rewards/rejected": -2.1976699829101562, + "step": 4170 + }, + { + "epoch": 4.54, + "grad_norm": 5.517001823658448, + "learning_rate": 7.215188511390549e-08, + "logits/chosen": 3.3265254497528076, + "logits/rejected": 3.3422646522521973, + "logps/chosen": -642.5684204101562, + "logps/rejected": -604.9553833007812, + "loss": 0.4061, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0304888486862183, + "rewards/margins": 1.15310800075531, + "rewards/rejected": -2.1835970878601074, + "step": 4180 + }, + { + "epoch": 4.55, + "grad_norm": 5.084928927565557, + "learning_rate": 7.113759591863197e-08, + "logits/chosen": 3.2932231426239014, + "logits/rejected": 3.3378074169158936, + "logps/chosen": -646.2611694335938, + "logps/rejected": -613.41650390625, + "loss": 0.415, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8827333450317383, + "rewards/margins": 1.1417460441589355, + "rewards/rejected": -2.024479389190674, + "step": 4190 + }, + { + "epoch": 4.56, + "grad_norm": 6.4229927408586125, + "learning_rate": 7.012930267355818e-08, + "logits/chosen": 3.3931682109832764, + "logits/rejected": 3.4705066680908203, + "logps/chosen": -614.5667724609375, + "logps/rejected": -627.6751708984375, + "loss": 0.4155, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.155367136001587, + "rewards/margins": 1.1422535181045532, + "rewards/rejected": -2.2976207733154297, + "step": 4200 + }, + { + "epoch": 4.56, + "eval_logits/chosen": 3.398123025894165, + "eval_logits/rejected": 3.427708148956299, + "eval_logps/chosen": -645.1696166992188, + "eval_logps/rejected": -629.8077392578125, + "eval_loss": 0.47777795791625977, + "eval_rewards/accuracies": 0.7916666865348816, + "eval_rewards/chosen": -1.0183273553848267, + "eval_rewards/margins": 1.0390048027038574, + "eval_rewards/rejected": -2.0573320388793945, + "eval_runtime": 203.136, + "eval_samples_per_second": 9.846, + "eval_steps_per_second": 0.31, + "step": 4200 + }, + { + "epoch": 4.57, + "grad_norm": 4.983938050066726, + "learning_rate": 6.912703917939331e-08, + "logits/chosen": 3.275489330291748, + "logits/rejected": 3.266265869140625, + "logps/chosen": -667.8606567382812, + "logps/rejected": -658.889404296875, + "loss": 0.4066, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.0209743976593018, + "rewards/margins": 1.134408712387085, + "rewards/rejected": -2.1553831100463867, + "step": 4210 + }, + { + "epoch": 4.58, + "grad_norm": 5.492779356862019, + "learning_rate": 6.81308390347127e-08, + "logits/chosen": 3.339324951171875, + "logits/rejected": 3.317880630493164, + "logps/chosen": -618.5656127929688, + "logps/rejected": -620.5933837890625, + "loss": 0.407, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.9727081060409546, + "rewards/margins": 1.3094838857650757, + "rewards/rejected": -2.2821919918060303, + "step": 4220 + }, + { + "epoch": 4.59, + "grad_norm": 4.854526188053133, + "learning_rate": 6.714073563483221e-08, + "logits/chosen": 3.2255160808563232, + "logits/rejected": 3.220808506011963, + "logps/chosen": -673.5921630859375, + "logps/rejected": -639.7328491210938, + "loss": 0.4328, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1177629232406616, + "rewards/margins": 1.1165602207183838, + "rewards/rejected": -2.234323024749756, + "step": 4230 + }, + { + "epoch": 4.6, + "grad_norm": 4.141750829062684, + "learning_rate": 6.615676217068833e-08, + "logits/chosen": 3.2563107013702393, + "logits/rejected": 3.2750449180603027, + "logps/chosen": -667.8575439453125, + "logps/rejected": -636.2128295898438, + "loss": 0.4224, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.1001965999603271, + "rewards/margins": 1.0079988241195679, + "rewards/rejected": -2.1081955432891846, + "step": 4240 + }, + { + "epoch": 4.61, + "grad_norm": 4.088192065267914, + "learning_rate": 6.517895162772538e-08, + "logits/chosen": 3.3319525718688965, + "logits/rejected": 3.2376513481140137, + "logps/chosen": -616.2135009765625, + "logps/rejected": -643.5526733398438, + "loss": 0.4049, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.1921319961547852, + "rewards/margins": 1.1689560413360596, + "rewards/rejected": -2.3610877990722656, + "step": 4250 + }, + { + "epoch": 4.63, + "grad_norm": 3.7531703944598815, + "learning_rate": 6.420733678478995e-08, + "logits/chosen": 3.4484024047851562, + "logits/rejected": 3.348048448562622, + "logps/chosen": -638.4662475585938, + "logps/rejected": -627.6661376953125, + "loss": 0.4192, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.9249661564826965, + "rewards/margins": 1.3605735301971436, + "rewards/rejected": -2.2855398654937744, + "step": 4260 + }, + { + "epoch": 4.64, + "grad_norm": 6.846389928383541, + "learning_rate": 6.324195021303225e-08, + "logits/chosen": 3.4238860607147217, + "logits/rejected": 3.410504102706909, + "logps/chosen": -598.2091064453125, + "logps/rejected": -618.4214477539062, + "loss": 0.4063, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9846474528312683, + "rewards/margins": 1.142899513244629, + "rewards/rejected": -2.127547025680542, + "step": 4270 + }, + { + "epoch": 4.65, + "grad_norm": 4.636272897331951, + "learning_rate": 6.228282427481394e-08, + "logits/chosen": 3.274707317352295, + "logits/rejected": 3.250917434692383, + "logps/chosen": -642.2122192382812, + "logps/rejected": -667.5880737304688, + "loss": 0.4132, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9661245346069336, + "rewards/margins": 1.148542881011963, + "rewards/rejected": -2.1146671772003174, + "step": 4280 + }, + { + "epoch": 4.66, + "grad_norm": 5.207389231138979, + "learning_rate": 6.132999112262339e-08, + "logits/chosen": 3.3857948780059814, + "logits/rejected": 3.430241107940674, + "logps/chosen": -606.1727294921875, + "logps/rejected": -611.9073486328125, + "loss": 0.3993, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9053149223327637, + "rewards/margins": 1.331397294998169, + "rewards/rejected": -2.2367124557495117, + "step": 4290 + }, + { + "epoch": 4.67, + "grad_norm": 4.231587027915136, + "learning_rate": 6.038348269799764e-08, + "logits/chosen": 3.3768208026885986, + "logits/rejected": 3.30268931388855, + "logps/chosen": -636.0169067382812, + "logps/rejected": -650.2225341796875, + "loss": 0.4311, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9160051345825195, + "rewards/margins": 1.140350103378296, + "rewards/rejected": -2.0563549995422363, + "step": 4300 + }, + { + "epoch": 4.67, + "eval_logits/chosen": 3.410733938217163, + "eval_logits/rejected": 3.4412810802459717, + "eval_logps/chosen": -640.4597778320312, + "eval_logps/rejected": -624.7266235351562, + "eval_loss": 0.47653162479400635, + "eval_rewards/accuracies": 0.7896825671195984, + "eval_rewards/chosen": -0.9712289571762085, + "eval_rewards/margins": 1.0352927446365356, + "eval_rewards/rejected": -2.006521701812744, + "eval_runtime": 203.1315, + "eval_samples_per_second": 9.846, + "eval_steps_per_second": 0.31, + "step": 4300 + }, + { + "epoch": 4.68, + "grad_norm": 4.973306644416572, + "learning_rate": 5.944333073045205e-08, + "logits/chosen": 3.374052047729492, + "logits/rejected": 3.4084370136260986, + "logps/chosen": -642.2147216796875, + "logps/rejected": -634.2425537109375, + "loss": 0.4347, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.986552357673645, + "rewards/margins": 1.129396677017212, + "rewards/rejected": -2.1159489154815674, + "step": 4310 + }, + { + "epoch": 4.69, + "grad_norm": 4.926003781899123, + "learning_rate": 5.85095667364163e-08, + "logits/chosen": 3.329627513885498, + "logits/rejected": 3.2988522052764893, + "logps/chosen": -620.7818603515625, + "logps/rejected": -593.2532958984375, + "loss": 0.3997, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.8992365002632141, + "rewards/margins": 1.41867995262146, + "rewards/rejected": -2.3179163932800293, + "step": 4320 + }, + { + "epoch": 4.7, + "grad_norm": 4.949352813141063, + "learning_rate": 5.758222201817786e-08, + "logits/chosen": 3.2874362468719482, + "logits/rejected": 3.2021355628967285, + "logps/chosen": -618.588623046875, + "logps/rejected": -636.9315185546875, + "loss": 0.4267, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9578142166137695, + "rewards/margins": 0.945949912071228, + "rewards/rejected": -1.9037641286849976, + "step": 4330 + }, + { + "epoch": 4.71, + "grad_norm": 4.199638045038265, + "learning_rate": 5.666132766283291e-08, + "logits/chosen": 3.2147376537323, + "logits/rejected": 3.19602632522583, + "logps/chosen": -635.8148803710938, + "logps/rejected": -647.5721435546875, + "loss": 0.4278, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9341497421264648, + "rewards/margins": 1.1103743314743042, + "rewards/rejected": -2.0445237159729004, + "step": 4340 + }, + { + "epoch": 4.72, + "grad_norm": 4.845012036468722, + "learning_rate": 5.574691454124397e-08, + "logits/chosen": 3.2677807807922363, + "logits/rejected": 3.245954990386963, + "logps/chosen": -653.6508178710938, + "logps/rejected": -608.2535400390625, + "loss": 0.4263, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.9203804135322571, + "rewards/margins": 1.005350112915039, + "rewards/rejected": -1.9257304668426514, + "step": 4350 + }, + { + "epoch": 4.73, + "grad_norm": 4.434984640560382, + "learning_rate": 5.48390133070053e-08, + "logits/chosen": 3.148699998855591, + "logits/rejected": 3.2259299755096436, + "logps/chosen": -613.2357177734375, + "logps/rejected": -614.7881469726562, + "loss": 0.4233, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9823920130729675, + "rewards/margins": 1.1206071376800537, + "rewards/rejected": -2.102999210357666, + "step": 4360 + }, + { + "epoch": 4.74, + "grad_norm": 5.431057249316014, + "learning_rate": 5.393765439541481e-08, + "logits/chosen": 3.2440972328186035, + "logits/rejected": 3.2051806449890137, + "logps/chosen": -566.9620971679688, + "logps/rejected": -601.295654296875, + "loss": 0.4347, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.1483008861541748, + "rewards/margins": 1.140537977218628, + "rewards/rejected": -2.2888388633728027, + "step": 4370 + }, + { + "epoch": 4.76, + "grad_norm": 4.957593775169598, + "learning_rate": 5.304286802245442e-08, + "logits/chosen": 3.3060543537139893, + "logits/rejected": 3.3490371704101562, + "logps/chosen": -645.0230712890625, + "logps/rejected": -681.2359008789062, + "loss": 0.4001, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.8980600237846375, + "rewards/margins": 1.297424077987671, + "rewards/rejected": -2.195484161376953, + "step": 4380 + }, + { + "epoch": 4.77, + "grad_norm": 4.846223069154338, + "learning_rate": 5.2154684183776693e-08, + "logits/chosen": 3.4804294109344482, + "logits/rejected": 3.414719820022583, + "logps/chosen": -627.5233154296875, + "logps/rejected": -658.6403198242188, + "loss": 0.4158, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.892591118812561, + "rewards/margins": 1.0979639291763306, + "rewards/rejected": -1.9905548095703125, + "step": 4390 + }, + { + "epoch": 4.78, + "grad_norm": 3.9195408526050732, + "learning_rate": 5.12731326536994e-08, + "logits/chosen": 3.463097095489502, + "logits/rejected": 3.391611099243164, + "logps/chosen": -655.9578857421875, + "logps/rejected": -643.9403686523438, + "loss": 0.41, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.7878724336624146, + "rewards/margins": 1.3385916948318481, + "rewards/rejected": -2.126464366912842, + "step": 4400 + }, + { + "epoch": 4.78, + "eval_logits/chosen": 3.4080650806427, + "eval_logits/rejected": 3.438655376434326, + "eval_logps/chosen": -640.9733276367188, + "eval_logps/rejected": -625.0818481445312, + "eval_loss": 0.4768357574939728, + "eval_rewards/accuracies": 0.7916666865348816, + "eval_rewards/chosen": -0.9763648509979248, + "eval_rewards/margins": 1.0337090492248535, + "eval_rewards/rejected": -2.010073661804199, + "eval_runtime": 203.0745, + "eval_samples_per_second": 9.849, + "eval_steps_per_second": 0.31, + "step": 4400 + }, + { + "epoch": 4.79, + "grad_norm": 5.4524290398157955, + "learning_rate": 5.0398242984207475e-08, + "logits/chosen": 3.3209099769592285, + "logits/rejected": 3.2791080474853516, + "logps/chosen": -586.2724609375, + "logps/rejected": -618.526123046875, + "loss": 0.4101, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.0599873065948486, + "rewards/margins": 1.16214919090271, + "rewards/rejected": -2.2221364974975586, + "step": 4410 + }, + { + "epoch": 4.8, + "grad_norm": 4.6020179571514275, + "learning_rate": 4.953004450396239e-08, + "logits/chosen": 3.284672260284424, + "logits/rejected": 3.2334282398223877, + "logps/chosen": -563.5467529296875, + "logps/rejected": -631.9415283203125, + "loss": 0.4228, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0434836149215698, + "rewards/margins": 1.07113778591156, + "rewards/rejected": -2.11462140083313, + "step": 4420 + }, + { + "epoch": 4.81, + "grad_norm": 4.1241101144448145, + "learning_rate": 4.866856631731889e-08, + "logits/chosen": 3.443408250808716, + "logits/rejected": 3.4631810188293457, + "logps/chosen": -676.0413818359375, + "logps/rejected": -657.0355224609375, + "loss": 0.4116, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.0986618995666504, + "rewards/margins": 1.040489912033081, + "rewards/rejected": -2.1391518115997314, + "step": 4430 + }, + { + "epoch": 4.82, + "grad_norm": 3.795819911330352, + "learning_rate": 4.781383730334918e-08, + "logits/chosen": 3.4550979137420654, + "logits/rejected": 3.521261692047119, + "logps/chosen": -607.6665649414062, + "logps/rejected": -585.4976806640625, + "loss": 0.3977, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.978421688079834, + "rewards/margins": 1.056796669960022, + "rewards/rejected": -2.0352184772491455, + "step": 4440 + }, + { + "epoch": 4.83, + "grad_norm": 5.2898365415343624, + "learning_rate": 4.696588611487517e-08, + "logits/chosen": 3.3449108600616455, + "logits/rejected": 3.3410325050354004, + "logps/chosen": -651.4517822265625, + "logps/rejected": -653.3697509765625, + "loss": 0.4288, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9503029584884644, + "rewards/margins": 1.1928989887237549, + "rewards/rejected": -2.143202066421509, + "step": 4450 + }, + { + "epoch": 4.84, + "grad_norm": 4.994699717782159, + "learning_rate": 4.6124741177507835e-08, + "logits/chosen": 3.423499584197998, + "logits/rejected": 3.3942131996154785, + "logps/chosen": -669.455078125, + "logps/rejected": -642.244140625, + "loss": 0.3964, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.8707054257392883, + "rewards/margins": 1.2696247100830078, + "rewards/rejected": -2.1403300762176514, + "step": 4460 + }, + { + "epoch": 4.85, + "grad_norm": 4.1727119144294615, + "learning_rate": 4.5290430688693983e-08, + "logits/chosen": 3.2331364154815674, + "logits/rejected": 3.3367247581481934, + "logps/chosen": -672.2711181640625, + "logps/rejected": -612.0027465820312, + "loss": 0.4116, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.1666219234466553, + "rewards/margins": 1.0333144664764404, + "rewards/rejected": -2.1999363899230957, + "step": 4470 + }, + { + "epoch": 4.86, + "grad_norm": 4.3586986594794945, + "learning_rate": 4.4462982616771534e-08, + "logits/chosen": 3.2222061157226562, + "logits/rejected": 3.2643065452575684, + "logps/chosen": -666.2291259765625, + "logps/rejected": -637.8343505859375, + "loss": 0.4348, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.0683128833770752, + "rewards/margins": 1.2867891788482666, + "rewards/rejected": -2.355102062225342, + "step": 4480 + }, + { + "epoch": 4.88, + "grad_norm": 6.229497687930593, + "learning_rate": 4.364242470003154e-08, + "logits/chosen": 3.3906490802764893, + "logits/rejected": 3.3158926963806152, + "logps/chosen": -636.111328125, + "logps/rejected": -652.4210815429688, + "loss": 0.4149, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.8192951083183289, + "rewards/margins": 1.3163602352142334, + "rewards/rejected": -2.135655164718628, + "step": 4490 + }, + { + "epoch": 4.89, + "grad_norm": 4.738889591633732, + "learning_rate": 4.2828784445788666e-08, + "logits/chosen": 3.2304110527038574, + "logits/rejected": 3.284945011138916, + "logps/chosen": -643.64501953125, + "logps/rejected": -607.7906494140625, + "loss": 0.4127, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.9771002531051636, + "rewards/margins": 1.2356865406036377, + "rewards/rejected": -2.212786912918091, + "step": 4500 + }, + { + "epoch": 4.89, + "eval_logits/chosen": 3.4159555435180664, + "eval_logits/rejected": 3.4453370571136475, + "eval_logps/chosen": -639.3276977539062, + "eval_logps/rejected": -624.016845703125, + "eval_loss": 0.47491776943206787, + "eval_rewards/accuracies": 0.7936508059501648, + "eval_rewards/chosen": -0.9599084854125977, + "eval_rewards/margins": 1.0395152568817139, + "eval_rewards/rejected": -1.9994237422943115, + "eval_runtime": 203.3389, + "eval_samples_per_second": 9.836, + "eval_steps_per_second": 0.31, + "step": 4500 + }, + { + "epoch": 4.9, + "grad_norm": 4.37751293010457, + "learning_rate": 4.2022089129458566e-08, + "logits/chosen": 3.273806095123291, + "logits/rejected": 3.283414363861084, + "logps/chosen": -653.6941528320312, + "logps/rejected": -600.9439086914062, + "loss": 0.4268, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.0693585872650146, + "rewards/margins": 1.042353868484497, + "rewards/rejected": -2.111712694168091, + "step": 4510 + }, + { + "epoch": 4.91, + "grad_norm": 3.7680861083504684, + "learning_rate": 4.122236579364402e-08, + "logits/chosen": 3.2637264728546143, + "logits/rejected": 3.3021061420440674, + "logps/chosen": -674.3392944335938, + "logps/rejected": -635.2799072265625, + "loss": 0.4382, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9136451482772827, + "rewards/margins": 1.1711009740829468, + "rewards/rejected": -2.0847458839416504, + "step": 4520 + }, + { + "epoch": 4.92, + "grad_norm": 4.402537573501737, + "learning_rate": 4.042964124722834e-08, + "logits/chosen": 3.159298896789551, + "logits/rejected": 3.158567190170288, + "logps/chosen": -611.3229370117188, + "logps/rejected": -615.6229248046875, + "loss": 0.4264, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0395512580871582, + "rewards/margins": 1.0462052822113037, + "rewards/rejected": -2.085756778717041, + "step": 4530 + }, + { + "epoch": 4.93, + "grad_norm": 4.923340820706569, + "learning_rate": 3.9643942064476216e-08, + "logits/chosen": 3.3168938159942627, + "logits/rejected": 3.452329158782959, + "logps/chosen": -710.4453125, + "logps/rejected": -664.5238647460938, + "loss": 0.4232, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.89970862865448, + "rewards/margins": 1.0521256923675537, + "rewards/rejected": -1.9518343210220337, + "step": 4540 + }, + { + "epoch": 4.94, + "grad_norm": 3.9167663179010104, + "learning_rate": 3.8865294584143506e-08, + "logits/chosen": 3.326674222946167, + "logits/rejected": 3.3209240436553955, + "logps/chosen": -633.3974609375, + "logps/rejected": -638.9397583007812, + "loss": 0.4162, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.8874519467353821, + "rewards/margins": 1.1475107669830322, + "rewards/rejected": -2.0349628925323486, + "step": 4550 + }, + { + "epoch": 4.95, + "grad_norm": 4.708892942686557, + "learning_rate": 3.809372490859381e-08, + "logits/chosen": 3.2978458404541016, + "logits/rejected": 3.2902169227600098, + "logps/chosen": -591.3196411132812, + "logps/rejected": -613.2948608398438, + "loss": 0.4383, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9776461720466614, + "rewards/margins": 0.9725853800773621, + "rewards/rejected": -1.9502315521240234, + "step": 4560 + }, + { + "epoch": 4.96, + "grad_norm": 4.1717624088625875, + "learning_rate": 3.732925890292377e-08, + "logits/chosen": 3.1886661052703857, + "logits/rejected": 3.2254981994628906, + "logps/chosen": -619.9031372070312, + "logps/rejected": -610.3145751953125, + "loss": 0.4331, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1668506860733032, + "rewards/margins": 1.0246211290359497, + "rewards/rejected": -2.191471576690674, + "step": 4570 + }, + { + "epoch": 4.97, + "grad_norm": 4.518807027117185, + "learning_rate": 3.657192219409566e-08, + "logits/chosen": 3.3411731719970703, + "logits/rejected": 3.373260498046875, + "logps/chosen": -656.5994262695312, + "logps/rejected": -639.821044921875, + "loss": 0.4289, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.0331714153289795, + "rewards/margins": 0.9044809341430664, + "rewards/rejected": -1.937652349472046, + "step": 4580 + }, + { + "epoch": 4.98, + "grad_norm": 4.606342390009915, + "learning_rate": 3.582174017007858e-08, + "logits/chosen": 3.55220103263855, + "logits/rejected": 3.4020888805389404, + "logps/chosen": -676.1447143554688, + "logps/rejected": -671.7203369140625, + "loss": 0.4335, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.0054256916046143, + "rewards/margins": 1.053364634513855, + "rewards/rejected": -2.058790683746338, + "step": 4590 + }, + { + "epoch": 4.99, + "grad_norm": 4.237458177176042, + "learning_rate": 3.507873797899735e-08, + "logits/chosen": 3.2579739093780518, + "logits/rejected": 3.1894021034240723, + "logps/chosen": -560.0206909179688, + "logps/rejected": -589.6734619140625, + "loss": 0.453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8938484191894531, + "rewards/margins": 1.065301537513733, + "rewards/rejected": -1.959149956703186, + "step": 4600 + }, + { + "epoch": 4.99, + "eval_logits/chosen": 3.414238214492798, + "eval_logits/rejected": 3.444387674331665, + "eval_logps/chosen": -635.6461791992188, + "eval_logps/rejected": -619.3519287109375, + "eval_loss": 0.4748367667198181, + "eval_rewards/accuracies": 0.7916666865348816, + "eval_rewards/chosen": -0.9230929613113403, + "eval_rewards/margins": 1.029681921005249, + "eval_rewards/rejected": -1.9527748823165894, + "eval_runtime": 202.7994, + "eval_samples_per_second": 9.862, + "eval_steps_per_second": 0.311, + "step": 4600 + }, + { + "epoch": 5.01, + "grad_norm": 4.820324892740955, + "learning_rate": 3.434294052828945e-08, + "logits/chosen": 3.439835786819458, + "logits/rejected": 3.350839138031006, + "logps/chosen": -619.0117797851562, + "logps/rejected": -672.6756591796875, + "loss": 0.4199, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -0.869764506816864, + "rewards/margins": 1.084979772567749, + "rewards/rejected": -1.9547443389892578, + "step": 4610 + }, + { + "epoch": 5.02, + "grad_norm": 3.9650106546389403, + "learning_rate": 3.361437248386983e-08, + "logits/chosen": 3.4324920177459717, + "logits/rejected": 3.456651210784912, + "logps/chosen": -668.35888671875, + "logps/rejected": -620.5419921875, + "loss": 0.3776, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.955123245716095, + "rewards/margins": 1.0994551181793213, + "rewards/rejected": -2.0545783042907715, + "step": 4620 + }, + { + "epoch": 5.03, + "grad_norm": 4.70688935810224, + "learning_rate": 3.28930582693045e-08, + "logits/chosen": 3.2283871173858643, + "logits/rejected": 3.264172077178955, + "logps/chosen": -729.6647338867188, + "logps/rejected": -721.0824584960938, + "loss": 0.3998, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.890113353729248, + "rewards/margins": 1.3613433837890625, + "rewards/rejected": -2.2514567375183105, + "step": 4630 + }, + { + "epoch": 5.04, + "grad_norm": 5.0103647290306945, + "learning_rate": 3.217902206499134e-08, + "logits/chosen": 3.302089214324951, + "logits/rejected": 3.3854782581329346, + "logps/chosen": -672.3339233398438, + "logps/rejected": -631.8421630859375, + "loss": 0.4473, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.8637478947639465, + "rewards/margins": 1.0654820203781128, + "rewards/rejected": -1.929229974746704, + "step": 4640 + }, + { + "epoch": 5.05, + "grad_norm": 5.581096231339241, + "learning_rate": 3.1472287807349865e-08, + "logits/chosen": 3.2928035259246826, + "logits/rejected": 3.206064224243164, + "logps/chosen": -633.5398559570312, + "logps/rejected": -630.5841064453125, + "loss": 0.4163, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0908564329147339, + "rewards/margins": 1.0193157196044922, + "rewards/rejected": -2.1101725101470947, + "step": 4650 + }, + { + "epoch": 5.06, + "grad_norm": 3.74186827957767, + "learning_rate": 3.077287918801841e-08, + "logits/chosen": 3.3158020973205566, + "logits/rejected": 3.3574090003967285, + "logps/chosen": -671.4949340820312, + "logps/rejected": -608.8038330078125, + "loss": 0.3968, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.8246719241142273, + "rewards/margins": 1.2937284708023071, + "rewards/rejected": -2.1184000968933105, + "step": 4660 + }, + { + "epoch": 5.07, + "grad_norm": 4.4512392019613545, + "learning_rate": 3.0080819653060366e-08, + "logits/chosen": 3.2338714599609375, + "logits/rejected": 3.3237907886505127, + "logps/chosen": -578.6688232421875, + "logps/rejected": -599.2220458984375, + "loss": 0.4339, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.9173551797866821, + "rewards/margins": 1.14029860496521, + "rewards/rejected": -2.0576539039611816, + "step": 4670 + }, + { + "epoch": 5.08, + "grad_norm": 4.469588696422973, + "learning_rate": 2.9396132402177947e-08, + "logits/chosen": 3.3419361114501953, + "logits/rejected": 3.3482131958007812, + "logps/chosen": -604.2589721679688, + "logps/rejected": -582.9603271484375, + "loss": 0.3904, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9833993911743164, + "rewards/margins": 1.125213861465454, + "rewards/rejected": -2.1086132526397705, + "step": 4680 + }, + { + "epoch": 5.09, + "grad_norm": 4.661317641177597, + "learning_rate": 2.871884038793443e-08, + "logits/chosen": 3.3045921325683594, + "logits/rejected": 3.301054000854492, + "logps/chosen": -613.4208374023438, + "logps/rejected": -600.6182250976562, + "loss": 0.4014, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0003347396850586, + "rewards/margins": 1.080404281616211, + "rewards/rejected": -2.0807390213012695, + "step": 4690 + }, + { + "epoch": 5.1, + "grad_norm": 4.278752725947728, + "learning_rate": 2.804896631498488e-08, + "logits/chosen": 3.3149044513702393, + "logits/rejected": 3.3393523693084717, + "logps/chosen": -632.0557861328125, + "logps/rejected": -626.2003173828125, + "loss": 0.4035, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.073968529701233, + "rewards/margins": 1.1869663000106812, + "rewards/rejected": -2.260934829711914, + "step": 4700 + }, + { + "epoch": 5.1, + "eval_logits/chosen": 3.401919364929199, + "eval_logits/rejected": 3.429266929626465, + "eval_logps/chosen": -638.950439453125, + "eval_logps/rejected": -623.7211303710938, + "eval_loss": 0.47538915276527405, + "eval_rewards/accuracies": 0.7896825671195984, + "eval_rewards/chosen": -0.9561359286308289, + "eval_rewards/margins": 1.0403298139572144, + "eval_rewards/rejected": -1.9964655637741089, + "eval_runtime": 203.0268, + "eval_samples_per_second": 9.851, + "eval_steps_per_second": 0.31, + "step": 4700 + }, + { + "epoch": 5.11, + "grad_norm": 4.581330110290142, + "learning_rate": 2.738653263931495e-08, + "logits/chosen": 3.0435214042663574, + "logits/rejected": 3.1650655269622803, + "logps/chosen": -640.2958374023438, + "logps/rejected": -609.2405395507812, + "loss": 0.4067, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.0501011610031128, + "rewards/margins": 1.3587573766708374, + "rewards/rejected": -2.4088587760925293, + "step": 4710 + }, + { + "epoch": 5.12, + "grad_norm": 4.244257969835405, + "learning_rate": 2.6731561567488235e-08, + "logits/chosen": 3.550856351852417, + "logits/rejected": 3.5885555744171143, + "logps/chosen": -620.6921997070312, + "logps/rejected": -627.4866333007812, + "loss": 0.4089, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0588308572769165, + "rewards/margins": 1.1712490320205688, + "rewards/rejected": -2.2300798892974854, + "step": 4720 + }, + { + "epoch": 5.14, + "grad_norm": 4.661841778899971, + "learning_rate": 2.6084075055901463e-08, + "logits/chosen": 3.4233341217041016, + "logits/rejected": 3.5295162200927734, + "logps/chosen": -715.7772216796875, + "logps/rejected": -646.1692504882812, + "loss": 0.4086, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7983964681625366, + "rewards/margins": 1.1571332216262817, + "rewards/rejected": -1.9555295705795288, + "step": 4730 + }, + { + "epoch": 5.15, + "grad_norm": 4.6671555450162625, + "learning_rate": 2.5444094810048888e-08, + "logits/chosen": 3.316159725189209, + "logits/rejected": 3.356541872024536, + "logps/chosen": -733.1777954101562, + "logps/rejected": -655.0715942382812, + "loss": 0.4133, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.9959171414375305, + "rewards/margins": 1.1742465496063232, + "rewards/rejected": -2.170163869857788, + "step": 4740 + }, + { + "epoch": 5.16, + "grad_norm": 3.9437881445314713, + "learning_rate": 2.4811642283794537e-08, + "logits/chosen": 3.3583106994628906, + "logits/rejected": 3.3618130683898926, + "logps/chosen": -622.913330078125, + "logps/rejected": -617.5716552734375, + "loss": 0.3909, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.233416199684143, + "rewards/margins": 1.1455744504928589, + "rewards/rejected": -2.378990650177002, + "step": 4750 + }, + { + "epoch": 5.17, + "grad_norm": 4.852586309277658, + "learning_rate": 2.4186738678652786e-08, + "logits/chosen": 3.3076260089874268, + "logits/rejected": 3.292635440826416, + "logps/chosen": -640.589111328125, + "logps/rejected": -656.1749877929688, + "loss": 0.4355, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9329697489738464, + "rewards/margins": 1.0732852220535278, + "rewards/rejected": -2.0062549114227295, + "step": 4760 + }, + { + "epoch": 5.18, + "grad_norm": 4.243111776045957, + "learning_rate": 2.356940494307799e-08, + "logits/chosen": 3.3132290840148926, + "logits/rejected": 3.3167660236358643, + "logps/chosen": -589.9036254882812, + "logps/rejected": -592.9352416992188, + "loss": 0.3896, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.916020393371582, + "rewards/margins": 1.243582844734192, + "rewards/rejected": -2.1596033573150635, + "step": 4770 + }, + { + "epoch": 5.19, + "grad_norm": 4.446088956070295, + "learning_rate": 2.295966177176198e-08, + "logits/chosen": 3.243967056274414, + "logits/rejected": 3.2699360847473145, + "logps/chosen": -604.6546630859375, + "logps/rejected": -590.2599487304688, + "loss": 0.4304, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.808563232421875, + "rewards/margins": 1.1195275783538818, + "rewards/rejected": -1.928091049194336, + "step": 4780 + }, + { + "epoch": 5.2, + "grad_norm": 4.64754800625305, + "learning_rate": 2.2357529604940445e-08, + "logits/chosen": 3.391747236251831, + "logits/rejected": 3.3919975757598877, + "logps/chosen": -651.8663940429688, + "logps/rejected": -638.8233642578125, + "loss": 0.4051, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.012085199356079, + "rewards/margins": 1.2385754585266113, + "rewards/rejected": -2.2506606578826904, + "step": 4790 + }, + { + "epoch": 5.21, + "grad_norm": 5.137836752532852, + "learning_rate": 2.1763028627707596e-08, + "logits/chosen": 3.2514262199401855, + "logits/rejected": 3.2740256786346436, + "logps/chosen": -732.9015502929688, + "logps/rejected": -692.7354125976562, + "loss": 0.4225, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.9560949206352234, + "rewards/margins": 1.128583312034607, + "rewards/rejected": -2.0846781730651855, + "step": 4800 + }, + { + "epoch": 5.21, + "eval_logits/chosen": 3.4077415466308594, + "eval_logits/rejected": 3.4359259605407715, + "eval_logps/chosen": -638.046142578125, + "eval_logps/rejected": -622.6226196289062, + "eval_loss": 0.4752858281135559, + "eval_rewards/accuracies": 0.7876983880996704, + "eval_rewards/chosen": -0.947092592716217, + "eval_rewards/margins": 1.0383890867233276, + "eval_rewards/rejected": -1.9854816198349, + "eval_runtime": 203.1953, + "eval_samples_per_second": 9.843, + "eval_steps_per_second": 0.31, + "step": 4800 + }, + { + "epoch": 5.22, + "grad_norm": 3.816159955183857, + "learning_rate": 2.1176178769339635e-08, + "logits/chosen": 3.4070944786071777, + "logits/rejected": 3.5404582023620605, + "logps/chosen": -644.9030151367188, + "logps/rejected": -614.4904174804688, + "loss": 0.408, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.9099332690238953, + "rewards/margins": 1.2075811624526978, + "rewards/rejected": -2.1175143718719482, + "step": 4810 + }, + { + "epoch": 5.23, + "grad_norm": 3.9284751768775665, + "learning_rate": 2.059699970262671e-08, + "logits/chosen": 3.6015961170196533, + "logits/rejected": 3.378958225250244, + "logps/chosen": -631.582763671875, + "logps/rejected": -653.2015991210938, + "loss": 0.4156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8876656293869019, + "rewards/margins": 1.1182324886322021, + "rewards/rejected": -2.0058982372283936, + "step": 4820 + }, + { + "epoch": 5.24, + "grad_norm": 5.252330936247836, + "learning_rate": 2.0025510843213132e-08, + "logits/chosen": 3.382859706878662, + "logits/rejected": 3.3588058948516846, + "logps/chosen": -595.603515625, + "logps/rejected": -621.556640625, + "loss": 0.4164, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.1256474256515503, + "rewards/margins": 0.9402295351028442, + "rewards/rejected": -2.0658769607543945, + "step": 4830 + }, + { + "epoch": 5.26, + "grad_norm": 4.475785672412417, + "learning_rate": 1.946173134894691e-08, + "logits/chosen": 3.3111343383789062, + "logits/rejected": 3.285496473312378, + "logps/chosen": -645.1478881835938, + "logps/rejected": -625.9896240234375, + "loss": 0.4121, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.9211744070053101, + "rewards/margins": 1.1798975467681885, + "rewards/rejected": -2.101071834564209, + "step": 4840 + }, + { + "epoch": 5.27, + "grad_norm": 4.0620936735328135, + "learning_rate": 1.8905680119237292e-08, + "logits/chosen": 3.2482478618621826, + "logits/rejected": 3.231915235519409, + "logps/chosen": -581.0772705078125, + "logps/rejected": -619.1720581054688, + "loss": 0.4024, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -0.9992374181747437, + "rewards/margins": 1.2866462469100952, + "rewards/rejected": -2.285883665084839, + "step": 4850 + }, + { + "epoch": 5.28, + "grad_norm": 3.6728399438555686, + "learning_rate": 1.835737579442126e-08, + "logits/chosen": 3.308389186859131, + "logits/rejected": 3.2529404163360596, + "logps/chosen": -613.5067138671875, + "logps/rejected": -592.02685546875, + "loss": 0.427, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0827772617340088, + "rewards/margins": 1.0043256282806396, + "rewards/rejected": -2.0871028900146484, + "step": 4860 + }, + { + "epoch": 5.29, + "grad_norm": 3.7106435347709823, + "learning_rate": 1.7816836755138535e-08, + "logits/chosen": 3.2648491859436035, + "logits/rejected": 3.3202967643737793, + "logps/chosen": -621.5859375, + "logps/rejected": -588.76953125, + "loss": 0.4198, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0197430849075317, + "rewards/margins": 1.1170375347137451, + "rewards/rejected": -2.1367805004119873, + "step": 4870 + }, + { + "epoch": 5.3, + "grad_norm": 4.8771679617057835, + "learning_rate": 1.72840811217157e-08, + "logits/chosen": 3.3346686363220215, + "logits/rejected": 3.3177542686462402, + "logps/chosen": -655.8094482421875, + "logps/rejected": -650.5350341796875, + "loss": 0.4209, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0607974529266357, + "rewards/margins": 1.1251922845840454, + "rewards/rejected": -2.1859896183013916, + "step": 4880 + }, + { + "epoch": 5.31, + "grad_norm": 4.407800753858192, + "learning_rate": 1.6759126753558506e-08, + "logits/chosen": 3.4082882404327393, + "logits/rejected": 3.2606024742126465, + "logps/chosen": -608.5733642578125, + "logps/rejected": -660.5091552734375, + "loss": 0.3973, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8485631942749023, + "rewards/margins": 1.4242725372314453, + "rewards/rejected": -2.2728357315063477, + "step": 4890 + }, + { + "epoch": 5.32, + "grad_norm": 5.549196075308517, + "learning_rate": 1.6241991248553217e-08, + "logits/chosen": 3.202904462814331, + "logits/rejected": 3.173356533050537, + "logps/chosen": -636.3018798828125, + "logps/rejected": -643.2816162109375, + "loss": 0.3941, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.9654437899589539, + "rewards/margins": 1.382318377494812, + "rewards/rejected": -2.3477623462677, + "step": 4900 + }, + { + "epoch": 5.32, + "eval_logits/chosen": 3.4012038707733154, + "eval_logits/rejected": 3.4281721115112305, + "eval_logps/chosen": -639.1229858398438, + "eval_logps/rejected": -623.8593139648438, + "eval_loss": 0.47542309761047363, + "eval_rewards/accuracies": 0.7896825671195984, + "eval_rewards/chosen": -0.957861602306366, + "eval_rewards/margins": 1.0399866104125977, + "eval_rewards/rejected": -1.9978482723236084, + "eval_runtime": 202.6421, + "eval_samples_per_second": 9.87, + "eval_steps_per_second": 0.311, + "step": 4900 + }, + { + "epoch": 5.33, + "grad_norm": 3.8445878799427184, + "learning_rate": 1.5732691942476673e-08, + "logits/chosen": 3.3608036041259766, + "logits/rejected": 3.2699153423309326, + "logps/chosen": -624.7760009765625, + "logps/rejected": -650.7325439453125, + "loss": 0.3724, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.889731764793396, + "rewards/margins": 1.1686939001083374, + "rewards/rejected": -2.0584254264831543, + "step": 4910 + }, + { + "epoch": 5.34, + "grad_norm": 3.9165052944586756, + "learning_rate": 1.5231245908415348e-08, + "logits/chosen": 3.2369396686553955, + "logits/rejected": 3.2419559955596924, + "logps/chosen": -619.8662719726562, + "logps/rejected": -603.00439453125, + "loss": 0.4447, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1576344966888428, + "rewards/margins": 1.0836381912231445, + "rewards/rejected": -2.2412729263305664, + "step": 4920 + }, + { + "epoch": 5.35, + "grad_norm": 5.056300731125499, + "learning_rate": 1.4737669956192745e-08, + "logits/chosen": 3.3968377113342285, + "logits/rejected": 3.513598918914795, + "logps/chosen": -640.3262329101562, + "logps/rejected": -651.488037109375, + "loss": 0.42, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8613258600234985, + "rewards/margins": 1.3728233575820923, + "rewards/rejected": -2.234149217605591, + "step": 4930 + }, + { + "epoch": 5.36, + "grad_norm": 5.087084081341506, + "learning_rate": 1.425198063180602e-08, + "logits/chosen": 3.325380802154541, + "logits/rejected": 3.2402496337890625, + "logps/chosen": -595.4832763671875, + "logps/rejected": -580.9246215820312, + "loss": 0.4394, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.8831828832626343, + "rewards/margins": 1.0617479085922241, + "rewards/rejected": -1.9449306726455688, + "step": 4940 + }, + { + "epoch": 5.37, + "grad_norm": 5.132526160430227, + "learning_rate": 1.377419421687126e-08, + "logits/chosen": 3.284151554107666, + "logits/rejected": 3.3109688758850098, + "logps/chosen": -661.6206665039062, + "logps/rejected": -648.8699951171875, + "loss": 0.4299, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0141069889068604, + "rewards/margins": 1.0557873249053955, + "rewards/rejected": -2.069894313812256, + "step": 4950 + }, + { + "epoch": 5.39, + "grad_norm": 5.215006545274992, + "learning_rate": 1.3304326728077797e-08, + "logits/chosen": 3.396632671356201, + "logits/rejected": 3.3906586170196533, + "logps/chosen": -671.5709228515625, + "logps/rejected": -654.5450439453125, + "loss": 0.4483, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.8753288388252258, + "rewards/margins": 1.0657485723495483, + "rewards/rejected": -1.941077470779419, + "step": 4960 + }, + { + "epoch": 5.4, + "grad_norm": 3.6777621619640075, + "learning_rate": 1.284239391665115e-08, + "logits/chosen": 3.4247264862060547, + "logits/rejected": 3.4094860553741455, + "logps/chosen": -633.9702758789062, + "logps/rejected": -615.48974609375, + "loss": 0.3872, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.8330886960029602, + "rewards/margins": 1.09920334815979, + "rewards/rejected": -1.9322922229766846, + "step": 4970 + }, + { + "epoch": 5.41, + "grad_norm": 4.692737717148633, + "learning_rate": 1.2388411267825e-08, + "logits/chosen": 3.251021146774292, + "logits/rejected": 3.275005340576172, + "logps/chosen": -623.6749267578125, + "logps/rejected": -604.2633056640625, + "loss": 0.4187, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.025691032409668, + "rewards/margins": 1.01606285572052, + "rewards/rejected": -2.0417537689208984, + "step": 4980 + }, + { + "epoch": 5.42, + "grad_norm": 4.3474023538090885, + "learning_rate": 1.1942394000322209e-08, + "logits/chosen": 3.247284412384033, + "logits/rejected": 3.3002495765686035, + "logps/chosen": -631.767333984375, + "logps/rejected": -658.4265747070312, + "loss": 0.4207, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9438480138778687, + "rewards/margins": 1.0788995027542114, + "rewards/rejected": -2.022747755050659, + "step": 4990 + }, + { + "epoch": 5.43, + "grad_norm": 4.410015757828728, + "learning_rate": 1.1504357065844572e-08, + "logits/chosen": 3.3900043964385986, + "logits/rejected": 3.348998546600342, + "logps/chosen": -643.7843017578125, + "logps/rejected": -624.371826171875, + "loss": 0.4093, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9818245768547058, + "rewards/margins": 1.2492727041244507, + "rewards/rejected": -2.231097459793091, + "step": 5000 + }, + { + "epoch": 5.43, + "eval_logits/chosen": 3.4052021503448486, + "eval_logits/rejected": 3.43179988861084, + "eval_logps/chosen": -634.6866455078125, + "eval_logps/rejected": -618.552978515625, + "eval_loss": 0.4747713804244995, + "eval_rewards/accuracies": 0.7936508059501648, + "eval_rewards/chosen": -0.913497269153595, + "eval_rewards/margins": 1.0312875509262085, + "eval_rewards/rejected": -1.9447849988937378, + "eval_runtime": 203.57, + "eval_samples_per_second": 9.825, + "eval_steps_per_second": 0.309, + "step": 5000 + } + ], + "logging_steps": 10, + "max_steps": 5526, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}