{ "best_metric": 0.4747713804244995, "best_model_checkpoint": "data/phi_1_5_dpo_ep6/checkpoint-5000", "epoch": 6.0, "eval_steps": 100, "global_step": 5526, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.2784331867920224, "learning_rate": 5e-09, "logits/chosen": 4.471683502197266, "logits/rejected": 5.047541618347168, "logps/chosen": -583.1558837890625, "logps/rejected": -443.7651062011719, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 2.1074329313581432, "learning_rate": 5e-08, "logits/chosen": 4.588065147399902, "logits/rejected": 4.978394985198975, "logps/chosen": -552.5501098632812, "logps/rejected": -387.0480651855469, "loss": 0.6928, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": 0.0003048771759495139, "rewards/margins": 0.0007037359173409641, "rewards/rejected": -0.00039885862497612834, "step": 10 }, { "epoch": 0.02, "grad_norm": 3.585629152687865, "learning_rate": 1e-07, "logits/chosen": 4.497701168060303, "logits/rejected": 4.875298976898193, "logps/chosen": -568.1593627929688, "logps/rejected": -405.4841003417969, "loss": 0.6933, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.0002680518664419651, "rewards/margins": -1.2179441000625957e-05, "rewards/rejected": -0.0002558725536800921, "step": 20 }, { "epoch": 0.03, "grad_norm": 2.1397508289715015, "learning_rate": 1.5e-07, "logits/chosen": 4.588676452636719, "logits/rejected": 4.831759452819824, "logps/chosen": -480.6222229003906, "logps/rejected": -376.4667663574219, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -8.940284897107631e-05, "rewards/margins": -5.898187373531982e-05, "rewards/rejected": -3.042102252948098e-05, "step": 30 }, { "epoch": 0.04, "grad_norm": 2.196396235652288, "learning_rate": 2e-07, "logits/chosen": 4.697300910949707, "logits/rejected": 5.044940948486328, "logps/chosen": -558.78759765625, "logps/rejected": -447.4754333496094, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0011049907188862562, "rewards/margins": 0.0004904826055280864, "rewards/rejected": 0.0006145082297734916, "step": 40 }, { "epoch": 0.05, "grad_norm": 1.9641522024523996, "learning_rate": 2.5e-07, "logits/chosen": 4.586944103240967, "logits/rejected": 4.845526695251465, "logps/chosen": -569.728515625, "logps/rejected": -436.475341796875, "loss": 0.6927, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0018296729540452361, "rewards/margins": 0.0005367769626900554, "rewards/rejected": 0.0012928961077705026, "step": 50 }, { "epoch": 0.07, "grad_norm": 2.0311045116257476, "learning_rate": 3e-07, "logits/chosen": 4.496231555938721, "logits/rejected": 4.9472222328186035, "logps/chosen": -518.8323364257812, "logps/rejected": -363.575927734375, "loss": 0.6921, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.005082172341644764, "rewards/margins": 0.003068957244977355, "rewards/rejected": 0.0020132153294980526, "step": 60 }, { "epoch": 0.08, "grad_norm": 2.122608710951403, "learning_rate": 3.5e-07, "logits/chosen": 4.606133460998535, "logits/rejected": 4.809296607971191, "logps/chosen": -485.9520568847656, "logps/rejected": -387.82830810546875, "loss": 0.6915, "rewards/accuracies": 0.59375, "rewards/chosen": 0.007089491002261639, "rewards/margins": 0.000784275762271136, "rewards/rejected": 0.006305214948952198, "step": 70 }, { "epoch": 0.09, "grad_norm": 2.9116433618910453, "learning_rate": 4e-07, "logits/chosen": 4.483397483825684, "logits/rejected": 5.010839462280273, "logps/chosen": -596.1915283203125, "logps/rejected": -399.2021789550781, "loss": 0.6908, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01583760604262352, "rewards/margins": 0.006217488087713718, "rewards/rejected": 0.00962011981755495, "step": 80 }, { "epoch": 0.1, "grad_norm": 2.1318681018681374, "learning_rate": 4.5e-07, "logits/chosen": 4.709166526794434, "logits/rejected": 4.690488338470459, "logps/chosen": -557.8314208984375, "logps/rejected": -429.8755798339844, "loss": 0.6885, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.025951331481337547, "rewards/margins": 0.00821693055331707, "rewards/rejected": 0.017734399065375328, "step": 90 }, { "epoch": 0.11, "grad_norm": 2.0017670975468773, "learning_rate": 5e-07, "logits/chosen": 4.650539875030518, "logits/rejected": 4.840279579162598, "logps/chosen": -522.7197875976562, "logps/rejected": -441.0248107910156, "loss": 0.6881, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.03702830895781517, "rewards/margins": 0.008776113390922546, "rewards/rejected": 0.028252195566892624, "step": 100 }, { "epoch": 0.11, "eval_logits/chosen": 4.6645894050598145, "eval_logits/rejected": 4.888310432434082, "eval_logps/chosen": -538.6564331054688, "eval_logps/rejected": -421.0949401855469, "eval_loss": 0.6856396198272705, "eval_rewards/accuracies": 0.7023809552192688, "eval_rewards/chosen": 0.046804603189229965, "eval_rewards/margins": 0.017009131610393524, "eval_rewards/rejected": 0.02979547157883644, "eval_runtime": 203.1077, "eval_samples_per_second": 9.847, "eval_steps_per_second": 0.31, "step": 100 }, { "epoch": 0.12, "grad_norm": 6.583589348054713, "learning_rate": 4.999958096628589e-07, "logits/chosen": 4.720016002655029, "logits/rejected": 4.64304256439209, "logps/chosen": -500.9232482910156, "logps/rejected": -427.63153076171875, "loss": 0.6854, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.044227249920368195, "rewards/margins": 0.005637052934616804, "rewards/rejected": 0.0385901965200901, "step": 110 }, { "epoch": 0.13, "grad_norm": 3.1982958145542955, "learning_rate": 4.999832387919069e-07, "logits/chosen": 4.5180182456970215, "logits/rejected": 4.88397216796875, "logps/chosen": -552.8077392578125, "logps/rejected": -401.8198547363281, "loss": 0.6833, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0751495510339737, "rewards/margins": 0.0239702258259058, "rewards/rejected": 0.05117932707071304, "step": 120 }, { "epoch": 0.14, "grad_norm": 2.4449854530637416, "learning_rate": 4.999622878085538e-07, "logits/chosen": 4.7093329429626465, "logits/rejected": 4.781570911407471, "logps/chosen": -516.9468994140625, "logps/rejected": -393.8667907714844, "loss": 0.6792, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.07485412806272507, "rewards/margins": 0.029199976474046707, "rewards/rejected": 0.04565414786338806, "step": 130 }, { "epoch": 0.15, "grad_norm": 2.1934078981016993, "learning_rate": 4.999329574151327e-07, "logits/chosen": 4.582241058349609, "logits/rejected": 4.7597975730896, "logps/chosen": -501.5450134277344, "logps/rejected": -409.2080078125, "loss": 0.6777, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.08410192281007767, "rewards/margins": 0.025774246081709862, "rewards/rejected": 0.058327674865722656, "step": 140 }, { "epoch": 0.16, "grad_norm": 2.96575096247207, "learning_rate": 4.998952485948778e-07, "logits/chosen": 4.52903938293457, "logits/rejected": 4.743136882781982, "logps/chosen": -528.6700439453125, "logps/rejected": -401.9606628417969, "loss": 0.6772, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.09848333895206451, "rewards/margins": 0.0334458127617836, "rewards/rejected": 0.06503753364086151, "step": 150 }, { "epoch": 0.17, "grad_norm": 2.8737260484485887, "learning_rate": 4.998491626118902e-07, "logits/chosen": 4.462107181549072, "logits/rejected": 4.984793663024902, "logps/chosen": -594.0728149414062, "logps/rejected": -446.53826904296875, "loss": 0.6756, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1353224217891693, "rewards/margins": 0.06305033713579178, "rewards/rejected": 0.07227210700511932, "step": 160 }, { "epoch": 0.18, "grad_norm": 1.8647392675629684, "learning_rate": 4.997947010110966e-07, "logits/chosen": 4.541747570037842, "logits/rejected": 4.776431083679199, "logps/chosen": -535.6854858398438, "logps/rejected": -413.56048583984375, "loss": 0.6738, "rewards/accuracies": 0.625, "rewards/chosen": 0.12944112718105316, "rewards/margins": 0.03229222446680069, "rewards/rejected": 0.09714889526367188, "step": 170 }, { "epoch": 0.2, "grad_norm": 2.177803918062556, "learning_rate": 4.997318656181965e-07, "logits/chosen": 4.700952053070068, "logits/rejected": 4.818137168884277, "logps/chosen": -530.86474609375, "logps/rejected": -421.16632080078125, "loss": 0.67, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.13788820803165436, "rewards/margins": 0.0508052296936512, "rewards/rejected": 0.08708297461271286, "step": 180 }, { "epoch": 0.21, "grad_norm": 2.0886821853779236, "learning_rate": 4.99660658539602e-07, "logits/chosen": 4.684708595275879, "logits/rejected": 4.796721458435059, "logps/chosen": -509.56982421875, "logps/rejected": -412.2833557128906, "loss": 0.6702, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1366799771785736, "rewards/margins": 0.022193659096956253, "rewards/rejected": 0.11448632180690765, "step": 190 }, { "epoch": 0.22, "grad_norm": 2.220515964351123, "learning_rate": 4.995810821623662e-07, "logits/chosen": 4.499368190765381, "logits/rejected": 4.888810157775879, "logps/chosen": -496.9297790527344, "logps/rejected": -394.7762756347656, "loss": 0.6692, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.16144059598445892, "rewards/margins": 0.06260715425014496, "rewards/rejected": 0.09883344173431396, "step": 200 }, { "epoch": 0.22, "eval_logits/chosen": 4.636972904205322, "eval_logits/rejected": 4.871835231781006, "eval_logps/chosen": -525.9188842773438, "eval_logps/rejected": -414.19549560546875, "eval_loss": 0.6641963124275208, "eval_rewards/accuracies": 0.7123016119003296, "eval_rewards/chosen": 0.17417941987514496, "eval_rewards/margins": 0.07538975775241852, "eval_rewards/rejected": 0.09878966212272644, "eval_runtime": 203.2626, "eval_samples_per_second": 9.839, "eval_steps_per_second": 0.31, "step": 200 }, { "epoch": 0.23, "grad_norm": 2.9376887120140927, "learning_rate": 4.99493139154104e-07, "logits/chosen": 4.480714321136475, "logits/rejected": 5.064620018005371, "logps/chosen": -576.3648681640625, "logps/rejected": -387.22650146484375, "loss": 0.6545, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.19050724804401398, "rewards/margins": 0.11292573064565659, "rewards/rejected": 0.07758153975009918, "step": 210 }, { "epoch": 0.24, "grad_norm": 1.8767204134346958, "learning_rate": 4.993968324629023e-07, "logits/chosen": 4.697628498077393, "logits/rejected": 4.726076602935791, "logps/chosen": -505.72137451171875, "logps/rejected": -431.8421325683594, "loss": 0.658, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.1784447729587555, "rewards/margins": 0.0745016485452652, "rewards/rejected": 0.1039431244134903, "step": 220 }, { "epoch": 0.25, "grad_norm": 2.082590024513838, "learning_rate": 4.99292165317221e-07, "logits/chosen": 4.607518672943115, "logits/rejected": 4.914595127105713, "logps/chosen": -509.68499755859375, "logps/rejected": -400.05584716796875, "loss": 0.6609, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2046581506729126, "rewards/margins": 0.06535429507493973, "rewards/rejected": 0.13930386304855347, "step": 230 }, { "epoch": 0.26, "grad_norm": 3.237183271768573, "learning_rate": 4.991791412257852e-07, "logits/chosen": 4.595475196838379, "logits/rejected": 4.688504695892334, "logps/chosen": -520.5208740234375, "logps/rejected": -381.739990234375, "loss": 0.6551, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.2024141252040863, "rewards/margins": 0.10769456624984741, "rewards/rejected": 0.09471957385540009, "step": 240 }, { "epoch": 0.27, "grad_norm": 1.918810148742378, "learning_rate": 4.990577639774672e-07, "logits/chosen": 4.68790340423584, "logits/rejected": 4.860108375549316, "logps/chosen": -487.408203125, "logps/rejected": -385.59564208984375, "loss": 0.6541, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19003790616989136, "rewards/margins": 0.09179778397083282, "rewards/rejected": 0.09824012964963913, "step": 250 }, { "epoch": 0.28, "grad_norm": 2.2494428114155856, "learning_rate": 4.9892803764116e-07, "logits/chosen": 4.434889793395996, "logits/rejected": 4.799215793609619, "logps/chosen": -511.882568359375, "logps/rejected": -400.19927978515625, "loss": 0.6486, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19563761353492737, "rewards/margins": 0.10582878440618515, "rewards/rejected": 0.08980882912874222, "step": 260 }, { "epoch": 0.29, "grad_norm": 3.666721869904351, "learning_rate": 4.987899665656399e-07, "logits/chosen": 4.74543571472168, "logits/rejected": 4.740050315856934, "logps/chosen": -503.22930908203125, "logps/rejected": -457.3968811035156, "loss": 0.6495, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.20304739475250244, "rewards/margins": 0.05644340440630913, "rewards/rejected": 0.14660397171974182, "step": 270 }, { "epoch": 0.3, "grad_norm": 5.948963323108333, "learning_rate": 4.986435553794221e-07, "logits/chosen": 4.648507118225098, "logits/rejected": 4.644364356994629, "logps/chosen": -506.22503662109375, "logps/rejected": -439.25494384765625, "loss": 0.6575, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.20311832427978516, "rewards/margins": 0.06590523570775986, "rewards/rejected": 0.1372130960226059, "step": 280 }, { "epoch": 0.31, "grad_norm": 1.6922344782897734, "learning_rate": 4.984888089906041e-07, "logits/chosen": 4.390051364898682, "logits/rejected": 4.838263511657715, "logps/chosen": -551.6712646484375, "logps/rejected": -444.44598388671875, "loss": 0.651, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.2372256964445114, "rewards/margins": 0.09892772883176804, "rewards/rejected": 0.13829797506332397, "step": 290 }, { "epoch": 0.33, "grad_norm": 1.9239681386959693, "learning_rate": 4.983257325867025e-07, "logits/chosen": 4.478053569793701, "logits/rejected": 5.162686347961426, "logps/chosen": -508.06414794921875, "logps/rejected": -339.91845703125, "loss": 0.6368, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.25704532861709595, "rewards/margins": 0.1772209107875824, "rewards/rejected": 0.07982443273067474, "step": 300 }, { "epoch": 0.33, "eval_logits/chosen": 4.5967912673950195, "eval_logits/rejected": 4.840723037719727, "eval_logps/chosen": -517.7680053710938, "eval_logps/rejected": -411.4657287597656, "eval_loss": 0.6442444920539856, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 0.25568887591362, "eval_rewards/margins": 0.12960152328014374, "eval_rewards/rejected": 0.12608733773231506, "eval_runtime": 203.0386, "eval_samples_per_second": 9.85, "eval_steps_per_second": 0.31, "step": 300 }, { "epoch": 0.34, "grad_norm": 2.1499440637283977, "learning_rate": 4.981543316344781e-07, "logits/chosen": 4.507613182067871, "logits/rejected": 4.725367546081543, "logps/chosen": -546.1619873046875, "logps/rejected": -389.0150146484375, "loss": 0.6368, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.3014827370643616, "rewards/margins": 0.1564800888299942, "rewards/rejected": 0.14500267803668976, "step": 310 }, { "epoch": 0.35, "grad_norm": 3.228748334481589, "learning_rate": 4.979746118797531e-07, "logits/chosen": 4.5380539894104, "logits/rejected": 4.800951957702637, "logps/chosen": -521.36083984375, "logps/rejected": -410.35650634765625, "loss": 0.6463, "rewards/accuracies": 0.71875, "rewards/chosen": 0.24756142497062683, "rewards/margins": 0.14254947006702423, "rewards/rejected": 0.1050119400024414, "step": 320 }, { "epoch": 0.36, "grad_norm": 1.912283760633026, "learning_rate": 4.977865793472184e-07, "logits/chosen": 4.489951133728027, "logits/rejected": 5.115104675292969, "logps/chosen": -489.64996337890625, "logps/rejected": -359.89239501953125, "loss": 0.6359, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.2450093924999237, "rewards/margins": 0.13062641024589539, "rewards/rejected": 0.11438298225402832, "step": 330 }, { "epoch": 0.37, "grad_norm": 2.2785842112366574, "learning_rate": 4.975902403402318e-07, "logits/chosen": 4.511780261993408, "logits/rejected": 4.862536430358887, "logps/chosen": -557.9495849609375, "logps/rejected": -401.67901611328125, "loss": 0.6448, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2301413118839264, "rewards/margins": 0.1367010772228241, "rewards/rejected": 0.0934402346611023, "step": 340 }, { "epoch": 0.38, "grad_norm": 1.8134683023299867, "learning_rate": 4.973856014406061e-07, "logits/chosen": 4.520918846130371, "logits/rejected": 4.871068477630615, "logps/chosen": -503.2167053222656, "logps/rejected": -392.04998779296875, "loss": 0.6313, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.2531774044036865, "rewards/margins": 0.18557706475257874, "rewards/rejected": 0.06760034710168839, "step": 350 }, { "epoch": 0.39, "grad_norm": 2.0638343555583893, "learning_rate": 4.971726695083893e-07, "logits/chosen": 4.620011329650879, "logits/rejected": 4.672842025756836, "logps/chosen": -500.76776123046875, "logps/rejected": -445.62493896484375, "loss": 0.631, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.27054670453071594, "rewards/margins": 0.1622386872768402, "rewards/rejected": 0.10830801725387573, "step": 360 }, { "epoch": 0.4, "grad_norm": 2.361650417228017, "learning_rate": 4.96951451681634e-07, "logits/chosen": 4.580225467681885, "logits/rejected": 4.911395072937012, "logps/chosen": -531.093994140625, "logps/rejected": -425.7933654785156, "loss": 0.6303, "rewards/accuracies": 0.65625, "rewards/chosen": 0.27540484070777893, "rewards/margins": 0.12915542721748352, "rewards/rejected": 0.14624938368797302, "step": 370 }, { "epoch": 0.41, "grad_norm": 2.7174483204511937, "learning_rate": 4.967219553761586e-07, "logits/chosen": 4.723080635070801, "logits/rejected": 4.8610453605651855, "logps/chosen": -517.4929809570312, "logps/rejected": -459.22979736328125, "loss": 0.6392, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.2602981626987457, "rewards/margins": 0.16877390444278717, "rewards/rejected": 0.09152427315711975, "step": 380 }, { "epoch": 0.42, "grad_norm": 3.9776964083274917, "learning_rate": 4.96484188285298e-07, "logits/chosen": 4.445316791534424, "logits/rejected": 4.665471076965332, "logps/chosen": -577.9735107421875, "logps/rejected": -460.4791564941406, "loss": 0.6329, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3058663308620453, "rewards/margins": 0.1735948771238327, "rewards/rejected": 0.1322714388370514, "step": 390 }, { "epoch": 0.43, "grad_norm": 1.76118979332965, "learning_rate": 4.962381583796465e-07, "logits/chosen": 4.506722450256348, "logits/rejected": 4.659956932067871, "logps/chosen": -500.7157287597656, "logps/rejected": -396.84393310546875, "loss": 0.6283, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.27281665802001953, "rewards/margins": 0.16729183495044708, "rewards/rejected": 0.10552482306957245, "step": 400 }, { "epoch": 0.43, "eval_logits/chosen": 4.515594959259033, "eval_logits/rejected": 4.762938499450684, "eval_logps/chosen": -517.2609252929688, "eval_logps/rejected": -415.9521789550781, "eval_loss": 0.6283168792724609, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 0.2607596516609192, "eval_rewards/margins": 0.1795366406440735, "eval_rewards/rejected": 0.08122298866510391, "eval_runtime": 203.11, "eval_samples_per_second": 9.847, "eval_steps_per_second": 0.31, "step": 400 }, { "epoch": 0.45, "grad_norm": 2.272815572830576, "learning_rate": 4.9598387390679e-07, "logits/chosen": 4.5500898361206055, "logits/rejected": 4.538148880004883, "logps/chosen": -542.4667358398438, "logps/rejected": -461.719970703125, "loss": 0.6271, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.28052183985710144, "rewards/margins": 0.19728553295135498, "rewards/rejected": 0.08323628455400467, "step": 410 }, { "epoch": 0.46, "grad_norm": 2.1158441354384188, "learning_rate": 4.9572134339103e-07, "logits/chosen": 4.513038158416748, "logits/rejected": 4.744021415710449, "logps/chosen": -557.8423461914062, "logps/rejected": -416.1087951660156, "loss": 0.6292, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2836315631866455, "rewards/margins": 0.16970789432525635, "rewards/rejected": 0.11392368376255035, "step": 420 }, { "epoch": 0.47, "grad_norm": 2.4154271169731176, "learning_rate": 4.954505756330975e-07, "logits/chosen": 4.326611518859863, "logits/rejected": 4.637131214141846, "logps/chosen": -497.8357849121094, "logps/rejected": -378.7847595214844, "loss": 0.6125, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.2221941202878952, "rewards/margins": 0.1941184103488922, "rewards/rejected": 0.028075695037841797, "step": 430 }, { "epoch": 0.48, "grad_norm": 3.6625125192218624, "learning_rate": 4.951715797098579e-07, "logits/chosen": 4.394779682159424, "logits/rejected": 4.818585395812988, "logps/chosen": -543.4212036132812, "logps/rejected": -413.21881103515625, "loss": 0.6213, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.20388510823249817, "rewards/margins": 0.2376679927110672, "rewards/rejected": -0.03378290683031082, "step": 440 }, { "epoch": 0.49, "grad_norm": 3.0417051890171325, "learning_rate": 4.94884364974007e-07, "logits/chosen": 4.4246063232421875, "logits/rejected": 4.6781697273254395, "logps/chosen": -488.7919921875, "logps/rejected": -413.4225158691406, "loss": 0.6171, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.18326595425605774, "rewards/margins": 0.17521528899669647, "rewards/rejected": 0.008050672709941864, "step": 450 }, { "epoch": 0.5, "grad_norm": 2.2275571876330544, "learning_rate": 4.945889410537577e-07, "logits/chosen": 4.345341205596924, "logits/rejected": 4.646592140197754, "logps/chosen": -530.9051513671875, "logps/rejected": -403.25579833984375, "loss": 0.6132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.24030213057994843, "rewards/margins": 0.22080914676189423, "rewards/rejected": 0.01949295774102211, "step": 460 }, { "epoch": 0.51, "grad_norm": 2.06236619392903, "learning_rate": 4.942853178525163e-07, "logits/chosen": 4.217994689941406, "logits/rejected": 4.637279510498047, "logps/chosen": -551.31640625, "logps/rejected": -434.1787109375, "loss": 0.6159, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.19648627936840057, "rewards/margins": 0.18652307987213135, "rewards/rejected": 0.00996321253478527, "step": 470 }, { "epoch": 0.52, "grad_norm": 2.135303619213813, "learning_rate": 4.939735055485515e-07, "logits/chosen": 4.297804832458496, "logits/rejected": 4.6675238609313965, "logps/chosen": -536.94287109375, "logps/rejected": -443.29693603515625, "loss": 0.602, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.1485282927751541, "rewards/margins": 0.2334461659193039, "rewards/rejected": -0.08491786569356918, "step": 480 }, { "epoch": 0.53, "grad_norm": 1.9853533137189272, "learning_rate": 4.936535145946528e-07, "logits/chosen": 4.211465358734131, "logits/rejected": 4.481369972229004, "logps/chosen": -560.4998779296875, "logps/rejected": -426.50897216796875, "loss": 0.608, "rewards/accuracies": 0.71875, "rewards/chosen": 0.21016542613506317, "rewards/margins": 0.27178627252578735, "rewards/rejected": -0.061620790511369705, "step": 490 }, { "epoch": 0.54, "grad_norm": 3.856906618729529, "learning_rate": 4.933253557177799e-07, "logits/chosen": 4.18944787979126, "logits/rejected": 4.669934272766113, "logps/chosen": -520.4953002929688, "logps/rejected": -416.11590576171875, "loss": 0.6052, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15112890303134918, "rewards/margins": 0.20565268397331238, "rewards/rejected": -0.05452378839254379, "step": 500 }, { "epoch": 0.54, "eval_logits/chosen": 4.3152852058410645, "eval_logits/rejected": 4.5515546798706055, "eval_logps/chosen": -529.0491333007812, "eval_logps/rejected": -434.0544738769531, "eval_loss": 0.6131876111030579, "eval_rewards/accuracies": 0.7103174328804016, "eval_rewards/chosen": 0.14287839829921722, "eval_rewards/margins": 0.24267850816249847, "eval_rewards/rejected": -0.09980012476444244, "eval_runtime": 202.6353, "eval_samples_per_second": 9.87, "eval_steps_per_second": 0.311, "step": 500 }, { "epoch": 0.55, "grad_norm": 1.8808869421536998, "learning_rate": 4.929890399187035e-07, "logits/chosen": 4.082136631011963, "logits/rejected": 4.399838447570801, "logps/chosen": -544.2486572265625, "logps/rejected": -440.2666931152344, "loss": 0.5926, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.10085509717464447, "rewards/margins": 0.26802974939346313, "rewards/rejected": -0.16717462241649628, "step": 510 }, { "epoch": 0.56, "grad_norm": 2.251806935640601, "learning_rate": 4.926445784716363e-07, "logits/chosen": 4.27434778213501, "logits/rejected": 4.4805097579956055, "logps/chosen": -493.55059814453125, "logps/rejected": -420.85205078125, "loss": 0.6046, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.05821802467107773, "rewards/margins": 0.24736134707927704, "rewards/rejected": -0.1891433149576187, "step": 520 }, { "epoch": 0.58, "grad_norm": 1.8178650690469547, "learning_rate": 4.922919829238551e-07, "logits/chosen": 4.100694179534912, "logits/rejected": 4.416205406188965, "logps/chosen": -512.9978637695312, "logps/rejected": -404.34527587890625, "loss": 0.588, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.1162143126130104, "rewards/margins": 0.31595414876937866, "rewards/rejected": -0.19973981380462646, "step": 530 }, { "epoch": 0.59, "grad_norm": 5.360920096472579, "learning_rate": 4.919312650953137e-07, "logits/chosen": 4.242461204528809, "logits/rejected": 4.388332843780518, "logps/chosen": -487.63592529296875, "logps/rejected": -440.6812438964844, "loss": 0.5947, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.026374664157629013, "rewards/margins": 0.25695887207984924, "rewards/rejected": -0.23058418929576874, "step": 540 }, { "epoch": 0.6, "grad_norm": 2.9490637929554993, "learning_rate": 4.915624370782462e-07, "logits/chosen": 4.2471604347229, "logits/rejected": 4.280003547668457, "logps/chosen": -500.488525390625, "logps/rejected": -468.49749755859375, "loss": 0.6095, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.032730571925640106, "rewards/margins": 0.19359466433525085, "rewards/rejected": -0.16086408495903015, "step": 550 }, { "epoch": 0.61, "grad_norm": 4.01165068460727, "learning_rate": 4.911855112367632e-07, "logits/chosen": 4.124515533447266, "logits/rejected": 4.417056083679199, "logps/chosen": -545.4638061523438, "logps/rejected": -426.03253173828125, "loss": 0.6051, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.07037229835987091, "rewards/margins": 0.26376873254776, "rewards/rejected": -0.19339647889137268, "step": 560 }, { "epoch": 0.62, "grad_norm": 3.850942981354808, "learning_rate": 4.908005002064349e-07, "logits/chosen": 4.044139862060547, "logits/rejected": 4.450469970703125, "logps/chosen": -526.7841796875, "logps/rejected": -394.8378601074219, "loss": 0.5936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14681175351142883, "rewards/margins": 0.3266211748123169, "rewards/rejected": -0.17980939149856567, "step": 570 }, { "epoch": 0.63, "grad_norm": 1.9639781340108824, "learning_rate": 4.904074168938699e-07, "logits/chosen": 4.0963239669799805, "logits/rejected": 4.188741207122803, "logps/chosen": -488.54803466796875, "logps/rejected": -426.55731201171875, "loss": 0.5948, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.03685665875673294, "rewards/margins": 0.2861422598361969, "rewards/rejected": -0.24928562343120575, "step": 580 }, { "epoch": 0.64, "grad_norm": 1.9123674383013602, "learning_rate": 4.900062744762808e-07, "logits/chosen": 4.1099090576171875, "logits/rejected": 4.312173366546631, "logps/chosen": -595.0274658203125, "logps/rejected": -468.94915771484375, "loss": 0.5845, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.10104771703481674, "rewards/margins": 0.3661288917064667, "rewards/rejected": -0.2650812268257141, "step": 590 }, { "epoch": 0.65, "grad_norm": 6.153053301101733, "learning_rate": 4.895970864010433e-07, "logits/chosen": 4.1564226150512695, "logits/rejected": 4.437946319580078, "logps/chosen": -579.667724609375, "logps/rejected": -429.3572692871094, "loss": 0.5923, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.0929703563451767, "rewards/margins": 0.3057120740413666, "rewards/rejected": -0.2127417027950287, "step": 600 }, { "epoch": 0.65, "eval_logits/chosen": 4.228888511657715, "eval_logits/rejected": 4.458848476409912, "eval_logps/chosen": -529.0886840820312, "eval_logps/rejected": -440.3538513183594, "eval_loss": 0.6007751226425171, "eval_rewards/accuracies": 0.7123016119003296, "eval_rewards/chosen": 0.14248257875442505, "eval_rewards/margins": 0.3052762746810913, "eval_rewards/rejected": -0.16279369592666626, "eval_runtime": 203.2506, "eval_samples_per_second": 9.84, "eval_steps_per_second": 0.31, "step": 600 }, { "epoch": 0.66, "grad_norm": 2.546602524377636, "learning_rate": 4.891798663852454e-07, "logits/chosen": 4.237885475158691, "logits/rejected": 4.455267906188965, "logps/chosen": -548.642822265625, "logps/rejected": -467.329833984375, "loss": 0.5907, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.14880751073360443, "rewards/margins": 0.31532496213912964, "rewards/rejected": -0.1665174812078476, "step": 610 }, { "epoch": 0.67, "grad_norm": 2.0210468870193523, "learning_rate": 4.887546284152276e-07, "logits/chosen": 4.149720191955566, "logits/rejected": 4.5017805099487305, "logps/chosen": -544.840576171875, "logps/rejected": -441.256103515625, "loss": 0.5975, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13573914766311646, "rewards/margins": 0.2920631468296051, "rewards/rejected": -0.15632399916648865, "step": 620 }, { "epoch": 0.68, "grad_norm": 2.158709999703385, "learning_rate": 4.883213867461131e-07, "logits/chosen": 4.091971397399902, "logits/rejected": 4.204458236694336, "logps/chosen": -531.9281005859375, "logps/rejected": -437.74847412109375, "loss": 0.5968, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.008038423955440521, "rewards/margins": 0.20734302699565887, "rewards/rejected": -0.19930461049079895, "step": 630 }, { "epoch": 0.69, "grad_norm": 2.1425960327586475, "learning_rate": 4.878801559013315e-07, "logits/chosen": 4.021170616149902, "logits/rejected": 4.375435829162598, "logps/chosen": -581.045166015625, "logps/rejected": -459.45709228515625, "loss": 0.5813, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.11591031402349472, "rewards/margins": 0.33884397149086, "rewards/rejected": -0.22293367981910706, "step": 640 }, { "epoch": 0.71, "grad_norm": 2.5470845859360636, "learning_rate": 4.874309506721307e-07, "logits/chosen": 4.09341287612915, "logits/rejected": 4.369636535644531, "logps/chosen": -503.78125, "logps/rejected": -418.76080322265625, "loss": 0.5808, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.08035546541213989, "rewards/margins": 0.3203769624233246, "rewards/rejected": -0.2400215119123459, "step": 650 }, { "epoch": 0.72, "grad_norm": 2.833418539609218, "learning_rate": 4.869737861170815e-07, "logits/chosen": 4.144112586975098, "logits/rejected": 4.382904529571533, "logps/chosen": -480.66094970703125, "logps/rejected": -407.4732971191406, "loss": 0.5824, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.08363370597362518, "rewards/margins": 0.3676373362541199, "rewards/rejected": -0.28400367498397827, "step": 660 }, { "epoch": 0.73, "grad_norm": 2.231705054067377, "learning_rate": 4.865086775615727e-07, "logits/chosen": 3.999843120574951, "logits/rejected": 4.188136100769043, "logps/chosen": -563.2714233398438, "logps/rejected": -490.22186279296875, "loss": 0.6089, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.08467956632375717, "rewards/margins": 0.2611342966556549, "rewards/rejected": -0.3458138406276703, "step": 670 }, { "epoch": 0.74, "grad_norm": 2.15719451802752, "learning_rate": 4.860356405972979e-07, "logits/chosen": 3.9899182319641113, "logits/rejected": 4.266640663146973, "logps/chosen": -571.6624145507812, "logps/rejected": -439.17822265625, "loss": 0.5675, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.06711284816265106, "rewards/margins": 0.3917382061481476, "rewards/rejected": -0.32462531328201294, "step": 680 }, { "epoch": 0.75, "grad_norm": 3.8527902072521663, "learning_rate": 4.855546910817316e-07, "logits/chosen": 4.05373477935791, "logits/rejected": 4.261914253234863, "logps/chosen": -539.2098388671875, "logps/rejected": -448.1529235839844, "loss": 0.5773, "rewards/accuracies": 0.71875, "rewards/chosen": 0.051774777472019196, "rewards/margins": 0.4131855070590973, "rewards/rejected": -0.3614107072353363, "step": 690 }, { "epoch": 0.76, "grad_norm": 2.712352840545992, "learning_rate": 4.850658451375989e-07, "logits/chosen": 3.863182544708252, "logits/rejected": 4.379975318908691, "logps/chosen": -548.5557861328125, "logps/rejected": -405.55377197265625, "loss": 0.5899, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.021846016868948936, "rewards/margins": 0.35745707154273987, "rewards/rejected": -0.3793030381202698, "step": 700 }, { "epoch": 0.76, "eval_logits/chosen": 4.13485050201416, "eval_logits/rejected": 4.344428539276123, "eval_logps/chosen": -535.7857055664062, "eval_logps/rejected": -453.22711181640625, "eval_loss": 0.5879531502723694, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 0.07551173120737076, "eval_rewards/margins": 0.36703822016716003, "eval_rewards/rejected": -0.2915264964103699, "eval_runtime": 203.5158, "eval_samples_per_second": 9.827, "eval_steps_per_second": 0.31, "step": 700 }, { "epoch": 0.77, "grad_norm": 3.5270973534849777, "learning_rate": 4.845691191523343e-07, "logits/chosen": 4.097344875335693, "logits/rejected": 4.327819347381592, "logps/chosen": -538.5155639648438, "logps/rejected": -463.36199951171875, "loss": 0.587, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.003706134855747223, "rewards/margins": 0.4366297125816345, "rewards/rejected": -0.4329235553741455, "step": 710 }, { "epoch": 0.78, "grad_norm": 2.340358336507431, "learning_rate": 4.840645297775326e-07, "logits/chosen": 4.059669494628906, "logits/rejected": 4.2709221839904785, "logps/chosen": -494.7140197753906, "logps/rejected": -407.42889404296875, "loss": 0.5868, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12765507400035858, "rewards/margins": 0.26888275146484375, "rewards/rejected": -0.3965378403663635, "step": 720 }, { "epoch": 0.79, "grad_norm": 5.452700718311131, "learning_rate": 4.835520939283907e-07, "logits/chosen": 3.9361884593963623, "logits/rejected": 4.386366367340088, "logps/chosen": -586.333740234375, "logps/rejected": -431.07415771484375, "loss": 0.5551, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.08558530360460281, "rewards/margins": 0.3690303564071655, "rewards/rejected": -0.45461565256118774, "step": 730 }, { "epoch": 0.8, "grad_norm": 1.8567399586454065, "learning_rate": 4.830318287831401e-07, "logits/chosen": 4.077117919921875, "logits/rejected": 4.315122604370117, "logps/chosen": -569.4171142578125, "logps/rejected": -495.67095947265625, "loss": 0.5767, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07002132385969162, "rewards/margins": 0.32364723086357117, "rewards/rejected": -0.393668532371521, "step": 740 }, { "epoch": 0.81, "grad_norm": 1.9588211831618236, "learning_rate": 4.82503751782472e-07, "logits/chosen": 4.032423973083496, "logits/rejected": 4.166065216064453, "logps/chosen": -554.0896606445312, "logps/rejected": -491.1756896972656, "loss": 0.6032, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.0907275378704071, "rewards/margins": 0.3419603407382965, "rewards/rejected": -0.4326878488063812, "step": 750 }, { "epoch": 0.83, "grad_norm": 3.107580787070029, "learning_rate": 4.819678806289514e-07, "logits/chosen": 4.053136825561523, "logits/rejected": 4.109324932098389, "logps/chosen": -527.7947998046875, "logps/rejected": -492.25579833984375, "loss": 0.572, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1407662332057953, "rewards/margins": 0.31691235303878784, "rewards/rejected": -0.4576786160469055, "step": 760 }, { "epoch": 0.84, "grad_norm": 2.82290413965199, "learning_rate": 4.814242332864249e-07, "logits/chosen": 4.089312553405762, "logits/rejected": 4.108882904052734, "logps/chosen": -565.4785766601562, "logps/rejected": -530.161376953125, "loss": 0.5773, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10818660259246826, "rewards/margins": 0.31342652440071106, "rewards/rejected": -0.4216131269931793, "step": 770 }, { "epoch": 0.85, "grad_norm": 3.0371041705697035, "learning_rate": 4.808728279794178e-07, "logits/chosen": 4.000454902648926, "logits/rejected": 4.330757141113281, "logps/chosen": -619.1341552734375, "logps/rejected": -447.75372314453125, "loss": 0.5774, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1276620477437973, "rewards/margins": 0.40843528509140015, "rewards/rejected": -0.5360973477363586, "step": 780 }, { "epoch": 0.86, "grad_norm": 2.1196217775360657, "learning_rate": 4.803136831925228e-07, "logits/chosen": 4.038321495056152, "logits/rejected": 4.138734817504883, "logps/chosen": -558.1214599609375, "logps/rejected": -492.8551330566406, "loss": 0.5433, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.07264934480190277, "rewards/margins": 0.48099589347839355, "rewards/rejected": -0.5536452531814575, "step": 790 }, { "epoch": 0.87, "grad_norm": 3.0153920473437883, "learning_rate": 4.797468176697817e-07, "logits/chosen": 4.080142021179199, "logits/rejected": 4.077189922332764, "logps/chosen": -550.8026123046875, "logps/rejected": -462.605712890625, "loss": 0.558, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.23426461219787598, "rewards/margins": 0.3467225134372711, "rewards/rejected": -0.5809870958328247, "step": 800 }, { "epoch": 0.87, "eval_logits/chosen": 4.064155101776123, "eval_logits/rejected": 4.270414352416992, "eval_logps/chosen": -552.982177734375, "eval_logps/rejected": -477.1144104003906, "eval_loss": 0.5715343952178955, "eval_rewards/accuracies": 0.726190447807312, "eval_rewards/chosen": -0.09645335376262665, "eval_rewards/margins": 0.43394559621810913, "eval_rewards/rejected": -0.530398964881897, "eval_runtime": 202.7946, "eval_samples_per_second": 9.862, "eval_steps_per_second": 0.311, "step": 800 }, { "epoch": 0.88, "grad_norm": 3.1242415701188437, "learning_rate": 4.791722504140557e-07, "logits/chosen": 3.9817912578582764, "logits/rejected": 3.9899230003356934, "logps/chosen": -539.9017944335938, "logps/rejected": -485.11749267578125, "loss": 0.5644, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.18518143892288208, "rewards/margins": 0.39562076330184937, "rewards/rejected": -0.5808022618293762, "step": 810 }, { "epoch": 0.89, "grad_norm": 4.364703585921067, "learning_rate": 4.785900006863886e-07, "logits/chosen": 3.8450775146484375, "logits/rejected": 3.8901209831237793, "logps/chosen": -592.6529541015625, "logps/rejected": -499.44488525390625, "loss": 0.541, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1347336769104004, "rewards/margins": 0.4826177954673767, "rewards/rejected": -0.6173514723777771, "step": 820 }, { "epoch": 0.9, "grad_norm": 2.233604071674828, "learning_rate": 4.780000880053617e-07, "logits/chosen": 3.895159959793091, "logits/rejected": 4.225598335266113, "logps/chosen": -576.2763671875, "logps/rejected": -453.64306640625, "loss": 0.5504, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3578268885612488, "rewards/margins": 0.4358748495578766, "rewards/rejected": -0.793701708316803, "step": 830 }, { "epoch": 0.91, "grad_norm": 2.8171329385172004, "learning_rate": 4.774025321464393e-07, "logits/chosen": 3.9733738899230957, "logits/rejected": 4.090386390686035, "logps/chosen": -548.554443359375, "logps/rejected": -460.45391845703125, "loss": 0.5436, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.22730925679206848, "rewards/margins": 0.4536939263343811, "rewards/rejected": -0.6810031533241272, "step": 840 }, { "epoch": 0.92, "grad_norm": 2.597837995051063, "learning_rate": 4.7679735314130554e-07, "logits/chosen": 3.9798221588134766, "logits/rejected": 4.143235683441162, "logps/chosen": -604.2347412109375, "logps/rejected": -508.7215881347656, "loss": 0.5669, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15413324534893036, "rewards/margins": 0.5176432728767395, "rewards/rejected": -0.6717765927314758, "step": 850 }, { "epoch": 0.93, "grad_norm": 2.8211401150825086, "learning_rate": 4.761845712771928e-07, "logits/chosen": 4.0112457275390625, "logits/rejected": 3.9924988746643066, "logps/chosen": -494.43121337890625, "logps/rejected": -462.6639099121094, "loss": 0.5487, "rewards/accuracies": 0.75, "rewards/chosen": -0.3942197859287262, "rewards/margins": 0.46237269043922424, "rewards/rejected": -0.8565924763679504, "step": 860 }, { "epoch": 0.94, "grad_norm": 2.726543592078615, "learning_rate": 4.755642070962019e-07, "logits/chosen": 4.057265281677246, "logits/rejected": 4.079537868499756, "logps/chosen": -638.6723022460938, "logps/rejected": -520.6543579101562, "loss": 0.5569, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.21904997527599335, "rewards/margins": 0.5892314910888672, "rewards/rejected": -0.8082815408706665, "step": 870 }, { "epoch": 0.96, "grad_norm": 2.42602462885076, "learning_rate": 4.749362813946134e-07, "logits/chosen": 4.092899322509766, "logits/rejected": 4.094930648803711, "logps/chosen": -548.0364379882812, "logps/rejected": -462.27374267578125, "loss": 0.5519, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.1817770004272461, "rewards/margins": 0.5349105000495911, "rewards/rejected": -0.7166875004768372, "step": 880 }, { "epoch": 0.97, "grad_norm": 3.503506476250686, "learning_rate": 4.743008152221904e-07, "logits/chosen": 3.9905002117156982, "logits/rejected": 4.0423197746276855, "logps/chosen": -543.5731201171875, "logps/rejected": -484.5904846191406, "loss": 0.5581, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.38738584518432617, "rewards/margins": 0.41433319449424744, "rewards/rejected": -0.801719069480896, "step": 890 }, { "epoch": 0.98, "grad_norm": 2.6298968847563513, "learning_rate": 4.7365782988147297e-07, "logits/chosen": 3.9976742267608643, "logits/rejected": 4.120612144470215, "logps/chosen": -532.8512573242188, "logps/rejected": -476.075927734375, "loss": 0.5495, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3852420449256897, "rewards/margins": 0.4661009907722473, "rewards/rejected": -0.851343035697937, "step": 900 }, { "epoch": 0.98, "eval_logits/chosen": 4.001543998718262, "eval_logits/rejected": 4.197599411010742, "eval_logps/chosen": -569.9209594726562, "eval_logps/rejected": -500.8484191894531, "eval_loss": 0.5551918745040894, "eval_rewards/accuracies": 0.7341269850730896, "eval_rewards/chosen": -0.26584070920944214, "eval_rewards/margins": 0.5018988251686096, "eval_rewards/rejected": -0.7677395343780518, "eval_runtime": 202.8629, "eval_samples_per_second": 9.859, "eval_steps_per_second": 0.311, "step": 900 }, { "epoch": 0.99, "grad_norm": 2.0947908923652254, "learning_rate": 4.73007346927064e-07, "logits/chosen": 3.878185749053955, "logits/rejected": 4.080098628997803, "logps/chosen": -557.4278564453125, "logps/rejected": -478.7586975097656, "loss": 0.5554, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.3504122793674469, "rewards/margins": 0.43874025344848633, "rewards/rejected": -0.7891525030136108, "step": 910 }, { "epoch": 1.0, "grad_norm": 2.61481570805474, "learning_rate": 4.7234938816490643e-07, "logits/chosen": 3.984571933746338, "logits/rejected": 4.107733249664307, "logps/chosen": -548.0833740234375, "logps/rejected": -495.656005859375, "loss": 0.5533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2844906747341156, "rewards/margins": 0.4039286673069, "rewards/rejected": -0.6884194016456604, "step": 920 }, { "epoch": 1.01, "grad_norm": 2.639868915389026, "learning_rate": 4.7168397565155264e-07, "logits/chosen": 3.9218106269836426, "logits/rejected": 4.216797828674316, "logps/chosen": -566.1287231445312, "logps/rejected": -468.50506591796875, "loss": 0.5275, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.3953813314437866, "rewards/margins": 0.6424225568771362, "rewards/rejected": -1.0378040075302124, "step": 930 }, { "epoch": 1.02, "grad_norm": 3.72502140934014, "learning_rate": 4.710111316934248e-07, "logits/chosen": 4.068375110626221, "logits/rejected": 4.248932838439941, "logps/chosen": -568.8611450195312, "logps/rejected": -518.5789184570312, "loss": 0.5293, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.41718488931655884, "rewards/margins": 0.45708101987838745, "rewards/rejected": -0.8742658495903015, "step": 940 }, { "epoch": 1.03, "grad_norm": 2.3680192609419124, "learning_rate": 4.7033087884606713e-07, "logits/chosen": 3.8009941577911377, "logits/rejected": 3.9223339557647705, "logps/chosen": -525.62548828125, "logps/rejected": -463.38397216796875, "loss": 0.5481, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5579544305801392, "rewards/margins": 0.34062978625297546, "rewards/rejected": -0.898584246635437, "step": 950 }, { "epoch": 1.04, "grad_norm": 2.1297712756488445, "learning_rate": 4.6964323991338973e-07, "logits/chosen": 3.92254900932312, "logits/rejected": 4.1782660484313965, "logps/chosen": -588.7681274414062, "logps/rejected": -488.18597412109375, "loss": 0.5315, "rewards/accuracies": 0.78125, "rewards/chosen": -0.37962308526039124, "rewards/margins": 0.49557191133499146, "rewards/rejected": -0.8751950263977051, "step": 960 }, { "epoch": 1.05, "grad_norm": 3.11704708081821, "learning_rate": 4.6894823794690436e-07, "logits/chosen": 3.853797435760498, "logits/rejected": 3.8071129322052, "logps/chosen": -546.7620239257812, "logps/rejected": -489.9036560058594, "loss": 0.5353, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6293502449989319, "rewards/margins": 0.4502403736114502, "rewards/rejected": -1.0795905590057373, "step": 970 }, { "epoch": 1.06, "grad_norm": 2.595678646905618, "learning_rate": 4.6824589624495136e-07, "logits/chosen": 3.8109824657440186, "logits/rejected": 4.0372538566589355, "logps/chosen": -553.0970458984375, "logps/rejected": -498.0740661621094, "loss": 0.5355, "rewards/accuracies": 0.75, "rewards/chosen": -0.4165034294128418, "rewards/margins": 0.5991848111152649, "rewards/rejected": -1.0156883001327515, "step": 980 }, { "epoch": 1.07, "grad_norm": 2.2526945379801053, "learning_rate": 4.6753623835191903e-07, "logits/chosen": 3.806671142578125, "logits/rejected": 3.963538408279419, "logps/chosen": -536.47705078125, "logps/rejected": -471.6568298339844, "loss": 0.5257, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.30763205885887146, "rewards/margins": 0.5957274436950684, "rewards/rejected": -0.903359591960907, "step": 990 }, { "epoch": 1.09, "grad_norm": 3.2721785743598715, "learning_rate": 4.668192880574537e-07, "logits/chosen": 3.9289004802703857, "logits/rejected": 4.014190673828125, "logps/chosen": -570.447265625, "logps/rejected": -481.65228271484375, "loss": 0.5124, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.318829745054245, "rewards/margins": 0.5781766772270203, "rewards/rejected": -0.8970065116882324, "step": 1000 }, { "epoch": 1.09, "eval_logits/chosen": 3.9124913215637207, "eval_logits/rejected": 4.095924377441406, "eval_logps/chosen": -582.042724609375, "eval_logps/rejected": -518.0128784179688, "eval_loss": 0.5473096966743469, "eval_rewards/accuracies": 0.7321428656578064, "eval_rewards/chosen": -0.38705816864967346, "eval_rewards/margins": 0.5523259043693542, "eval_rewards/rejected": -0.9393841624259949, "eval_runtime": 203.1299, "eval_samples_per_second": 9.846, "eval_steps_per_second": 0.31, "step": 1000 }, { "epoch": 1.1, "grad_norm": 2.4735244526194125, "learning_rate": 4.6609506939566336e-07, "logits/chosen": 3.8594024181365967, "logits/rejected": 3.8261044025421143, "logps/chosen": -606.83251953125, "logps/rejected": -555.4110717773438, "loss": 0.5417, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.40992632508277893, "rewards/margins": 0.664315938949585, "rewards/rejected": -1.0742422342300415, "step": 1010 }, { "epoch": 1.11, "grad_norm": 3.452879055757116, "learning_rate": 4.653636066443105e-07, "logits/chosen": 3.8114733695983887, "logits/rejected": 3.984539747238159, "logps/chosen": -678.3446044921875, "logps/rejected": -560.6809692382812, "loss": 0.5451, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4688519835472107, "rewards/margins": 0.5835325121879578, "rewards/rejected": -1.0523844957351685, "step": 1020 }, { "epoch": 1.12, "grad_norm": 2.3604158315865336, "learning_rate": 4.646249243239996e-07, "logits/chosen": 3.7811264991760254, "logits/rejected": 3.891242504119873, "logps/chosen": -549.7884521484375, "logps/rejected": -494.5982971191406, "loss": 0.5057, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.48677635192871094, "rewards/margins": 0.5802093744277954, "rewards/rejected": -1.066985845565796, "step": 1030 }, { "epoch": 1.13, "grad_norm": 2.911695118266272, "learning_rate": 4.6387904719735426e-07, "logits/chosen": 3.8690688610076904, "logits/rejected": 4.009054660797119, "logps/chosen": -642.6327514648438, "logps/rejected": -560.8517456054688, "loss": 0.5572, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3320097327232361, "rewards/margins": 0.535895824432373, "rewards/rejected": -0.8679056167602539, "step": 1040 }, { "epoch": 1.14, "grad_norm": 2.781369434499782, "learning_rate": 4.631260002681876e-07, "logits/chosen": 3.9193673133850098, "logits/rejected": 3.9731242656707764, "logps/chosen": -576.6608276367188, "logps/rejected": -497.3514709472656, "loss": 0.5372, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.45688796043395996, "rewards/margins": 0.5704649686813354, "rewards/rejected": -1.0273528099060059, "step": 1050 }, { "epoch": 1.15, "grad_norm": 3.027185161911988, "learning_rate": 4.6236580878066354e-07, "logits/chosen": 3.7078070640563965, "logits/rejected": 3.9149177074432373, "logps/chosen": -577.9295043945312, "logps/rejected": -520.2188110351562, "loss": 0.5307, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.45761507749557495, "rewards/margins": 0.569358766078949, "rewards/rejected": -1.0269739627838135, "step": 1060 }, { "epoch": 1.16, "grad_norm": 2.81530979384369, "learning_rate": 4.6159849821845134e-07, "logits/chosen": 3.8268768787384033, "logits/rejected": 3.9492759704589844, "logps/chosen": -558.7430419921875, "logps/rejected": -495.2325134277344, "loss": 0.5187, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4992932379245758, "rewards/margins": 0.5600478053092957, "rewards/rejected": -1.0593410730361938, "step": 1070 }, { "epoch": 1.17, "grad_norm": 3.004411745900334, "learning_rate": 4.6082409430387036e-07, "logits/chosen": 3.7030282020568848, "logits/rejected": 3.8176980018615723, "logps/chosen": -637.4542846679688, "logps/rejected": -537.9615478515625, "loss": 0.5298, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4201931357383728, "rewards/margins": 0.6639485359191895, "rewards/rejected": -1.084141731262207, "step": 1080 }, { "epoch": 1.18, "grad_norm": 2.608455246691224, "learning_rate": 4.600426229970287e-07, "logits/chosen": 3.752833843231201, "logits/rejected": 3.9438960552215576, "logps/chosen": -566.6677856445312, "logps/rejected": -518.1152954101562, "loss": 0.5307, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.419950008392334, "rewards/margins": 0.5896099805831909, "rewards/rejected": -1.009559988975525, "step": 1090 }, { "epoch": 1.19, "grad_norm": 3.071459738601912, "learning_rate": 4.59254110494952e-07, "logits/chosen": 3.917219638824463, "logits/rejected": 4.227973937988281, "logps/chosen": -599.9447021484375, "logps/rejected": -491.8515625, "loss": 0.5322, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3520606756210327, "rewards/margins": 0.5913389921188354, "rewards/rejected": -0.9433996081352234, "step": 1100 }, { "epoch": 1.19, "eval_logits/chosen": 3.871466636657715, "eval_logits/rejected": 4.0435919761657715, "eval_logps/chosen": -579.7517700195312, "eval_logps/rejected": -518.7011108398438, "eval_loss": 0.5400219559669495, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": -0.3641493618488312, "eval_rewards/margins": 0.5821177363395691, "eval_rewards/rejected": -0.9462669491767883, "eval_runtime": 202.8047, "eval_samples_per_second": 9.862, "eval_steps_per_second": 0.311, "step": 1100 }, { "epoch": 1.21, "grad_norm": 2.3553195362992825, "learning_rate": 4.5845858323070635e-07, "logits/chosen": 3.76731538772583, "logits/rejected": 3.8337559700012207, "logps/chosen": -547.681884765625, "logps/rejected": -493.3389587402344, "loss": 0.5266, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4557339549064636, "rewards/margins": 0.5228798985481262, "rewards/rejected": -0.9786139726638794, "step": 1110 }, { "epoch": 1.22, "grad_norm": 2.3470086606185214, "learning_rate": 4.5765606787251107e-07, "logits/chosen": 3.7126567363739014, "logits/rejected": 4.070954322814941, "logps/chosen": -620.4693603515625, "logps/rejected": -516.2987670898438, "loss": 0.5152, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4025896489620209, "rewards/margins": 0.788905918598175, "rewards/rejected": -1.191495656967163, "step": 1120 }, { "epoch": 1.23, "grad_norm": 4.949775862620086, "learning_rate": 4.5684659132284564e-07, "logits/chosen": 3.6738791465759277, "logits/rejected": 3.7427947521209717, "logps/chosen": -600.7318725585938, "logps/rejected": -527.5272216796875, "loss": 0.5198, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.552603006362915, "rewards/margins": 0.5690030455589294, "rewards/rejected": -1.1216061115264893, "step": 1130 }, { "epoch": 1.24, "grad_norm": 3.206426056624094, "learning_rate": 4.5603018071754713e-07, "logits/chosen": 3.661280393600464, "logits/rejected": 3.9159374237060547, "logps/chosen": -653.1373291015625, "logps/rejected": -562.4027709960938, "loss": 0.5258, "rewards/accuracies": 0.75, "rewards/chosen": -0.4949869215488434, "rewards/margins": 0.6851035356521606, "rewards/rejected": -1.180090308189392, "step": 1140 }, { "epoch": 1.25, "grad_norm": 2.2769394974170676, "learning_rate": 4.55206863424901e-07, "logits/chosen": 3.6856913566589355, "logits/rejected": 4.0145063400268555, "logps/chosen": -568.9828491210938, "logps/rejected": -486.29656982421875, "loss": 0.5186, "rewards/accuracies": 0.75, "rewards/chosen": -0.47184038162231445, "rewards/margins": 0.7080105543136597, "rewards/rejected": -1.1798509359359741, "step": 1150 }, { "epoch": 1.26, "grad_norm": 2.359734545391228, "learning_rate": 4.5437666704472355e-07, "logits/chosen": 3.7498373985290527, "logits/rejected": 3.9770877361297607, "logps/chosen": -580.9403686523438, "logps/rejected": -529.2962646484375, "loss": 0.529, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5413922071456909, "rewards/margins": 0.5566983222961426, "rewards/rejected": -1.0980905294418335, "step": 1160 }, { "epoch": 1.27, "grad_norm": 2.89031946703898, "learning_rate": 4.535396194074366e-07, "logits/chosen": 3.727942943572998, "logits/rejected": 3.9232964515686035, "logps/chosen": -600.3663330078125, "logps/rejected": -554.5781860351562, "loss": 0.5045, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3358447551727295, "rewards/margins": 0.7004222869873047, "rewards/rejected": -1.0362670421600342, "step": 1170 }, { "epoch": 1.28, "grad_norm": 2.8945187135010166, "learning_rate": 4.526957485731344e-07, "logits/chosen": 3.6489124298095703, "logits/rejected": 3.7614033222198486, "logps/chosen": -664.4152221679688, "logps/rejected": -542.4786987304688, "loss": 0.5471, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.45210176706314087, "rewards/margins": 0.5391023755073547, "rewards/rejected": -0.9912041425704956, "step": 1180 }, { "epoch": 1.29, "grad_norm": 2.67596171281522, "learning_rate": 4.518450828306436e-07, "logits/chosen": 3.7186882495880127, "logits/rejected": 3.734210968017578, "logps/chosen": -583.5777587890625, "logps/rejected": -533.0770263671875, "loss": 0.5204, "rewards/accuracies": 0.78125, "rewards/chosen": -0.579624354839325, "rewards/margins": 0.6160343289375305, "rewards/rejected": -1.1956586837768555, "step": 1190 }, { "epoch": 1.3, "grad_norm": 2.4346554508723615, "learning_rate": 4.509876506965742e-07, "logits/chosen": 3.6612792015075684, "logits/rejected": 3.8886547088623047, "logps/chosen": -599.6268310546875, "logps/rejected": -515.5652465820312, "loss": 0.5281, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.605526864528656, "rewards/margins": 0.708820104598999, "rewards/rejected": -1.3143469095230103, "step": 1200 }, { "epoch": 1.3, "eval_logits/chosen": 3.7841575145721436, "eval_logits/rejected": 3.9368276596069336, "eval_logps/chosen": -596.7364501953125, "eval_logps/rejected": -539.057861328125, "eval_loss": 0.534388542175293, "eval_rewards/accuracies": 0.7460317611694336, "eval_rewards/chosen": -0.5339952707290649, "eval_rewards/margins": 0.6158384680747986, "eval_rewards/rejected": -1.1498335599899292, "eval_runtime": 203.2584, "eval_samples_per_second": 9.84, "eval_steps_per_second": 0.31, "step": 1200 }, { "epoch": 1.31, "grad_norm": 3.695970636776523, "learning_rate": 4.501234809143637e-07, "logits/chosen": 3.718877077102661, "logits/rejected": 3.7873053550720215, "logps/chosen": -593.6304321289062, "logps/rejected": -524.5706787109375, "loss": 0.5043, "rewards/accuracies": 0.75, "rewards/chosen": -0.5071628093719482, "rewards/margins": 0.7255369424819946, "rewards/rejected": -1.2326997518539429, "step": 1210 }, { "epoch": 1.32, "grad_norm": 2.3954534729791033, "learning_rate": 4.492526024533143e-07, "logits/chosen": 3.745767116546631, "logits/rejected": 3.882204532623291, "logps/chosen": -592.2398681640625, "logps/rejected": -523.0452270507812, "loss": 0.5221, "rewards/accuracies": 0.6875, "rewards/chosen": -0.566536545753479, "rewards/margins": 0.5222768783569336, "rewards/rejected": -1.0888134241104126, "step": 1220 }, { "epoch": 1.34, "grad_norm": 3.0519224807574847, "learning_rate": 4.4837504450762067e-07, "logits/chosen": 3.8677144050598145, "logits/rejected": 3.9290356636047363, "logps/chosen": -618.3121948242188, "logps/rejected": -562.8489990234375, "loss": 0.5103, "rewards/accuracies": 0.75, "rewards/chosen": -0.35622692108154297, "rewards/margins": 0.6840597987174988, "rewards/rejected": -1.0402867794036865, "step": 1230 }, { "epoch": 1.35, "grad_norm": 2.761352538082151, "learning_rate": 4.4749083649539204e-07, "logits/chosen": 3.855912685394287, "logits/rejected": 3.8187203407287598, "logps/chosen": -574.2819213867188, "logps/rejected": -547.544189453125, "loss": 0.5093, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.48297685384750366, "rewards/margins": 0.7096476554870605, "rewards/rejected": -1.1926246881484985, "step": 1240 }, { "epoch": 1.36, "grad_norm": 2.44118330758894, "learning_rate": 4.466000080576659e-07, "logits/chosen": 3.6384823322296143, "logits/rejected": 3.8135483264923096, "logps/chosen": -582.2689208984375, "logps/rejected": -531.7244873046875, "loss": 0.5101, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6522843241691589, "rewards/margins": 0.6011296510696411, "rewards/rejected": -1.2534139156341553, "step": 1250 }, { "epoch": 1.37, "grad_norm": 3.0536070990003177, "learning_rate": 4.4570258905741417e-07, "logits/chosen": 3.7012696266174316, "logits/rejected": 3.7779288291931152, "logps/chosen": -582.983642578125, "logps/rejected": -548.3180541992188, "loss": 0.5103, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6485563516616821, "rewards/margins": 0.5782519578933716, "rewards/rejected": -1.2268083095550537, "step": 1260 }, { "epoch": 1.38, "grad_norm": 2.4433145304612522, "learning_rate": 4.447986095785421e-07, "logits/chosen": 3.6619350910186768, "logits/rejected": 3.8520569801330566, "logps/chosen": -629.6549682617188, "logps/rejected": -559.6665649414062, "loss": 0.5246, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5843095183372498, "rewards/margins": 0.6479975581169128, "rewards/rejected": -1.2323070764541626, "step": 1270 }, { "epoch": 1.39, "grad_norm": 2.7685908669959094, "learning_rate": 4.4388809992487996e-07, "logits/chosen": 3.6826343536376953, "logits/rejected": 3.9680213928222656, "logps/chosen": -596.3795776367188, "logps/rejected": -489.682861328125, "loss": 0.521, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5391322374343872, "rewards/margins": 0.6415873765945435, "rewards/rejected": -1.1807196140289307, "step": 1280 }, { "epoch": 1.4, "grad_norm": 2.462386287502182, "learning_rate": 4.4297109061916725e-07, "logits/chosen": 3.736112117767334, "logits/rejected": 3.9865105152130127, "logps/chosen": -615.0040893554688, "logps/rejected": -472.54864501953125, "loss": 0.519, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4535880982875824, "rewards/margins": 0.651738166809082, "rewards/rejected": -1.1053262948989868, "step": 1290 }, { "epoch": 1.41, "grad_norm": 2.618384827610224, "learning_rate": 4.420476124020291e-07, "logits/chosen": 3.7563376426696777, "logits/rejected": 3.8841323852539062, "logps/chosen": -584.8636474609375, "logps/rejected": -542.523681640625, "loss": 0.5063, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.47505125403404236, "rewards/margins": 0.5914508104324341, "rewards/rejected": -1.0665019750595093, "step": 1300 }, { "epoch": 1.41, "eval_logits/chosen": 3.8498694896698, "eval_logits/rejected": 4.013454914093018, "eval_logps/chosen": -580.8731079101562, "eval_logps/rejected": -523.8220825195312, "eval_loss": 0.5296512246131897, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": -0.37536194920539856, "eval_rewards/margins": 0.6221145391464233, "eval_rewards/rejected": -0.9974763989448547, "eval_runtime": 202.8973, "eval_samples_per_second": 9.857, "eval_steps_per_second": 0.311, "step": 1300 }, { "epoch": 1.42, "grad_norm": 5.907008056312174, "learning_rate": 4.411176962309461e-07, "logits/chosen": 3.8201069831848145, "logits/rejected": 3.972466230392456, "logps/chosen": -662.3941650390625, "logps/rejected": -575.9456176757812, "loss": 0.4931, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3811381459236145, "rewards/margins": 0.6734236478805542, "rewards/rejected": -1.0545618534088135, "step": 1310 }, { "epoch": 1.43, "grad_norm": 3.6381197531260185, "learning_rate": 4.4018137327921633e-07, "logits/chosen": 3.693995714187622, "logits/rejected": 3.833395004272461, "logps/chosen": -566.2265625, "logps/rejected": -505.2315979003906, "loss": 0.5139, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.516391396522522, "rewards/margins": 0.6587409973144531, "rewards/rejected": -1.1751322746276855, "step": 1320 }, { "epoch": 1.44, "grad_norm": 3.1695698040135647, "learning_rate": 4.3923867493491057e-07, "logits/chosen": 3.733046770095825, "logits/rejected": 4.046479225158691, "logps/chosen": -604.51513671875, "logps/rejected": -487.60638427734375, "loss": 0.5011, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.321003794670105, "rewards/margins": 0.698340117931366, "rewards/rejected": -1.0193439722061157, "step": 1330 }, { "epoch": 1.45, "grad_norm": 2.8032158154033904, "learning_rate": 4.3828963279981994e-07, "logits/chosen": 3.8844008445739746, "logits/rejected": 4.010292053222656, "logps/chosen": -568.9833984375, "logps/rejected": -535.6318969726562, "loss": 0.5126, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.46742120385169983, "rewards/margins": 0.6064526438713074, "rewards/rejected": -1.0738738775253296, "step": 1340 }, { "epoch": 1.47, "grad_norm": 2.5088126336091148, "learning_rate": 4.3733427868839645e-07, "logits/chosen": 3.8523497581481934, "logits/rejected": 3.7402586936950684, "logps/chosen": -551.6283569335938, "logps/rejected": -521.6468505859375, "loss": 0.5311, "rewards/accuracies": 0.6875, "rewards/chosen": -0.49471384286880493, "rewards/margins": 0.6895469427108765, "rewards/rejected": -1.184260606765747, "step": 1350 }, { "epoch": 1.48, "grad_norm": 2.9949133381363153, "learning_rate": 4.3637264462668664e-07, "logits/chosen": 3.510672092437744, "logits/rejected": 3.69635009765625, "logps/chosen": -603.7630615234375, "logps/rejected": -495.576904296875, "loss": 0.5253, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.48227667808532715, "rewards/margins": 0.6299175024032593, "rewards/rejected": -1.112194299697876, "step": 1360 }, { "epoch": 1.49, "grad_norm": 2.9594466747673804, "learning_rate": 4.35404762851258e-07, "logits/chosen": 3.5656349658966064, "logits/rejected": 3.833620071411133, "logps/chosen": -549.580078125, "logps/rejected": -528.0352783203125, "loss": 0.5199, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5975288152694702, "rewards/margins": 0.6607010364532471, "rewards/rejected": -1.2582299709320068, "step": 1370 }, { "epoch": 1.5, "grad_norm": 2.8641396810139015, "learning_rate": 4.34430665808118e-07, "logits/chosen": 3.7431271076202393, "logits/rejected": 3.898761034011841, "logps/chosen": -635.0924682617188, "logps/rejected": -550.517578125, "loss": 0.5135, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.41983968019485474, "rewards/margins": 0.6735917329788208, "rewards/rejected": -1.0934313535690308, "step": 1380 }, { "epoch": 1.51, "grad_norm": 2.6138220257213933, "learning_rate": 4.3345038615162687e-07, "logits/chosen": 3.7801513671875, "logits/rejected": 3.9459152221679688, "logps/chosen": -618.5330810546875, "logps/rejected": -526.7120361328125, "loss": 0.5401, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.41865187883377075, "rewards/margins": 0.6133507490158081, "rewards/rejected": -1.0320026874542236, "step": 1390 }, { "epoch": 1.52, "grad_norm": 2.3348133744227595, "learning_rate": 4.324639567434026e-07, "logits/chosen": 3.724447727203369, "logits/rejected": 3.9371161460876465, "logps/chosen": -607.0804443359375, "logps/rejected": -523.1050415039062, "loss": 0.5073, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.33897972106933594, "rewards/margins": 0.7490689158439636, "rewards/rejected": -1.0880486965179443, "step": 1400 }, { "epoch": 1.52, "eval_logits/chosen": 3.784623384475708, "eval_logits/rejected": 3.9400875568389893, "eval_logps/chosen": -581.5236206054688, "eval_logps/rejected": -527.0737915039062, "eval_loss": 0.5216101408004761, "eval_rewards/accuracies": 0.7757936716079712, "eval_rewards/chosen": -0.3818674385547638, "eval_rewards/margins": 0.6481255888938904, "eval_rewards/rejected": -1.029992938041687, "eval_runtime": 202.8107, "eval_samples_per_second": 9.861, "eval_steps_per_second": 0.311, "step": 1400 }, { "epoch": 1.53, "grad_norm": 3.5770222334742705, "learning_rate": 4.314714106512195e-07, "logits/chosen": 3.729435443878174, "logits/rejected": 4.007909774780273, "logps/chosen": -589.6553955078125, "logps/rejected": -514.7840576171875, "loss": 0.5088, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.4252251982688904, "rewards/margins": 0.7190917134284973, "rewards/rejected": -1.1443169116973877, "step": 1410 }, { "epoch": 1.54, "grad_norm": 3.563305957531453, "learning_rate": 4.304727811478995e-07, "logits/chosen": 3.6431992053985596, "logits/rejected": 3.8974602222442627, "logps/chosen": -645.6729736328125, "logps/rejected": -540.0687255859375, "loss": 0.5065, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3529585897922516, "rewards/margins": 0.7727079391479492, "rewards/rejected": -1.1256663799285889, "step": 1420 }, { "epoch": 1.55, "grad_norm": 2.58757534266237, "learning_rate": 4.294681017101972e-07, "logits/chosen": 3.654217481613159, "logits/rejected": 3.7179622650146484, "logps/chosen": -589.9276123046875, "logps/rejected": -514.4965209960938, "loss": 0.4785, "rewards/accuracies": 0.75, "rewards/chosen": -0.5733104944229126, "rewards/margins": 0.6313902139663696, "rewards/rejected": -1.2047007083892822, "step": 1430 }, { "epoch": 1.56, "grad_norm": 3.0022317826313167, "learning_rate": 4.2845740601767697e-07, "logits/chosen": 3.5225296020507812, "logits/rejected": 3.619579315185547, "logps/chosen": -579.6800537109375, "logps/rejected": -494.28399658203125, "loss": 0.496, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4841237962245941, "rewards/margins": 0.6999977231025696, "rewards/rejected": -1.1841213703155518, "step": 1440 }, { "epoch": 1.57, "grad_norm": 3.2680642270849556, "learning_rate": 4.2744072795158446e-07, "logits/chosen": 3.601937770843506, "logits/rejected": 3.6645870208740234, "logps/chosen": -604.373046875, "logps/rejected": -517.3341064453125, "loss": 0.5007, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5202440023422241, "rewards/margins": 0.5371121168136597, "rewards/rejected": -1.0573561191558838, "step": 1450 }, { "epoch": 1.59, "grad_norm": 2.7738096169935598, "learning_rate": 4.264181015937105e-07, "logits/chosen": 3.6634833812713623, "logits/rejected": 3.7749035358428955, "logps/chosen": -604.2745361328125, "logps/rejected": -527.6934814453125, "loss": 0.4764, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.5378643274307251, "rewards/margins": 0.8417774438858032, "rewards/rejected": -1.3796416521072388, "step": 1460 }, { "epoch": 1.6, "grad_norm": 2.881528548527839, "learning_rate": 4.2538956122524874e-07, "logits/chosen": 3.3788399696350098, "logits/rejected": 3.563056230545044, "logps/chosen": -621.4329833984375, "logps/rejected": -560.8185424804688, "loss": 0.4908, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7095000147819519, "rewards/margins": 0.7638419270515442, "rewards/rejected": -1.473341941833496, "step": 1470 }, { "epoch": 1.61, "grad_norm": 2.777025235666112, "learning_rate": 4.2435514132564645e-07, "logits/chosen": 3.482332944869995, "logits/rejected": 3.697258472442627, "logps/chosen": -608.84326171875, "logps/rejected": -499.27606201171875, "loss": 0.4907, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.724181592464447, "rewards/margins": 0.7695054411888123, "rewards/rejected": -1.4936869144439697, "step": 1480 }, { "epoch": 1.62, "grad_norm": 2.935178744516972, "learning_rate": 4.233148765714487e-07, "logits/chosen": 3.664395570755005, "logits/rejected": 3.7082161903381348, "logps/chosen": -598.2871704101562, "logps/rejected": -573.3458862304688, "loss": 0.4949, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.6274911761283875, "rewards/margins": 0.8205268979072571, "rewards/rejected": -1.4480180740356445, "step": 1490 }, { "epoch": 1.63, "grad_norm": 3.1100698844265766, "learning_rate": 4.222688018351357e-07, "logits/chosen": 3.55896258354187, "logits/rejected": 3.7258670330047607, "logps/chosen": -677.6727294921875, "logps/rejected": -549.7400512695312, "loss": 0.5156, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5775496959686279, "rewards/margins": 0.7107436060905457, "rewards/rejected": -1.2882932424545288, "step": 1500 }, { "epoch": 1.63, "eval_logits/chosen": 3.6677606105804443, "eval_logits/rejected": 3.78678560256958, "eval_logps/chosen": -600.8123168945312, "eval_logps/rejected": -552.3165893554688, "eval_loss": 0.5176644325256348, "eval_rewards/accuracies": 0.7559523582458496, "eval_rewards/chosen": -0.5747539401054382, "eval_rewards/margins": 0.7076672315597534, "eval_rewards/rejected": -1.2824209928512573, "eval_runtime": 203.422, "eval_samples_per_second": 9.832, "eval_steps_per_second": 0.31, "step": 1500 }, { "epoch": 1.64, "grad_norm": 3.459724119197008, "learning_rate": 4.212169521839541e-07, "logits/chosen": 3.597620725631714, "logits/rejected": 3.7718772888183594, "logps/chosen": -646.1989135742188, "logps/rejected": -560.6222534179688, "loss": 0.4988, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.3835225999355316, "rewards/margins": 0.8813455700874329, "rewards/rejected": -1.2648680210113525, "step": 1510 }, { "epoch": 1.65, "grad_norm": 2.539862474644881, "learning_rate": 4.2015936287874103e-07, "logits/chosen": 3.594446897506714, "logits/rejected": 3.7397289276123047, "logps/chosen": -585.3614501953125, "logps/rejected": -507.7784729003906, "loss": 0.5067, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6375758051872253, "rewards/margins": 0.5888264775276184, "rewards/rejected": -1.2264022827148438, "step": 1520 }, { "epoch": 1.66, "grad_norm": 3.688641112230014, "learning_rate": 4.1909606937274253e-07, "logits/chosen": 3.694026231765747, "logits/rejected": 3.582556962966919, "logps/chosen": -593.2966918945312, "logps/rejected": -550.7011108398438, "loss": 0.494, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5636107325553894, "rewards/margins": 0.7417998313903809, "rewards/rejected": -1.3054105043411255, "step": 1530 }, { "epoch": 1.67, "grad_norm": 3.517111037766342, "learning_rate": 4.180271073104249e-07, "logits/chosen": 3.6457290649414062, "logits/rejected": 3.6459174156188965, "logps/chosen": -600.4559326171875, "logps/rejected": -556.584228515625, "loss": 0.4925, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.6495497226715088, "rewards/margins": 0.7601557970046997, "rewards/rejected": -1.409705400466919, "step": 1540 }, { "epoch": 1.68, "grad_norm": 3.2263768026848734, "learning_rate": 4.169525125262794e-07, "logits/chosen": 3.729513645172119, "logits/rejected": 3.8889973163604736, "logps/chosen": -621.041748046875, "logps/rejected": -568.662841796875, "loss": 0.5129, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5739201903343201, "rewards/margins": 0.6804107427597046, "rewards/rejected": -1.254331111907959, "step": 1550 }, { "epoch": 1.69, "grad_norm": 2.9881390854389966, "learning_rate": 4.158723210436216e-07, "logits/chosen": 3.7290759086608887, "logits/rejected": 3.7448055744171143, "logps/chosen": -617.1058959960938, "logps/rejected": -557.6389770507812, "loss": 0.5065, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5363360643386841, "rewards/margins": 0.7953433394432068, "rewards/rejected": -1.3316794633865356, "step": 1560 }, { "epoch": 1.7, "grad_norm": 3.027069210987796, "learning_rate": 4.147865690733834e-07, "logits/chosen": 3.477890729904175, "logits/rejected": 3.589928388595581, "logps/chosen": -560.6275024414062, "logps/rejected": -495.3143005371094, "loss": 0.5049, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6722136735916138, "rewards/margins": 0.5960069894790649, "rewards/rejected": -1.2682207822799683, "step": 1570 }, { "epoch": 1.72, "grad_norm": 3.4567442624175686, "learning_rate": 4.1369529301289923e-07, "logits/chosen": 3.4488158226013184, "logits/rejected": 3.533224582672119, "logps/chosen": -608.8463134765625, "logps/rejected": -540.799560546875, "loss": 0.4991, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.653535008430481, "rewards/margins": 0.7377526164054871, "rewards/rejected": -1.3912876844406128, "step": 1580 }, { "epoch": 1.73, "grad_norm": 3.173424614233903, "learning_rate": 4.12598529444686e-07, "logits/chosen": 3.5706634521484375, "logits/rejected": 3.687189817428589, "logps/chosen": -646.652099609375, "logps/rejected": -616.9104614257812, "loss": 0.5081, "rewards/accuracies": 0.75, "rewards/chosen": -0.5312900543212891, "rewards/margins": 0.8549784421920776, "rewards/rejected": -1.3862684965133667, "step": 1590 }, { "epoch": 1.74, "grad_norm": 2.9125334834126515, "learning_rate": 4.114963151352166e-07, "logits/chosen": 3.7563605308532715, "logits/rejected": 3.8214287757873535, "logps/chosen": -587.04833984375, "logps/rejected": -572.2859497070312, "loss": 0.5072, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5536529421806335, "rewards/margins": 0.6972201466560364, "rewards/rejected": -1.2508732080459595, "step": 1600 }, { "epoch": 1.74, "eval_logits/chosen": 3.661421775817871, "eval_logits/rejected": 3.7790794372558594, "eval_logps/chosen": -593.063720703125, "eval_logps/rejected": -545.2913818359375, "eval_loss": 0.5138276815414429, "eval_rewards/accuracies": 0.7797619104385376, "eval_rewards/chosen": -0.4972679913043976, "eval_rewards/margins": 0.71490079164505, "eval_rewards/rejected": -1.2121686935424805, "eval_runtime": 203.2579, "eval_samples_per_second": 9.84, "eval_steps_per_second": 0.31, "step": 1600 }, { "epoch": 1.75, "grad_norm": 2.8349589434640756, "learning_rate": 4.103886870336875e-07, "logits/chosen": 3.5567550659179688, "logits/rejected": 3.544466733932495, "logps/chosen": -576.1727905273438, "logps/rejected": -553.1214599609375, "loss": 0.5018, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6350575089454651, "rewards/margins": 0.660717785358429, "rewards/rejected": -1.2957751750946045, "step": 1610 }, { "epoch": 1.76, "grad_norm": 4.003556107005901, "learning_rate": 4.0927568227078016e-07, "logits/chosen": 3.5722336769104004, "logits/rejected": 3.902299404144287, "logps/chosen": -658.9724731445312, "logps/rejected": -556.3914794921875, "loss": 0.501, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.4931603968143463, "rewards/margins": 0.7726175785064697, "rewards/rejected": -1.2657779455184937, "step": 1620 }, { "epoch": 1.77, "grad_norm": 3.6688857060794633, "learning_rate": 4.0815733815741594e-07, "logits/chosen": 3.5408577919006348, "logits/rejected": 3.70662260055542, "logps/chosen": -575.60595703125, "logps/rejected": -498.02947998046875, "loss": 0.4935, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.607037365436554, "rewards/margins": 0.7332677245140076, "rewards/rejected": -1.3403050899505615, "step": 1630 }, { "epoch": 1.78, "grad_norm": 3.335350540464027, "learning_rate": 4.0703369218350605e-07, "logits/chosen": 3.5745227336883545, "logits/rejected": 3.697269916534424, "logps/chosen": -579.6307983398438, "logps/rejected": -512.6817016601562, "loss": 0.4888, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5564176440238953, "rewards/margins": 0.793745219707489, "rewards/rejected": -1.3501628637313843, "step": 1640 }, { "epoch": 1.79, "grad_norm": 3.106413437721439, "learning_rate": 4.0590478201669405e-07, "logits/chosen": 3.532491683959961, "logits/rejected": 3.675520658493042, "logps/chosen": -570.85595703125, "logps/rejected": -524.6241455078125, "loss": 0.4793, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4330361485481262, "rewards/margins": 0.7855597734451294, "rewards/rejected": -1.2185958623886108, "step": 1650 }, { "epoch": 1.8, "grad_norm": 3.2501242606500087, "learning_rate": 4.047706455010936e-07, "logits/chosen": 3.5161030292510986, "logits/rejected": 3.4709973335266113, "logps/chosen": -604.4154663085938, "logps/rejected": -549.8175659179688, "loss": 0.5079, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6928753852844238, "rewards/margins": 0.7038476467132568, "rewards/rejected": -1.3967231512069702, "step": 1660 }, { "epoch": 1.81, "grad_norm": 2.3245973701477403, "learning_rate": 4.0363132065601955e-07, "logits/chosen": 3.5682873725891113, "logits/rejected": 3.549381971359253, "logps/chosen": -601.1841430664062, "logps/rejected": -526.7806396484375, "loss": 0.494, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6761364936828613, "rewards/margins": 0.595120370388031, "rewards/rejected": -1.271256685256958, "step": 1670 }, { "epoch": 1.82, "grad_norm": 3.1404482260870914, "learning_rate": 4.024868456747137e-07, "logits/chosen": 3.565006732940674, "logits/rejected": 3.710141658782959, "logps/chosen": -600.9898681640625, "logps/rejected": -561.8255004882812, "loss": 0.484, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6002925634384155, "rewards/margins": 0.7557550668716431, "rewards/rejected": -1.3560476303100586, "step": 1680 }, { "epoch": 1.83, "grad_norm": 3.3290402883292605, "learning_rate": 4.0133725892306413e-07, "logits/chosen": 3.5321547985076904, "logits/rejected": 3.633854389190674, "logps/chosen": -612.2471923828125, "logps/rejected": -570.8936767578125, "loss": 0.4932, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.5366383790969849, "rewards/margins": 0.9583670496940613, "rewards/rejected": -1.4950053691864014, "step": 1690 }, { "epoch": 1.85, "grad_norm": 3.219385863270907, "learning_rate": 4.001825989383194e-07, "logits/chosen": 3.6114490032196045, "logits/rejected": 3.8093953132629395, "logps/chosen": -582.7833862304688, "logps/rejected": -533.4085083007812, "loss": 0.4908, "rewards/accuracies": 0.75, "rewards/chosen": -0.7001829147338867, "rewards/margins": 0.7178817987442017, "rewards/rejected": -1.4180647134780884, "step": 1700 }, { "epoch": 1.85, "eval_logits/chosen": 3.669623374938965, "eval_logits/rejected": 3.789337396621704, "eval_logps/chosen": -598.1292114257812, "eval_logps/rejected": -553.7918090820312, "eval_loss": 0.507692277431488, "eval_rewards/accuracies": 0.7797619104385376, "eval_rewards/chosen": -0.547922670841217, "eval_rewards/margins": 0.7492501139640808, "eval_rewards/rejected": -1.2971727848052979, "eval_runtime": 203.2134, "eval_samples_per_second": 9.842, "eval_steps_per_second": 0.31, "step": 1700 }, { "epoch": 1.86, "grad_norm": 2.9809501048356926, "learning_rate": 3.990229044277964e-07, "logits/chosen": 3.6091561317443848, "logits/rejected": 3.6311869621276855, "logps/chosen": -575.5233154296875, "logps/rejected": -522.2789306640625, "loss": 0.496, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6322982907295227, "rewards/margins": 0.7211301922798157, "rewards/rejected": -1.3534284830093384, "step": 1710 }, { "epoch": 1.87, "grad_norm": 3.8178999530593045, "learning_rate": 3.97858214267583e-07, "logits/chosen": 3.6715168952941895, "logits/rejected": 3.739348888397217, "logps/chosen": -579.3275146484375, "logps/rejected": -527.568359375, "loss": 0.4925, "rewards/accuracies": 0.75, "rewards/chosen": -0.538645327091217, "rewards/margins": 0.7014168500900269, "rewards/rejected": -1.2400623559951782, "step": 1720 }, { "epoch": 1.88, "grad_norm": 2.88366749396568, "learning_rate": 3.966885675012348e-07, "logits/chosen": 3.6033108234405518, "logits/rejected": 3.6658871173858643, "logps/chosen": -597.9328002929688, "logps/rejected": -576.0477294921875, "loss": 0.4854, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.6976242661476135, "rewards/margins": 0.7374922037124634, "rewards/rejected": -1.4351164102554321, "step": 1730 }, { "epoch": 1.89, "grad_norm": 3.579737238340416, "learning_rate": 3.9551400333846594e-07, "logits/chosen": 3.4998416900634766, "logits/rejected": 3.552649736404419, "logps/chosen": -589.7621459960938, "logps/rejected": -503.41656494140625, "loss": 0.5067, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5329033136367798, "rewards/margins": 0.613740861415863, "rewards/rejected": -1.146644115447998, "step": 1740 }, { "epoch": 1.9, "grad_norm": 2.8824881047105753, "learning_rate": 3.943345611538352e-07, "logits/chosen": 3.6485018730163574, "logits/rejected": 3.7200427055358887, "logps/chosen": -619.30615234375, "logps/rejected": -504.87335205078125, "loss": 0.5088, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.6067255139350891, "rewards/margins": 0.5986908674240112, "rewards/rejected": -1.2054163217544556, "step": 1750 }, { "epoch": 1.91, "grad_norm": 4.378653722783705, "learning_rate": 3.9315028048542564e-07, "logits/chosen": 3.754995346069336, "logits/rejected": 3.7115840911865234, "logps/chosen": -550.1331176757812, "logps/rejected": -486.9710998535156, "loss": 0.5035, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4969119131565094, "rewards/margins": 0.6981099247932434, "rewards/rejected": -1.1950218677520752, "step": 1760 }, { "epoch": 1.92, "grad_norm": 4.133688506753646, "learning_rate": 3.9196120103351946e-07, "logits/chosen": 3.7322287559509277, "logits/rejected": 3.8301990032196045, "logps/chosen": -561.39404296875, "logps/rejected": -551.4658203125, "loss": 0.4846, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.49270591139793396, "rewards/margins": 0.8821004629135132, "rewards/rejected": -1.3748066425323486, "step": 1770 }, { "epoch": 1.93, "grad_norm": 3.604820904113862, "learning_rate": 3.9076736265926704e-07, "logits/chosen": 3.5433831214904785, "logits/rejected": 3.632005214691162, "logps/chosen": -612.3604736328125, "logps/rejected": -566.1151123046875, "loss": 0.4828, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6337317228317261, "rewards/margins": 0.7398926615715027, "rewards/rejected": -1.3736244440078735, "step": 1780 }, { "epoch": 1.94, "grad_norm": 2.7111165769919032, "learning_rate": 3.8956880538335046e-07, "logits/chosen": 3.6864840984344482, "logits/rejected": 3.859823226928711, "logps/chosen": -645.093994140625, "logps/rejected": -595.4602661132812, "loss": 0.4817, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6062914729118347, "rewards/margins": 0.8517727851867676, "rewards/rejected": -1.458064317703247, "step": 1790 }, { "epoch": 1.95, "grad_norm": 3.2623811449746842, "learning_rate": 3.883655693846425e-07, "logits/chosen": 3.588616132736206, "logits/rejected": 3.5179474353790283, "logps/chosen": -559.8778076171875, "logps/rejected": -575.5091552734375, "loss": 0.5109, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7374631762504578, "rewards/margins": 0.7547104358673096, "rewards/rejected": -1.4921735525131226, "step": 1800 }, { "epoch": 1.95, "eval_logits/chosen": 3.65556263923645, "eval_logits/rejected": 3.767853260040283, "eval_logps/chosen": -604.908935546875, "eval_logps/rejected": -563.373291015625, "eval_loss": 0.5067973732948303, "eval_rewards/accuracies": 0.7757936716079712, "eval_rewards/chosen": -0.6157205700874329, "eval_rewards/margins": 0.7772676944732666, "eval_rewards/rejected": -1.3929883241653442, "eval_runtime": 202.9487, "eval_samples_per_second": 9.855, "eval_steps_per_second": 0.31, "step": 1800 }, { "epoch": 1.97, "grad_norm": 3.523477549593873, "learning_rate": 3.87157694998859e-07, "logits/chosen": 3.5332775115966797, "logits/rejected": 3.5374343395233154, "logps/chosen": -559.4539794921875, "logps/rejected": -560.0755615234375, "loss": 0.4717, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7071051597595215, "rewards/margins": 0.756817102432251, "rewards/rejected": -1.463922142982483, "step": 1810 }, { "epoch": 1.98, "grad_norm": 3.8792408245686443, "learning_rate": 3.8594522271720706e-07, "logits/chosen": 3.4940123558044434, "logits/rejected": 3.6339797973632812, "logps/chosen": -605.3863525390625, "logps/rejected": -501.80224609375, "loss": 0.4975, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7022968530654907, "rewards/margins": 0.7580503225326538, "rewards/rejected": -1.4603471755981445, "step": 1820 }, { "epoch": 1.99, "grad_norm": 3.5373167381585247, "learning_rate": 3.8472819318502804e-07, "logits/chosen": 3.5649352073669434, "logits/rejected": 3.7245540618896484, "logps/chosen": -655.5385131835938, "logps/rejected": -578.20947265625, "loss": 0.4745, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6181583404541016, "rewards/margins": 0.8856579065322876, "rewards/rejected": -1.5038163661956787, "step": 1830 }, { "epoch": 2.0, "grad_norm": 3.368152737374401, "learning_rate": 3.83506647200434e-07, "logits/chosen": 3.662012815475464, "logits/rejected": 3.7260677814483643, "logps/chosen": -599.5283813476562, "logps/rejected": -570.4051513671875, "loss": 0.4967, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6230210065841675, "rewards/margins": 0.7484488487243652, "rewards/rejected": -1.3714698553085327, "step": 1840 }, { "epoch": 2.01, "grad_norm": 3.769913619495626, "learning_rate": 3.822806257129413e-07, "logits/chosen": 3.5493359565734863, "logits/rejected": 3.633047580718994, "logps/chosen": -642.0574951171875, "logps/rejected": -601.875, "loss": 0.486, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.689389705657959, "rewards/margins": 0.8089310526847839, "rewards/rejected": -1.4983208179473877, "step": 1850 }, { "epoch": 2.02, "grad_norm": 4.12546180658684, "learning_rate": 3.810501698220967e-07, "logits/chosen": 3.6838059425354004, "logits/rejected": 3.700486660003662, "logps/chosen": -598.7171630859375, "logps/rejected": -605.1309814453125, "loss": 0.4507, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7325755953788757, "rewards/margins": 0.9967571496963501, "rewards/rejected": -1.729332685470581, "step": 1860 }, { "epoch": 2.03, "grad_norm": 3.245909512853775, "learning_rate": 3.7981532077610054e-07, "logits/chosen": 3.6005501747131348, "logits/rejected": 3.594416856765747, "logps/chosen": -607.3951416015625, "logps/rejected": -601.7537841796875, "loss": 0.4989, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.7089284658432007, "rewards/margins": 0.8154546022415161, "rewards/rejected": -1.5243830680847168, "step": 1870 }, { "epoch": 2.04, "grad_norm": 3.5832542745935796, "learning_rate": 3.785761199704233e-07, "logits/chosen": 3.505204439163208, "logits/rejected": 3.4447197914123535, "logps/chosen": -558.6715698242188, "logps/rejected": -525.9684448242188, "loss": 0.4618, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7238224148750305, "rewards/margins": 0.8947445154190063, "rewards/rejected": -1.618566870689392, "step": 1880 }, { "epoch": 2.05, "grad_norm": 3.2568125918545268, "learning_rate": 3.773326089464184e-07, "logits/chosen": 3.495814800262451, "logits/rejected": 3.660496473312378, "logps/chosen": -644.8048095703125, "logps/rejected": -547.1638793945312, "loss": 0.4767, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5708962678909302, "rewards/margins": 0.7958296537399292, "rewards/rejected": -1.3667261600494385, "step": 1890 }, { "epoch": 2.06, "grad_norm": 3.1677742126172275, "learning_rate": 3.7608482938992903e-07, "logits/chosen": 3.419759750366211, "logits/rejected": 3.55739164352417, "logps/chosen": -620.032470703125, "logps/rejected": -574.774169921875, "loss": 0.4779, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.697594404220581, "rewards/margins": 0.7900949716567993, "rewards/rejected": -1.48768949508667, "step": 1900 }, { "epoch": 2.06, "eval_logits/chosen": 3.606205940246582, "eval_logits/rejected": 3.711778402328491, "eval_logps/chosen": -605.808837890625, "eval_logps/rejected": -565.767333984375, "eval_loss": 0.50054931640625, "eval_rewards/accuracies": 0.773809552192688, "eval_rewards/chosen": -0.6247199773788452, "eval_rewards/margins": 0.7922087907791138, "eval_rewards/rejected": -1.416928768157959, "eval_runtime": 202.8417, "eval_samples_per_second": 9.86, "eval_steps_per_second": 0.311, "step": 1900 }, { "epoch": 2.07, "grad_norm": 3.2130345045394435, "learning_rate": 3.7483282312989155e-07, "logits/chosen": 3.6094250679016113, "logits/rejected": 3.8729541301727295, "logps/chosen": -580.1021728515625, "logps/rejected": -531.3250732421875, "loss": 0.4582, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7259335517883301, "rewards/margins": 0.8915154337882996, "rewards/rejected": -1.617449164390564, "step": 1910 }, { "epoch": 2.08, "grad_norm": 3.05589801382563, "learning_rate": 3.735766321369325e-07, "logits/chosen": 3.646768569946289, "logits/rejected": 3.6266236305236816, "logps/chosen": -636.1376953125, "logps/rejected": -559.4783325195312, "loss": 0.4532, "rewards/accuracies": 0.8125, "rewards/chosen": -0.670791506767273, "rewards/margins": 0.8766641616821289, "rewards/rejected": -1.5474556684494019, "step": 1920 }, { "epoch": 2.1, "grad_norm": 3.335600613112421, "learning_rate": 3.7231629852196214e-07, "logits/chosen": 3.4616265296936035, "logits/rejected": 3.530566692352295, "logps/chosen": -620.8572998046875, "logps/rejected": -556.129150390625, "loss": 0.482, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7699755430221558, "rewards/margins": 0.9273629188537598, "rewards/rejected": -1.697338342666626, "step": 1930 }, { "epoch": 2.11, "grad_norm": 4.404120199762387, "learning_rate": 3.710518645347626e-07, "logits/chosen": 3.587388515472412, "logits/rejected": 3.6373343467712402, "logps/chosen": -658.3599853515625, "logps/rejected": -621.992431640625, "loss": 0.4645, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6640130877494812, "rewards/margins": 0.8948653340339661, "rewards/rejected": -1.5588784217834473, "step": 1940 }, { "epoch": 2.12, "grad_norm": 3.1491212381401272, "learning_rate": 3.697833725625713e-07, "logits/chosen": 3.4838454723358154, "logits/rejected": 3.38386869430542, "logps/chosen": -590.5750122070312, "logps/rejected": -557.3521118164062, "loss": 0.4744, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7175976037979126, "rewards/margins": 0.7821000218391418, "rewards/rejected": -1.4996975660324097, "step": 1950 }, { "epoch": 2.13, "grad_norm": 3.6511312470131876, "learning_rate": 3.685108651286605e-07, "logits/chosen": 3.554466962814331, "logits/rejected": 3.6436378955841064, "logps/chosen": -611.2662353515625, "logps/rejected": -538.0736083984375, "loss": 0.4749, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5578540563583374, "rewards/margins": 0.7443768382072449, "rewards/rejected": -1.3022308349609375, "step": 1960 }, { "epoch": 2.14, "grad_norm": 3.830642631650065, "learning_rate": 3.672343848909116e-07, "logits/chosen": 3.5016560554504395, "logits/rejected": 3.5824344158172607, "logps/chosen": -616.0037231445312, "logps/rejected": -570.7191162109375, "loss": 0.4756, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.7442587614059448, "rewards/margins": 0.8685930371284485, "rewards/rejected": -1.6128517389297485, "step": 1970 }, { "epoch": 2.15, "grad_norm": 3.4247382055549207, "learning_rate": 3.6595397464038484e-07, "logits/chosen": 3.4326834678649902, "logits/rejected": 3.503385066986084, "logps/chosen": -631.0132446289062, "logps/rejected": -558.8316650390625, "loss": 0.4611, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5838363170623779, "rewards/margins": 1.001306414604187, "rewards/rejected": -1.585142731666565, "step": 1980 }, { "epoch": 2.16, "grad_norm": 3.7535681296567898, "learning_rate": 3.646696772998854e-07, "logits/chosen": 3.455758571624756, "logits/rejected": 3.6762351989746094, "logps/chosen": -591.5477294921875, "logps/rejected": -526.0713500976562, "loss": 0.4858, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.8906404376029968, "rewards/margins": 0.9278383255004883, "rewards/rejected": -1.8184788227081299, "step": 1990 }, { "epoch": 2.17, "grad_norm": 4.778420887307365, "learning_rate": 3.6338153592252394e-07, "logits/chosen": 3.4087131023406982, "logits/rejected": 3.445772886276245, "logps/chosen": -578.9254760742188, "logps/rejected": -527.9382934570312, "loss": 0.4833, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7731282711029053, "rewards/margins": 0.8957611918449402, "rewards/rejected": -1.6688896417617798, "step": 2000 }, { "epoch": 2.17, "eval_logits/chosen": 3.584850549697876, "eval_logits/rejected": 3.67386531829834, "eval_logps/chosen": -611.7432250976562, "eval_logps/rejected": -574.3334350585938, "eval_loss": 0.49917730689048767, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -0.6840633749961853, "eval_rewards/margins": 0.8185263872146606, "eval_rewards/rejected": -1.5025897026062012, "eval_runtime": 202.8219, "eval_samples_per_second": 9.861, "eval_steps_per_second": 0.311, "step": 2000 }, { "epoch": 2.18, "grad_norm": 3.6987584600229515, "learning_rate": 3.6208959369027377e-07, "logits/chosen": 3.4759509563446045, "logits/rejected": 3.583934783935547, "logps/chosen": -629.9797973632812, "logps/rejected": -546.8692016601562, "loss": 0.4603, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7248755693435669, "rewards/margins": 0.8174535632133484, "rewards/rejected": -1.5423290729522705, "step": 2010 }, { "epoch": 2.19, "grad_norm": 3.487846785466506, "learning_rate": 3.60793893912523e-07, "logits/chosen": 3.503138780593872, "logits/rejected": 3.5944199562072754, "logps/chosen": -598.5306396484375, "logps/rejected": -539.2348022460938, "loss": 0.4666, "rewards/accuracies": 0.75, "rewards/chosen": -0.7628095746040344, "rewards/margins": 0.8675826191902161, "rewards/rejected": -1.630392074584961, "step": 2020 }, { "epoch": 2.2, "grad_norm": 3.651409060327047, "learning_rate": 3.5949448002462293e-07, "logits/chosen": 3.439924716949463, "logits/rejected": 3.5722365379333496, "logps/chosen": -631.615966796875, "logps/rejected": -557.09326171875, "loss": 0.4754, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7125946283340454, "rewards/margins": 0.9252546429634094, "rewards/rejected": -1.63784921169281, "step": 2030 }, { "epoch": 2.21, "grad_norm": 3.260618270999553, "learning_rate": 3.581913955864317e-07, "logits/chosen": 3.434553861618042, "logits/rejected": 3.4933903217315674, "logps/chosen": -606.2720947265625, "logps/rejected": -597.4681396484375, "loss": 0.4817, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6869601607322693, "rewards/margins": 1.0263012647628784, "rewards/rejected": -1.713261365890503, "step": 2040 }, { "epoch": 2.23, "grad_norm": 3.423883637886533, "learning_rate": 3.5688468428085426e-07, "logits/chosen": 3.5992112159729004, "logits/rejected": 3.513864040374756, "logps/chosen": -611.1129150390625, "logps/rejected": -611.6151123046875, "loss": 0.4575, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.754952073097229, "rewards/margins": 0.8556115031242371, "rewards/rejected": -1.6105636358261108, "step": 2050 }, { "epoch": 2.24, "grad_norm": 3.5077824772903674, "learning_rate": 3.555743899123779e-07, "logits/chosen": 3.5471534729003906, "logits/rejected": 3.6697468757629395, "logps/chosen": -642.9610595703125, "logps/rejected": -615.0989379882812, "loss": 0.4732, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7250288128852844, "rewards/margins": 0.9433430433273315, "rewards/rejected": -1.6683717966079712, "step": 2060 }, { "epoch": 2.25, "grad_norm": 3.7861732801731605, "learning_rate": 3.542605564056041e-07, "logits/chosen": 3.5742199420928955, "logits/rejected": 3.5410943031311035, "logps/chosen": -655.3723754882812, "logps/rejected": -642.4591674804688, "loss": 0.4655, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.776167631149292, "rewards/margins": 0.889909565448761, "rewards/rejected": -1.6660772562026978, "step": 2070 }, { "epoch": 2.26, "grad_norm": 5.129957847568381, "learning_rate": 3.529432278037753e-07, "logits/chosen": 3.491333484649658, "logits/rejected": 3.507310152053833, "logps/chosen": -594.2889404296875, "logps/rejected": -602.4393310546875, "loss": 0.4889, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9160470962524414, "rewards/margins": 0.891708254814148, "rewards/rejected": -1.807755470275879, "step": 2080 }, { "epoch": 2.27, "grad_norm": 3.846187409152628, "learning_rate": 3.5162244826729947e-07, "logits/chosen": 3.289222002029419, "logits/rejected": 3.3087544441223145, "logps/chosen": -578.4583129882812, "logps/rejected": -576.5350341796875, "loss": 0.4732, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9932931065559387, "rewards/margins": 0.8519379496574402, "rewards/rejected": -1.845231056213379, "step": 2090 }, { "epoch": 2.28, "grad_norm": 4.357246431190218, "learning_rate": 3.502982620722688e-07, "logits/chosen": 3.3593249320983887, "logits/rejected": 3.3827052116394043, "logps/chosen": -671.4129028320312, "logps/rejected": -633.4642333984375, "loss": 0.4879, "rewards/accuracies": 0.75, "rewards/chosen": -1.1302413940429688, "rewards/margins": 0.95428466796875, "rewards/rejected": -2.0845260620117188, "step": 2100 }, { "epoch": 2.28, "eval_logits/chosen": 3.5029900074005127, "eval_logits/rejected": 3.5692081451416016, "eval_logps/chosen": -624.6126708984375, "eval_logps/rejected": -590.6145629882812, "eval_loss": 0.49671605229377747, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -0.812757670879364, "eval_rewards/margins": 0.8526439070701599, "eval_rewards/rejected": -1.6654013395309448, "eval_runtime": 203.9786, "eval_samples_per_second": 9.805, "eval_steps_per_second": 0.309, "step": 2100 }, { "epoch": 2.29, "grad_norm": 3.471910343991828, "learning_rate": 3.489707136089762e-07, "logits/chosen": 3.3395347595214844, "logits/rejected": 3.3745200634002686, "logps/chosen": -571.3670654296875, "logps/rejected": -526.6174926757812, "loss": 0.4372, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9002982974052429, "rewards/margins": 0.8082368969917297, "rewards/rejected": -1.7085349559783936, "step": 2110 }, { "epoch": 2.3, "grad_norm": 3.5446869217682573, "learning_rate": 3.4763984738042667e-07, "logits/chosen": 3.3679275512695312, "logits/rejected": 3.5208847522735596, "logps/chosen": -645.79052734375, "logps/rejected": -559.8326416015625, "loss": 0.4769, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9663489460945129, "rewards/margins": 0.8492861986160278, "rewards/rejected": -1.815635323524475, "step": 2120 }, { "epoch": 2.31, "grad_norm": 3.6584533129832693, "learning_rate": 3.4630570800084563e-07, "logits/chosen": 3.5914466381073, "logits/rejected": 3.5731983184814453, "logps/chosen": -645.3328857421875, "logps/rejected": -612.6683349609375, "loss": 0.481, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9558261632919312, "rewards/margins": 0.834132969379425, "rewards/rejected": -1.7899593114852905, "step": 2130 }, { "epoch": 2.32, "grad_norm": 3.2014609575217823, "learning_rate": 3.449683401941836e-07, "logits/chosen": 3.53631329536438, "logits/rejected": 3.6271331310272217, "logps/chosen": -709.5948486328125, "logps/rejected": -633.9876708984375, "loss": 0.4593, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5863145589828491, "rewards/margins": 0.8807814717292786, "rewards/rejected": -1.467095971107483, "step": 2140 }, { "epoch": 2.33, "grad_norm": 3.488531245410909, "learning_rate": 3.4362778879261636e-07, "logits/chosen": 3.443824291229248, "logits/rejected": 3.513853073120117, "logps/chosen": -635.8612670898438, "logps/rejected": -578.5512084960938, "loss": 0.4394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8079124689102173, "rewards/margins": 0.9066284894943237, "rewards/rejected": -1.7145410776138306, "step": 2150 }, { "epoch": 2.35, "grad_norm": 4.1324974690904135, "learning_rate": 3.422840987350426e-07, "logits/chosen": 3.401771068572998, "logits/rejected": 3.5432567596435547, "logps/chosen": -624.2410888671875, "logps/rejected": -544.371337890625, "loss": 0.4629, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8615863919258118, "rewards/margins": 0.9290858507156372, "rewards/rejected": -1.7906723022460938, "step": 2160 }, { "epoch": 2.36, "grad_norm": 3.4186991421500146, "learning_rate": 3.409373150655771e-07, "logits/chosen": 3.565824508666992, "logits/rejected": 3.754411220550537, "logps/chosen": -633.1618041992188, "logps/rejected": -544.0753173828125, "loss": 0.4783, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7366576790809631, "rewards/margins": 0.8337681889533997, "rewards/rejected": -1.5704257488250732, "step": 2170 }, { "epoch": 2.37, "grad_norm": 3.844335021845772, "learning_rate": 3.39587482932041e-07, "logits/chosen": 3.404428482055664, "logits/rejected": 3.5384204387664795, "logps/chosen": -638.5438842773438, "logps/rejected": -581.6624755859375, "loss": 0.4639, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.7160848379135132, "rewards/margins": 0.9811609387397766, "rewards/rejected": -1.6972458362579346, "step": 2180 }, { "epoch": 2.38, "grad_norm": 4.581285458578208, "learning_rate": 3.38234647584448e-07, "logits/chosen": 3.472095012664795, "logits/rejected": 3.5105583667755127, "logps/chosen": -580.3779296875, "logps/rejected": -542.3770141601562, "loss": 0.4723, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8190044164657593, "rewards/margins": 0.8088991045951843, "rewards/rejected": -1.627903699874878, "step": 2190 }, { "epoch": 2.39, "grad_norm": 3.414699724215474, "learning_rate": 3.3687885437348786e-07, "logits/chosen": 3.4263916015625, "logits/rejected": 3.499289035797119, "logps/chosen": -593.12646484375, "logps/rejected": -599.123779296875, "loss": 0.4645, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.7351945638656616, "rewards/margins": 0.959613025188446, "rewards/rejected": -1.6948076486587524, "step": 2200 }, { "epoch": 2.39, "eval_logits/chosen": 3.577197790145874, "eval_logits/rejected": 3.6647300720214844, "eval_logps/chosen": -613.0288696289062, "eval_logps/rejected": -577.7230224609375, "eval_loss": 0.4926875829696655, "eval_rewards/accuracies": 0.7857142686843872, "eval_rewards/chosen": -0.6969201564788818, "eval_rewards/margins": 0.8395654559135437, "eval_rewards/rejected": -1.5364856719970703, "eval_runtime": 202.831, "eval_samples_per_second": 9.86, "eval_steps_per_second": 0.311, "step": 2200 }, { "epoch": 2.4, "grad_norm": 3.8557727493229077, "learning_rate": 3.355201487490056e-07, "logits/chosen": 3.489133834838867, "logits/rejected": 3.4847042560577393, "logps/chosen": -631.4229736328125, "logps/rejected": -570.99365234375, "loss": 0.46, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7371636629104614, "rewards/margins": 0.8246687650680542, "rewards/rejected": -1.5618324279785156, "step": 2210 }, { "epoch": 2.41, "grad_norm": 3.582563306354732, "learning_rate": 3.3415857625847834e-07, "logits/chosen": 3.494647264480591, "logits/rejected": 3.5302734375, "logps/chosen": -571.177978515625, "logps/rejected": -532.7431640625, "loss": 0.4726, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.7189729809761047, "rewards/margins": 0.9511632919311523, "rewards/rejected": -1.6701362133026123, "step": 2220 }, { "epoch": 2.42, "grad_norm": 3.500411233095585, "learning_rate": 3.327941825454884e-07, "logits/chosen": 3.5814578533172607, "logits/rejected": 3.530216932296753, "logps/chosen": -650.6365356445312, "logps/rejected": -653.82958984375, "loss": 0.4643, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5866572856903076, "rewards/margins": 1.002760648727417, "rewards/rejected": -1.5894181728363037, "step": 2230 }, { "epoch": 2.43, "grad_norm": 4.330062226180967, "learning_rate": 3.31427013348193e-07, "logits/chosen": 3.399160385131836, "logits/rejected": 3.422670364379883, "logps/chosen": -609.5322875976562, "logps/rejected": -614.65087890625, "loss": 0.4885, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6684264540672302, "rewards/margins": 0.9222012758255005, "rewards/rejected": -1.5906277894973755, "step": 2240 }, { "epoch": 2.44, "grad_norm": 3.2509909424020416, "learning_rate": 3.3005711449779104e-07, "logits/chosen": 3.4792771339416504, "logits/rejected": 3.65201997756958, "logps/chosen": -677.3572387695312, "logps/rejected": -588.9779052734375, "loss": 0.4344, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6901603937149048, "rewards/margins": 0.8743443489074707, "rewards/rejected": -1.564504861831665, "step": 2250 }, { "epoch": 2.45, "grad_norm": 3.5718986172017377, "learning_rate": 3.2868453191698667e-07, "logits/chosen": 3.412238359451294, "logits/rejected": 3.496849775314331, "logps/chosen": -640.611083984375, "logps/rejected": -567.2330322265625, "loss": 0.4783, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8837859034538269, "rewards/margins": 0.8499320149421692, "rewards/rejected": -1.733717918395996, "step": 2260 }, { "epoch": 2.46, "grad_norm": 2.8011325615379365, "learning_rate": 3.2730931161845023e-07, "logits/chosen": 3.430738925933838, "logits/rejected": 3.5010883808135986, "logps/chosen": -609.8501586914062, "logps/rejected": -563.63037109375, "loss": 0.4533, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8577713966369629, "rewards/margins": 0.9224198460578918, "rewards/rejected": -1.7801910638809204, "step": 2270 }, { "epoch": 2.48, "grad_norm": 3.209654315982721, "learning_rate": 3.2593149970327514e-07, "logits/chosen": 3.2371573448181152, "logits/rejected": 3.2815635204315186, "logps/chosen": -643.8148193359375, "logps/rejected": -635.4369506835938, "loss": 0.4541, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0217785835266113, "rewards/margins": 0.9184284210205078, "rewards/rejected": -1.9402072429656982, "step": 2280 }, { "epoch": 2.49, "grad_norm": 3.5920981527061016, "learning_rate": 3.245511423594329e-07, "logits/chosen": 3.521888017654419, "logits/rejected": 3.4993069171905518, "logps/chosen": -667.6837768554688, "logps/rejected": -641.0929565429688, "loss": 0.483, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7637149095535278, "rewards/margins": 0.8890408277511597, "rewards/rejected": -1.6527557373046875, "step": 2290 }, { "epoch": 2.5, "grad_norm": 3.7088867137960184, "learning_rate": 3.231682858602249e-07, "logits/chosen": 3.401064395904541, "logits/rejected": 3.3754830360412598, "logps/chosen": -633.728515625, "logps/rejected": -592.6007080078125, "loss": 0.4587, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.7475706338882446, "rewards/margins": 1.0294914245605469, "rewards/rejected": -1.7770618200302124, "step": 2300 }, { "epoch": 2.5, "eval_logits/chosen": 3.5790069103240967, "eval_logits/rejected": 3.6614677906036377, "eval_logps/chosen": -603.5742797851562, "eval_logps/rejected": -569.4067993164062, "eval_loss": 0.4936090111732483, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -0.6023736596107483, "eval_rewards/margins": 0.8509496450424194, "eval_rewards/rejected": -1.4533233642578125, "eval_runtime": 202.8172, "eval_samples_per_second": 9.861, "eval_steps_per_second": 0.311, "step": 2300 }, { "epoch": 2.51, "grad_norm": 3.48598402062857, "learning_rate": 3.217829765627304e-07, "logits/chosen": 3.4837124347686768, "logits/rejected": 3.496872663497925, "logps/chosen": -587.3186645507812, "logps/rejected": -550.0616455078125, "loss": 0.4546, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8458267450332642, "rewards/margins": 0.837451159954071, "rewards/rejected": -1.6832778453826904, "step": 2310 }, { "epoch": 2.52, "grad_norm": 3.3428599640744388, "learning_rate": 3.203952609062537e-07, "logits/chosen": 3.560152769088745, "logits/rejected": 3.5849337577819824, "logps/chosen": -558.6005249023438, "logps/rejected": -551.5780029296875, "loss": 0.4638, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5677013993263245, "rewards/margins": 0.9342495799064636, "rewards/rejected": -1.5019508600234985, "step": 2320 }, { "epoch": 2.53, "grad_norm": 3.0701836045771067, "learning_rate": 3.1900518541076625e-07, "logits/chosen": 3.492565870285034, "logits/rejected": 3.5176196098327637, "logps/chosen": -588.5086669921875, "logps/rejected": -570.9232177734375, "loss": 0.4737, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6482713222503662, "rewards/margins": 0.9163777232170105, "rewards/rejected": -1.564648985862732, "step": 2330 }, { "epoch": 2.54, "grad_norm": 4.270506129920829, "learning_rate": 3.17612796675348e-07, "logits/chosen": 3.344141721725464, "logits/rejected": 3.400237560272217, "logps/chosen": -656.4193725585938, "logps/rejected": -600.7810668945312, "loss": 0.4652, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.793533444404602, "rewards/margins": 0.9670297503471375, "rewards/rejected": -1.7605631351470947, "step": 2340 }, { "epoch": 2.55, "grad_norm": 5.4624457929603745, "learning_rate": 3.1621814137662477e-07, "logits/chosen": 3.4089431762695312, "logits/rejected": 3.561499834060669, "logps/chosen": -656.7151489257812, "logps/rejected": -571.5653686523438, "loss": 0.4781, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7137613296508789, "rewards/margins": 1.0217043161392212, "rewards/rejected": -1.7354657649993896, "step": 2350 }, { "epoch": 2.56, "grad_norm": 3.8692615319841246, "learning_rate": 3.148212662672038e-07, "logits/chosen": 3.565295457839966, "logits/rejected": 3.513939619064331, "logps/chosen": -589.4000244140625, "logps/rejected": -578.9471435546875, "loss": 0.4655, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.663978636264801, "rewards/margins": 1.0130146741867065, "rewards/rejected": -1.6769931316375732, "step": 2360 }, { "epoch": 2.57, "grad_norm": 3.745757139606091, "learning_rate": 3.1342221817410615e-07, "logits/chosen": 3.4632372856140137, "logits/rejected": 3.605541944503784, "logps/chosen": -666.013671875, "logps/rejected": -610.5642700195312, "loss": 0.4481, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7232716083526611, "rewards/margins": 0.9818176031112671, "rewards/rejected": -1.7050892114639282, "step": 2370 }, { "epoch": 2.58, "grad_norm": 4.735659043960875, "learning_rate": 3.120210439971974e-07, "logits/chosen": 3.443312168121338, "logits/rejected": 3.4171016216278076, "logps/chosen": -605.1135864257812, "logps/rejected": -626.380615234375, "loss": 0.4638, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0124810934066772, "rewards/margins": 0.91313636302948, "rewards/rejected": -1.9256175756454468, "step": 2380 }, { "epoch": 2.6, "grad_norm": 4.523496777396923, "learning_rate": 3.1061779070761523e-07, "logits/chosen": 3.4363415241241455, "logits/rejected": 3.4582061767578125, "logps/chosen": -649.8582763671875, "logps/rejected": -642.29833984375, "loss": 0.4333, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7597558498382568, "rewards/margins": 1.084705114364624, "rewards/rejected": -1.8444608449935913, "step": 2390 }, { "epoch": 2.61, "grad_norm": 4.200268270134274, "learning_rate": 3.0921250534619447e-07, "logits/chosen": 3.296509265899658, "logits/rejected": 3.377551317214966, "logps/chosen": -683.1427001953125, "logps/rejected": -585.7415161132812, "loss": 0.437, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7686322331428528, "rewards/margins": 1.0582411289215088, "rewards/rejected": -1.8268734216690063, "step": 2400 }, { "epoch": 2.61, "eval_logits/chosen": 3.4342610836029053, "eval_logits/rejected": 3.4902870655059814, "eval_logps/chosen": -631.598388671875, "eval_logps/rejected": -601.3099365234375, "eval_loss": 0.4921112656593323, "eval_rewards/accuracies": 0.773809552192688, "eval_rewards/chosen": -0.8826150298118591, "eval_rewards/margins": 0.8897396922111511, "eval_rewards/rejected": -1.7723547220230103, "eval_runtime": 203.1843, "eval_samples_per_second": 9.843, "eval_steps_per_second": 0.31, "step": 2400 }, { "epoch": 2.62, "grad_norm": 6.363573204461412, "learning_rate": 3.0780523502189075e-07, "logits/chosen": 3.3436951637268066, "logits/rejected": 3.3255093097686768, "logps/chosen": -615.0836181640625, "logps/rejected": -604.822265625, "loss": 0.4602, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9606714248657227, "rewards/margins": 1.0685679912567139, "rewards/rejected": -2.0292391777038574, "step": 2410 }, { "epoch": 2.63, "grad_norm": 3.203187461754593, "learning_rate": 3.0639602691020093e-07, "logits/chosen": 3.455970287322998, "logits/rejected": 3.450157642364502, "logps/chosen": -680.4078369140625, "logps/rejected": -633.8014526367188, "loss": 0.46, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7696986794471741, "rewards/margins": 1.0905230045318604, "rewards/rejected": -1.8602216243743896, "step": 2420 }, { "epoch": 2.64, "grad_norm": 4.14139711172661, "learning_rate": 3.0498492825158176e-07, "logits/chosen": 3.320502519607544, "logits/rejected": 3.3242201805114746, "logps/chosen": -566.0848388671875, "logps/rejected": -572.5762329101562, "loss": 0.4654, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9355268478393555, "rewards/margins": 0.9019051790237427, "rewards/rejected": -1.8374321460723877, "step": 2430 }, { "epoch": 2.65, "grad_norm": 2.8171823269826555, "learning_rate": 3.0357198634986613e-07, "logits/chosen": 3.4914822578430176, "logits/rejected": 3.622373580932617, "logps/chosen": -582.17578125, "logps/rejected": -543.8931274414062, "loss": 0.4891, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6903704404830933, "rewards/margins": 0.7755452990531921, "rewards/rejected": -1.4659157991409302, "step": 2440 }, { "epoch": 2.66, "grad_norm": 3.50063011012773, "learning_rate": 3.0215724857067757e-07, "logits/chosen": 3.379225492477417, "logits/rejected": 3.545624256134033, "logps/chosen": -651.8958129882812, "logps/rejected": -535.3997802734375, "loss": 0.4437, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5937511324882507, "rewards/margins": 1.0226200819015503, "rewards/rejected": -1.6163713932037354, "step": 2450 }, { "epoch": 2.67, "grad_norm": 3.430383815773576, "learning_rate": 3.007407623398421e-07, "logits/chosen": 3.4755587577819824, "logits/rejected": 3.6219284534454346, "logps/chosen": -599.4044189453125, "logps/rejected": -561.2335205078125, "loss": 0.4602, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6869359016418457, "rewards/margins": 0.9378687143325806, "rewards/rejected": -1.6248044967651367, "step": 2460 }, { "epoch": 2.68, "grad_norm": 3.419580465406847, "learning_rate": 2.9932257514179854e-07, "logits/chosen": 3.4234156608581543, "logits/rejected": 3.3968899250030518, "logps/chosen": -549.8673095703125, "logps/rejected": -551.0244750976562, "loss": 0.4478, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8541863560676575, "rewards/margins": 0.8430056571960449, "rewards/rejected": -1.6971919536590576, "step": 2470 }, { "epoch": 2.69, "grad_norm": 3.4971290852379857, "learning_rate": 2.97902734518007e-07, "logits/chosen": 3.5135090351104736, "logits/rejected": 3.6605193614959717, "logps/chosen": -658.0453491210938, "logps/rejected": -585.8455200195312, "loss": 0.4364, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6410279273986816, "rewards/margins": 1.001507043838501, "rewards/rejected": -1.642534852027893, "step": 2480 }, { "epoch": 2.7, "grad_norm": 3.4235010827410073, "learning_rate": 2.9648128806535445e-07, "logits/chosen": 3.4521877765655518, "logits/rejected": 3.495452880859375, "logps/chosen": -621.9888916015625, "logps/rejected": -625.4398193359375, "loss": 0.4703, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.6590943336486816, "rewards/margins": 1.0803091526031494, "rewards/rejected": -1.739403486251831, "step": 2490 }, { "epoch": 2.71, "grad_norm": 5.415917141603628, "learning_rate": 2.9505828343456005e-07, "logits/chosen": 3.4482123851776123, "logits/rejected": 3.561087131500244, "logps/chosen": -622.19482421875, "logps/rejected": -625.0249633789062, "loss": 0.4204, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.8436199426651001, "rewards/margins": 1.1915690898895264, "rewards/rejected": -2.035189151763916, "step": 2500 }, { "epoch": 2.71, "eval_logits/chosen": 3.4803626537323, "eval_logits/rejected": 3.5446834564208984, "eval_logps/chosen": -626.717529296875, "eval_logps/rejected": -597.4497680664062, "eval_loss": 0.4889599084854126, "eval_rewards/accuracies": 0.7757936716079712, "eval_rewards/chosen": -0.8338061571121216, "eval_rewards/margins": 0.8999470472335815, "eval_rewards/rejected": -1.7337533235549927, "eval_runtime": 203.1467, "eval_samples_per_second": 9.845, "eval_steps_per_second": 0.31, "step": 2500 }, { "epoch": 2.73, "grad_norm": 3.435022858379588, "learning_rate": 2.936337683285768e-07, "logits/chosen": 3.328322649002075, "logits/rejected": 3.357412338256836, "logps/chosen": -610.0457763671875, "logps/rejected": -569.2310180664062, "loss": 0.4554, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9784911870956421, "rewards/margins": 0.7509005069732666, "rewards/rejected": -1.7293916940689087, "step": 2510 }, { "epoch": 2.74, "grad_norm": 3.5359479752810667, "learning_rate": 2.9220779050099344e-07, "logits/chosen": 3.3794853687286377, "logits/rejected": 3.358579158782959, "logps/chosen": -581.7650146484375, "logps/rejected": -571.266357421875, "loss": 0.4446, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.9221858978271484, "rewards/margins": 1.0501763820648193, "rewards/rejected": -1.9723621606826782, "step": 2520 }, { "epoch": 2.75, "grad_norm": 3.5952662863841818, "learning_rate": 2.9078039775443247e-07, "logits/chosen": 3.4976468086242676, "logits/rejected": 3.4828574657440186, "logps/chosen": -619.1820678710938, "logps/rejected": -594.2859497070312, "loss": 0.4535, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9900520443916321, "rewards/margins": 0.9431132078170776, "rewards/rejected": -1.9331653118133545, "step": 2530 }, { "epoch": 2.76, "grad_norm": 4.430475128483214, "learning_rate": 2.893516379389489e-07, "logits/chosen": 3.391080141067505, "logits/rejected": 3.426509380340576, "logps/chosen": -693.9683837890625, "logps/rejected": -606.825927734375, "loss": 0.4668, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.917786717414856, "rewards/margins": 1.123294711112976, "rewards/rejected": -2.041081666946411, "step": 2540 }, { "epoch": 2.77, "grad_norm": 4.42604200822975, "learning_rate": 2.879215589504252e-07, "logits/chosen": 3.4469542503356934, "logits/rejected": 3.3772597312927246, "logps/chosen": -626.1776123046875, "logps/rejected": -585.4291381835938, "loss": 0.4767, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6976214647293091, "rewards/margins": 0.9503000974655151, "rewards/rejected": -1.6479215621948242, "step": 2550 }, { "epoch": 2.78, "grad_norm": 3.492132180536463, "learning_rate": 2.8649020872896606e-07, "logits/chosen": 3.2876811027526855, "logits/rejected": 3.3829731941223145, "logps/chosen": -637.89892578125, "logps/rejected": -559.44580078125, "loss": 0.4547, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7019818425178528, "rewards/margins": 0.9280465245246887, "rewards/rejected": -1.6300283670425415, "step": 2560 }, { "epoch": 2.79, "grad_norm": 5.386088412615246, "learning_rate": 2.850576352572916e-07, "logits/chosen": 3.428292751312256, "logits/rejected": 3.427203416824341, "logps/chosen": -631.5196533203125, "logps/rejected": -579.7611694335938, "loss": 0.4626, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8275982737541199, "rewards/margins": 0.9573984146118164, "rewards/rejected": -1.784996747970581, "step": 2570 }, { "epoch": 2.8, "grad_norm": 3.619396695108091, "learning_rate": 2.8362388655912826e-07, "logits/chosen": 3.3158061504364014, "logits/rejected": 3.344578981399536, "logps/chosen": -585.06689453125, "logps/rejected": -551.7364501953125, "loss": 0.4544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7403179407119751, "rewards/margins": 0.9482278823852539, "rewards/rejected": -1.688545823097229, "step": 2580 }, { "epoch": 2.81, "grad_norm": 3.43659878268593, "learning_rate": 2.821890106975996e-07, "logits/chosen": 3.4369442462921143, "logits/rejected": 3.549891710281372, "logps/chosen": -687.2131958007812, "logps/rejected": -632.4449462890625, "loss": 0.4885, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.4967229962348938, "rewards/margins": 0.9954892992973328, "rewards/rejected": -1.492212176322937, "step": 2590 }, { "epoch": 2.82, "grad_norm": 3.268971458217506, "learning_rate": 2.807530557736144e-07, "logits/chosen": 3.5161678791046143, "logits/rejected": 3.4077048301696777, "logps/chosen": -585.3170166015625, "logps/rejected": -609.959716796875, "loss": 0.467, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6477356553077698, "rewards/margins": 0.7576956748962402, "rewards/rejected": -1.4054313898086548, "step": 2600 }, { "epoch": 2.82, "eval_logits/chosen": 3.499997854232788, "eval_logits/rejected": 3.569040060043335, "eval_logps/chosen": -602.4325561523438, "eval_logps/rejected": -569.2332763671875, "eval_loss": 0.4865441918373108, "eval_rewards/accuracies": 0.7876983880996704, "eval_rewards/chosen": -0.590956449508667, "eval_rewards/margins": 0.8606314659118652, "eval_rewards/rejected": -1.4515879154205322, "eval_runtime": 203.3268, "eval_samples_per_second": 9.836, "eval_steps_per_second": 0.31, "step": 2600 }, { "epoch": 2.83, "grad_norm": 4.278479470624274, "learning_rate": 2.793160699242548e-07, "logits/chosen": 3.505661725997925, "logits/rejected": 3.3195648193359375, "logps/chosen": -605.200927734375, "logps/rejected": -576.4964599609375, "loss": 0.4454, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6035189628601074, "rewards/margins": 0.8821620941162109, "rewards/rejected": -1.4856809377670288, "step": 2610 }, { "epoch": 2.84, "grad_norm": 4.114384313390409, "learning_rate": 2.7787810132116196e-07, "logits/chosen": 3.464247465133667, "logits/rejected": 3.4501852989196777, "logps/chosen": -594.6990356445312, "logps/rejected": -574.1302490234375, "loss": 0.4631, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7224303483963013, "rewards/margins": 0.9456484913825989, "rewards/rejected": -1.6680786609649658, "step": 2620 }, { "epoch": 2.86, "grad_norm": 4.081478700217825, "learning_rate": 2.7643919816892215e-07, "logits/chosen": 3.2647666931152344, "logits/rejected": 3.3138420581817627, "logps/chosen": -562.5584106445312, "logps/rejected": -558.0347900390625, "loss": 0.4537, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8194862604141235, "rewards/margins": 0.8074823617935181, "rewards/rejected": -1.6269683837890625, "step": 2630 }, { "epoch": 2.87, "grad_norm": 4.121806949453411, "learning_rate": 2.749994087034498e-07, "logits/chosen": 3.2870640754699707, "logits/rejected": 3.3868587017059326, "logps/chosen": -646.748779296875, "logps/rejected": -587.2466430664062, "loss": 0.4233, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8164499998092651, "rewards/margins": 1.0539710521697998, "rewards/rejected": -1.870421051979065, "step": 2640 }, { "epoch": 2.88, "grad_norm": 3.75911473750284, "learning_rate": 2.7355878119037097e-07, "logits/chosen": 3.459559679031372, "logits/rejected": 3.4844887256622314, "logps/chosen": -644.4876098632812, "logps/rejected": -624.3375854492188, "loss": 0.435, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9554063677787781, "rewards/margins": 1.0268898010253906, "rewards/rejected": -1.9822959899902344, "step": 2650 }, { "epoch": 2.89, "grad_norm": 5.672731475848124, "learning_rate": 2.7211736392340567e-07, "logits/chosen": 3.3097336292266846, "logits/rejected": 3.4127914905548096, "logps/chosen": -670.7672119140625, "logps/rejected": -658.6070556640625, "loss": 0.4542, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9260674715042114, "rewards/margins": 1.0306978225708008, "rewards/rejected": -1.9567651748657227, "step": 2660 }, { "epoch": 2.9, "grad_norm": 3.944618363860535, "learning_rate": 2.706752052227483e-07, "logits/chosen": 3.387530565261841, "logits/rejected": 3.305820941925049, "logps/chosen": -629.259521484375, "logps/rejected": -633.9971923828125, "loss": 0.4659, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.833823025226593, "rewards/margins": 0.9765887260437012, "rewards/rejected": -1.810411810874939, "step": 2670 }, { "epoch": 2.91, "grad_norm": 5.4320141900216115, "learning_rate": 2.692323534334481e-07, "logits/chosen": 3.3067660331726074, "logits/rejected": 3.3124756813049316, "logps/chosen": -656.865234375, "logps/rejected": -581.8089599609375, "loss": 0.4874, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8518417477607727, "rewards/margins": 1.005443811416626, "rewards/rejected": -1.857285737991333, "step": 2680 }, { "epoch": 2.92, "grad_norm": 3.7610827920428194, "learning_rate": 2.6778885692378866e-07, "logits/chosen": 3.406977415084839, "logits/rejected": 3.33898663520813, "logps/chosen": -633.6414794921875, "logps/rejected": -593.6680297851562, "loss": 0.433, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.656938374042511, "rewards/margins": 1.116265058517456, "rewards/rejected": -1.7732034921646118, "step": 2690 }, { "epoch": 2.93, "grad_norm": 3.432377080662618, "learning_rate": 2.663447640836663e-07, "logits/chosen": 3.429065704345703, "logits/rejected": 3.451131820678711, "logps/chosen": -617.0606689453125, "logps/rejected": -597.2569580078125, "loss": 0.458, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.8843738436698914, "rewards/margins": 0.9283639788627625, "rewards/rejected": -1.812738060951233, "step": 2700 }, { "epoch": 2.93, "eval_logits/chosen": 3.457927942276001, "eval_logits/rejected": 3.5208330154418945, "eval_logps/chosen": -620.0014038085938, "eval_logps/rejected": -591.329833984375, "eval_loss": 0.48605242371559143, "eval_rewards/accuracies": 0.783730149269104, "eval_rewards/chosen": -0.7666451334953308, "eval_rewards/margins": 0.9059080481529236, "eval_rewards/rejected": -1.672553300857544, "eval_runtime": 203.0125, "eval_samples_per_second": 9.852, "eval_steps_per_second": 0.31, "step": 2700 }, { "epoch": 2.94, "grad_norm": 3.5814744568515438, "learning_rate": 2.6490012332296796e-07, "logits/chosen": 3.4098687171936035, "logits/rejected": 3.5238165855407715, "logps/chosen": -650.0216674804688, "logps/rejected": -568.7059326171875, "loss": 0.4647, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.8275938034057617, "rewards/margins": 0.9160548448562622, "rewards/rejected": -1.7436487674713135, "step": 2710 }, { "epoch": 2.95, "grad_norm": 3.7058563105254945, "learning_rate": 2.634549830699483e-07, "logits/chosen": 3.4752769470214844, "logits/rejected": 3.4913477897644043, "logps/chosen": -596.112548828125, "logps/rejected": -571.9755859375, "loss": 0.4499, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.6368006467819214, "rewards/margins": 0.9434793591499329, "rewards/rejected": -1.580280065536499, "step": 2720 }, { "epoch": 2.96, "grad_norm": 4.263349011982545, "learning_rate": 2.620093917696063e-07, "logits/chosen": 3.3458282947540283, "logits/rejected": 3.513139247894287, "logps/chosen": -624.7337646484375, "logps/rejected": -580.9771728515625, "loss": 0.4574, "rewards/accuracies": 0.75, "rewards/chosen": -0.8148934245109558, "rewards/margins": 0.9292934536933899, "rewards/rejected": -1.7441869974136353, "step": 2730 }, { "epoch": 2.98, "grad_norm": 3.9498017642080914, "learning_rate": 2.605633978820613e-07, "logits/chosen": 3.3828988075256348, "logits/rejected": 3.3466384410858154, "logps/chosen": -639.5057373046875, "logps/rejected": -615.6836547851562, "loss": 0.4581, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8807100057601929, "rewards/margins": 1.0138176679611206, "rewards/rejected": -1.894527792930603, "step": 2740 }, { "epoch": 2.99, "grad_norm": 3.788532661853087, "learning_rate": 2.591170498809284e-07, "logits/chosen": 3.449664354324341, "logits/rejected": 3.54020357131958, "logps/chosen": -686.0653076171875, "logps/rejected": -632.3584594726562, "loss": 0.4519, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7908682823181152, "rewards/margins": 0.9529739618301392, "rewards/rejected": -1.7438421249389648, "step": 2750 }, { "epoch": 3.0, "grad_norm": 4.184260963488442, "learning_rate": 2.576703962516937e-07, "logits/chosen": 3.3226122856140137, "logits/rejected": 3.37495493888855, "logps/chosen": -627.0601196289062, "logps/rejected": -629.4278564453125, "loss": 0.4632, "rewards/accuracies": 0.75, "rewards/chosen": -0.8550036549568176, "rewards/margins": 0.9417628049850464, "rewards/rejected": -1.7967665195465088, "step": 2760 }, { "epoch": 3.01, "grad_norm": 3.8352731309247283, "learning_rate": 2.5622348549008854e-07, "logits/chosen": 3.4446792602539062, "logits/rejected": 3.4306163787841797, "logps/chosen": -592.9773559570312, "logps/rejected": -580.4371337890625, "loss": 0.4588, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.8938300013542175, "rewards/margins": 1.0764323472976685, "rewards/rejected": -1.9702622890472412, "step": 2770 }, { "epoch": 3.02, "grad_norm": 4.151038171191166, "learning_rate": 2.547763661004642e-07, "logits/chosen": 3.352771759033203, "logits/rejected": 3.3830676078796387, "logps/chosen": -684.36279296875, "logps/rejected": -622.1414794921875, "loss": 0.4183, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8378265500068665, "rewards/margins": 0.9874498248100281, "rewards/rejected": -1.8252766132354736, "step": 2780 }, { "epoch": 3.03, "grad_norm": 4.584896186132231, "learning_rate": 2.533290865941658e-07, "logits/chosen": 3.3703866004943848, "logits/rejected": 3.248610258102417, "logps/chosen": -632.7425537109375, "logps/rejected": -617.5534057617188, "loss": 0.4505, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9305559396743774, "rewards/margins": 1.0056320428848267, "rewards/rejected": -1.9361881017684937, "step": 2790 }, { "epoch": 3.04, "grad_norm": 3.9983708065161148, "learning_rate": 2.518816954879057e-07, "logits/chosen": 3.4359169006347656, "logits/rejected": 3.474580764770508, "logps/chosen": -594.47900390625, "logps/rejected": -558.6627197265625, "loss": 0.462, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8003416061401367, "rewards/margins": 0.8039595484733582, "rewards/rejected": -1.6043012142181396, "step": 2800 }, { "epoch": 3.04, "eval_logits/chosen": 3.49540376663208, "eval_logits/rejected": 3.5552937984466553, "eval_logps/chosen": -614.4227294921875, "eval_logps/rejected": -585.52685546875, "eval_loss": 0.4844111204147339, "eval_rewards/accuracies": 0.7916666865348816, "eval_rewards/chosen": -0.7108585834503174, "eval_rewards/margins": 0.9036649465560913, "eval_rewards/rejected": -1.6145235300064087, "eval_runtime": 203.0036, "eval_samples_per_second": 9.852, "eval_steps_per_second": 0.31, "step": 2800 }, { "epoch": 3.05, "grad_norm": 4.375685459289675, "learning_rate": 2.504342413021377e-07, "logits/chosen": 3.48063588142395, "logits/rejected": 3.4624500274658203, "logps/chosen": -573.4635009765625, "logps/rejected": -599.910888671875, "loss": 0.4279, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8659281730651855, "rewards/margins": 0.9196036458015442, "rewards/rejected": -1.785531759262085, "step": 2810 }, { "epoch": 3.06, "grad_norm": 3.448289578171233, "learning_rate": 2.4898677255943006e-07, "logits/chosen": 3.5123367309570312, "logits/rejected": 3.425743579864502, "logps/chosen": -659.736572265625, "logps/rejected": -652.95458984375, "loss": 0.4334, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.8693497776985168, "rewards/margins": 0.971311092376709, "rewards/rejected": -1.8406610488891602, "step": 2820 }, { "epoch": 3.07, "grad_norm": 4.581154617393512, "learning_rate": 2.47539337782839e-07, "logits/chosen": 3.2755444049835205, "logits/rejected": 3.3720130920410156, "logps/chosen": -644.5631103515625, "logps/rejected": -581.09228515625, "loss": 0.4364, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9709026217460632, "rewards/margins": 1.0490353107452393, "rewards/rejected": -2.0199379920959473, "step": 2830 }, { "epoch": 3.08, "grad_norm": 3.7613899547216763, "learning_rate": 2.460919854942822e-07, "logits/chosen": 3.4118473529815674, "logits/rejected": 3.4696223735809326, "logps/chosen": -647.2174072265625, "logps/rejected": -637.8060302734375, "loss": 0.4491, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9573391675949097, "rewards/margins": 0.8948485255241394, "rewards/rejected": -1.8521878719329834, "step": 2840 }, { "epoch": 3.09, "grad_norm": 3.2320273947918983, "learning_rate": 2.44644764212912e-07, "logits/chosen": 3.3305840492248535, "logits/rejected": 3.2426304817199707, "logps/chosen": -624.9818115234375, "logps/rejected": -609.2310791015625, "loss": 0.4306, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7739254832267761, "rewards/margins": 0.9810514450073242, "rewards/rejected": -1.7549769878387451, "step": 2850 }, { "epoch": 3.11, "grad_norm": 3.7286268455772755, "learning_rate": 2.4319772245348927e-07, "logits/chosen": 3.474033832550049, "logits/rejected": 3.4578864574432373, "logps/chosen": -619.8948974609375, "logps/rejected": -615.1239013671875, "loss": 0.4383, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7849314212799072, "rewards/margins": 1.0505647659301758, "rewards/rejected": -1.835496187210083, "step": 2860 }, { "epoch": 3.12, "grad_norm": 3.9099335431162823, "learning_rate": 2.4175090872475645e-07, "logits/chosen": 3.5397415161132812, "logits/rejected": 3.4615206718444824, "logps/chosen": -599.0786743164062, "logps/rejected": -655.3748779296875, "loss": 0.4652, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.7803353071212769, "rewards/margins": 0.8833073377609253, "rewards/rejected": -1.6636425256729126, "step": 2870 }, { "epoch": 3.13, "grad_norm": 3.9574565601831293, "learning_rate": 2.40304371527812e-07, "logits/chosen": 3.4028964042663574, "logits/rejected": 3.4590251445770264, "logps/chosen": -649.6727294921875, "logps/rejected": -638.2109985351562, "loss": 0.4611, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9411758184432983, "rewards/margins": 0.9360073804855347, "rewards/rejected": -1.8771835565567017, "step": 2880 }, { "epoch": 3.14, "grad_norm": 3.395616713371064, "learning_rate": 2.3885815935448435e-07, "logits/chosen": 3.278578519821167, "logits/rejected": 3.398362636566162, "logps/chosen": -619.1382446289062, "logps/rejected": -610.914306640625, "loss": 0.4205, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0494105815887451, "rewards/margins": 1.2443387508392334, "rewards/rejected": -2.2937493324279785, "step": 2890 }, { "epoch": 3.15, "grad_norm": 4.10371958008165, "learning_rate": 2.3741232068570605e-07, "logits/chosen": 3.4090774059295654, "logits/rejected": 3.3995368480682373, "logps/chosen": -701.1258544921875, "logps/rejected": -666.7429809570312, "loss": 0.4258, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9114104509353638, "rewards/margins": 1.1804288625717163, "rewards/rejected": -2.09183931350708, "step": 2900 }, { "epoch": 3.15, "eval_logits/chosen": 3.4227328300476074, "eval_logits/rejected": 3.476104259490967, "eval_logps/chosen": -641.4771728515625, "eval_logps/rejected": -618.2141723632812, "eval_loss": 0.48884913325309753, "eval_rewards/accuracies": 0.7817460298538208, "eval_rewards/chosen": -0.9814031720161438, "eval_rewards/margins": 0.9599937200546265, "eval_rewards/rejected": -1.9413968324661255, "eval_runtime": 203.217, "eval_samples_per_second": 9.842, "eval_steps_per_second": 0.31, "step": 2900 }, { "epoch": 3.16, "grad_norm": 4.071596946207353, "learning_rate": 2.3596690398988903e-07, "logits/chosen": 3.367732286453247, "logits/rejected": 3.335725784301758, "logps/chosen": -563.6433715820312, "logps/rejected": -563.1458740234375, "loss": 0.4532, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1280577182769775, "rewards/margins": 1.1104881763458252, "rewards/rejected": -2.2385458946228027, "step": 2910 }, { "epoch": 3.17, "grad_norm": 3.3245284364131638, "learning_rate": 2.3452195772129937e-07, "logits/chosen": 3.4156277179718018, "logits/rejected": 3.4742846488952637, "logps/chosen": -667.6085205078125, "logps/rejected": -634.3673095703125, "loss": 0.4439, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9609830975532532, "rewards/margins": 1.044752597808838, "rewards/rejected": -2.0057358741760254, "step": 2920 }, { "epoch": 3.18, "grad_norm": 3.6395486327587605, "learning_rate": 2.3307753031843312e-07, "logits/chosen": 3.436938524246216, "logits/rejected": 3.4605090618133545, "logps/chosen": -603.6176147460938, "logps/rejected": -600.2156982421875, "loss": 0.4564, "rewards/accuracies": 0.875, "rewards/chosen": -0.8700806498527527, "rewards/margins": 1.0620027780532837, "rewards/rejected": -1.9320834875106812, "step": 2930 }, { "epoch": 3.19, "grad_norm": 4.305925067636892, "learning_rate": 2.3163367020239264e-07, "logits/chosen": 3.4125266075134277, "logits/rejected": 3.426459550857544, "logps/chosen": -641.9744262695312, "logps/rejected": -601.0375366210938, "loss": 0.4413, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.8042587041854858, "rewards/margins": 1.035605788230896, "rewards/rejected": -1.8398644924163818, "step": 2940 }, { "epoch": 3.2, "grad_norm": 3.7325297948726406, "learning_rate": 2.3019042577526337e-07, "logits/chosen": 3.386017322540283, "logits/rejected": 3.4424185752868652, "logps/chosen": -706.9832763671875, "logps/rejected": -649.3670654296875, "loss": 0.4625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8057816624641418, "rewards/margins": 0.9790631532669067, "rewards/rejected": -1.7848447561264038, "step": 2950 }, { "epoch": 3.21, "grad_norm": 5.9773377993199475, "learning_rate": 2.2874784541849105e-07, "logits/chosen": 3.3580546379089355, "logits/rejected": 3.3814473152160645, "logps/chosen": -612.360595703125, "logps/rejected": -565.7884521484375, "loss": 0.4429, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.806524395942688, "rewards/margins": 0.977981448173523, "rewards/rejected": -1.78450608253479, "step": 2960 }, { "epoch": 3.22, "grad_norm": 4.99474748571327, "learning_rate": 2.2730597749126014e-07, "logits/chosen": 3.396486759185791, "logits/rejected": 3.502777099609375, "logps/chosen": -637.4274291992188, "logps/rejected": -604.4832763671875, "loss": 0.4255, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8480658531188965, "rewards/margins": 1.0878616571426392, "rewards/rejected": -1.9359276294708252, "step": 2970 }, { "epoch": 3.24, "grad_norm": 4.333271910357194, "learning_rate": 2.2586487032887237e-07, "logits/chosen": 3.3672001361846924, "logits/rejected": 3.4402644634246826, "logps/chosen": -658.7831420898438, "logps/rejected": -598.9805297851562, "loss": 0.4417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0384938716888428, "rewards/margins": 0.9810712933540344, "rewards/rejected": -2.0195651054382324, "step": 2980 }, { "epoch": 3.25, "grad_norm": 6.272194848676277, "learning_rate": 2.2442457224112676e-07, "logits/chosen": 3.4500479698181152, "logits/rejected": 3.4857699871063232, "logps/chosen": -698.882568359375, "logps/rejected": -676.0650634765625, "loss": 0.4398, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.004777193069458, "rewards/margins": 1.0942362546920776, "rewards/rejected": -2.099013566970825, "step": 2990 }, { "epoch": 3.26, "grad_norm": 4.330090833439255, "learning_rate": 2.229851315106999e-07, "logits/chosen": 3.323119640350342, "logits/rejected": 3.391847610473633, "logps/chosen": -649.1282348632812, "logps/rejected": -610.1156616210938, "loss": 0.4219, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.028066873550415, "rewards/margins": 1.0071299076080322, "rewards/rejected": -2.0351967811584473, "step": 3000 }, { "epoch": 3.26, "eval_logits/chosen": 3.4361824989318848, "eval_logits/rejected": 3.4894895553588867, "eval_logps/chosen": -631.9180908203125, "eval_logps/rejected": -607.30712890625, "eval_loss": 0.4856274127960205, "eval_rewards/accuracies": 0.7936508059501648, "eval_rewards/chosen": -0.885812520980835, "eval_rewards/margins": 0.9465143084526062, "eval_rewards/rejected": -1.8323270082473755, "eval_runtime": 203.2097, "eval_samples_per_second": 9.842, "eval_steps_per_second": 0.31, "step": 3000 }, { "epoch": 3.27, "grad_norm": 3.9494363632047027, "learning_rate": 2.2154659639152728e-07, "logits/chosen": 3.4298388957977295, "logits/rejected": 3.5834994316101074, "logps/chosen": -623.3873291015625, "logps/rejected": -590.0366821289062, "loss": 0.4373, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8894636034965515, "rewards/margins": 0.947036862373352, "rewards/rejected": -1.8365005254745483, "step": 3010 }, { "epoch": 3.28, "grad_norm": 4.37004923030797, "learning_rate": 2.2010901510718623e-07, "logits/chosen": 3.4085888862609863, "logits/rejected": 3.4778428077697754, "logps/chosen": -661.8770141601562, "logps/rejected": -620.317138671875, "loss": 0.4165, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.7813987135887146, "rewards/margins": 1.2368358373641968, "rewards/rejected": -2.0182344913482666, "step": 3020 }, { "epoch": 3.29, "grad_norm": 4.000055891899185, "learning_rate": 2.186724358492785e-07, "logits/chosen": 3.410771131515503, "logits/rejected": 3.4011940956115723, "logps/chosen": -627.5743408203125, "logps/rejected": -614.0305786132812, "loss": 0.4333, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7949288487434387, "rewards/margins": 1.1482188701629639, "rewards/rejected": -1.943147897720337, "step": 3030 }, { "epoch": 3.3, "grad_norm": 3.8870438933659472, "learning_rate": 2.1723690677581567e-07, "logits/chosen": 3.366727352142334, "logits/rejected": 3.4489874839782715, "logps/chosen": -683.2049560546875, "logps/rejected": -597.3983154296875, "loss": 0.4399, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9306663274765015, "rewards/margins": 0.9225971102714539, "rewards/rejected": -1.8532634973526, "step": 3040 }, { "epoch": 3.31, "grad_norm": 3.621627142820652, "learning_rate": 2.1580247600960392e-07, "logits/chosen": 3.2974860668182373, "logits/rejected": 3.3342857360839844, "logps/chosen": -685.1504516601562, "logps/rejected": -596.3431396484375, "loss": 0.4464, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7522670030593872, "rewards/margins": 1.0248292684555054, "rewards/rejected": -1.7770963907241821, "step": 3050 }, { "epoch": 3.32, "grad_norm": 3.5254386902793557, "learning_rate": 2.1436919163663153e-07, "logits/chosen": 3.359710693359375, "logits/rejected": 3.3922533988952637, "logps/chosen": -635.1802978515625, "logps/rejected": -596.8336181640625, "loss": 0.4298, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.8741143345832825, "rewards/margins": 1.2291319370269775, "rewards/rejected": -2.1032462120056152, "step": 3060 }, { "epoch": 3.33, "grad_norm": 3.475210735676778, "learning_rate": 2.1293710170445633e-07, "logits/chosen": 3.355104446411133, "logits/rejected": 3.3470497131347656, "logps/chosen": -659.673095703125, "logps/rejected": -600.4511108398438, "loss": 0.4592, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9288552403450012, "rewards/margins": 0.7822314500808716, "rewards/rejected": -1.711086630821228, "step": 3070 }, { "epoch": 3.34, "grad_norm": 3.963907942625747, "learning_rate": 2.1150625422059537e-07, "logits/chosen": 3.4125232696533203, "logits/rejected": 3.4644877910614014, "logps/chosen": -651.6043090820312, "logps/rejected": -637.5367431640625, "loss": 0.4183, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.7479305267333984, "rewards/margins": 1.0954493284225464, "rewards/rejected": -1.8433797359466553, "step": 3080 }, { "epoch": 3.36, "grad_norm": 4.79735214093586, "learning_rate": 2.100766971509156e-07, "logits/chosen": 3.3415913581848145, "logits/rejected": 3.482771396636963, "logps/chosen": -642.8465576171875, "logps/rejected": -587.8502197265625, "loss": 0.4427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8390420079231262, "rewards/margins": 1.0515780448913574, "rewards/rejected": -1.8906199932098389, "step": 3090 }, { "epoch": 3.37, "grad_norm": 3.9755901299610765, "learning_rate": 2.0864847841802555e-07, "logits/chosen": 3.313795566558838, "logits/rejected": 3.35662841796875, "logps/chosen": -625.80615234375, "logps/rejected": -577.7479248046875, "loss": 0.4295, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9515978097915649, "rewards/margins": 1.0637649297714233, "rewards/rejected": -2.0153627395629883, "step": 3100 }, { "epoch": 3.37, "eval_logits/chosen": 3.4356937408447266, "eval_logits/rejected": 3.4879915714263916, "eval_logps/chosen": -624.732666015625, "eval_logps/rejected": -600.5797119140625, "eval_loss": 0.48225274682044983, "eval_rewards/accuracies": 0.7976190447807312, "eval_rewards/chosen": -0.8139576315879822, "eval_rewards/margins": 0.9510951042175293, "eval_rewards/rejected": -1.7650526762008667, "eval_runtime": 203.1657, "eval_samples_per_second": 9.844, "eval_steps_per_second": 0.31, "step": 3100 }, { "epoch": 3.38, "grad_norm": 3.5189161775928937, "learning_rate": 2.0722164589966936e-07, "logits/chosen": 3.437371015548706, "logits/rejected": 3.6356558799743652, "logps/chosen": -656.8878173828125, "logps/rejected": -565.6344604492188, "loss": 0.4507, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.805374026298523, "rewards/margins": 1.0437090396881104, "rewards/rejected": -1.8490829467773438, "step": 3110 }, { "epoch": 3.39, "grad_norm": 4.645784250708397, "learning_rate": 2.0579624742712128e-07, "logits/chosen": 3.2445125579833984, "logits/rejected": 3.2976136207580566, "logps/chosen": -600.2672729492188, "logps/rejected": -565.233642578125, "loss": 0.4238, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8405818939208984, "rewards/margins": 1.0852935314178467, "rewards/rejected": -1.9258753061294556, "step": 3120 }, { "epoch": 3.4, "grad_norm": 3.405528009644704, "learning_rate": 2.0437233078358275e-07, "logits/chosen": 3.4257736206054688, "logits/rejected": 3.4833385944366455, "logps/chosen": -643.4204711914062, "logps/rejected": -617.6407470703125, "loss": 0.4252, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.658096194267273, "rewards/margins": 0.9798544645309448, "rewards/rejected": -1.6379506587982178, "step": 3130 }, { "epoch": 3.41, "grad_norm": 3.8784222204050627, "learning_rate": 2.0294994370258e-07, "logits/chosen": 3.4538047313690186, "logits/rejected": 3.4483654499053955, "logps/chosen": -607.20654296875, "logps/rejected": -605.8489990234375, "loss": 0.439, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9570397138595581, "rewards/margins": 1.0782394409179688, "rewards/rejected": -2.0352795124053955, "step": 3140 }, { "epoch": 3.42, "grad_norm": 3.5584730989638045, "learning_rate": 2.015291338663644e-07, "logits/chosen": 3.2920451164245605, "logits/rejected": 3.3430659770965576, "logps/chosen": -635.3215942382812, "logps/rejected": -594.5906372070312, "loss": 0.4207, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8983471989631653, "rewards/margins": 1.1466476917266846, "rewards/rejected": -2.044994831085205, "step": 3150 }, { "epoch": 3.43, "grad_norm": 5.678551225331759, "learning_rate": 2.001099489043138e-07, "logits/chosen": 3.4382717609405518, "logits/rejected": 3.3808326721191406, "logps/chosen": -643.2792358398438, "logps/rejected": -634.2994384765625, "loss": 0.4408, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.7990803718566895, "rewards/margins": 0.9194300770759583, "rewards/rejected": -1.718510389328003, "step": 3160 }, { "epoch": 3.44, "grad_norm": 5.085374543246204, "learning_rate": 1.9869243639133577e-07, "logits/chosen": 3.4107460975646973, "logits/rejected": 3.396080732345581, "logps/chosen": -594.4137573242188, "logps/rejected": -613.5217895507812, "loss": 0.4556, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7500702142715454, "rewards/margins": 1.1137340068817139, "rewards/rejected": -1.8638041019439697, "step": 3170 }, { "epoch": 3.45, "grad_norm": 4.250629396797435, "learning_rate": 1.9727664384627306e-07, "logits/chosen": 3.4325294494628906, "logits/rejected": 3.4035377502441406, "logps/chosen": -596.2446899414062, "logps/rejected": -564.5435791015625, "loss": 0.4234, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.822806179523468, "rewards/margins": 1.038006067276001, "rewards/rejected": -1.8608121871948242, "step": 3180 }, { "epoch": 3.46, "grad_norm": 4.019039940993413, "learning_rate": 1.9586261873031025e-07, "logits/chosen": 3.2630324363708496, "logits/rejected": 3.3803462982177734, "logps/chosen": -616.1365966796875, "logps/rejected": -603.97216796875, "loss": 0.4576, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8340195417404175, "rewards/margins": 1.0018597841262817, "rewards/rejected": -1.8358793258666992, "step": 3190 }, { "epoch": 3.47, "grad_norm": 4.141701500597555, "learning_rate": 1.9445040844538313e-07, "logits/chosen": 3.283325672149658, "logits/rejected": 3.3204421997070312, "logps/chosen": -604.6246948242188, "logps/rejected": -588.3277587890625, "loss": 0.4268, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9514004588127136, "rewards/margins": 0.8670446276664734, "rewards/rejected": -1.8184449672698975, "step": 3200 }, { "epoch": 3.47, "eval_logits/chosen": 3.4126436710357666, "eval_logits/rejected": 3.453640937805176, "eval_logps/chosen": -629.2567138671875, "eval_logps/rejected": -606.8929443359375, "eval_loss": 0.480047345161438, "eval_rewards/accuracies": 0.7976190447807312, "eval_rewards/chosen": -0.8591986298561096, "eval_rewards/margins": 0.9689861536026001, "eval_rewards/rejected": -1.8281848430633545, "eval_runtime": 203.0923, "eval_samples_per_second": 9.848, "eval_steps_per_second": 0.31, "step": 3200 }, { "epoch": 3.49, "grad_norm": 6.599030236425171, "learning_rate": 1.930400603325893e-07, "logits/chosen": 3.302114963531494, "logits/rejected": 3.29858136177063, "logps/chosen": -666.8406982421875, "logps/rejected": -597.25927734375, "loss": 0.4352, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0344960689544678, "rewards/margins": 1.0878442525863647, "rewards/rejected": -2.122340679168701, "step": 3210 }, { "epoch": 3.5, "grad_norm": 5.78140760080788, "learning_rate": 1.9163162167060144e-07, "logits/chosen": 3.367105484008789, "logits/rejected": 3.3133113384246826, "logps/chosen": -611.474365234375, "logps/rejected": -590.4954833984375, "loss": 0.4498, "rewards/accuracies": 0.75, "rewards/chosen": -0.9751068353652954, "rewards/margins": 0.9375247955322266, "rewards/rejected": -1.912631630897522, "step": 3220 }, { "epoch": 3.51, "grad_norm": 4.831797084110146, "learning_rate": 1.9022513967408227e-07, "logits/chosen": 3.2902634143829346, "logits/rejected": 3.23241925239563, "logps/chosen": -622.2492065429688, "logps/rejected": -596.3099365234375, "loss": 0.4465, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8274933695793152, "rewards/margins": 1.0548157691955566, "rewards/rejected": -1.8823089599609375, "step": 3230 }, { "epoch": 3.52, "grad_norm": 3.8203582598938897, "learning_rate": 1.8882066149210164e-07, "logits/chosen": 3.298330307006836, "logits/rejected": 3.444213390350342, "logps/chosen": -691.8465576171875, "logps/rejected": -577.5040283203125, "loss": 0.433, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8670648336410522, "rewards/margins": 1.0957075357437134, "rewards/rejected": -1.9627723693847656, "step": 3240 }, { "epoch": 3.53, "grad_norm": 4.931291520321011, "learning_rate": 1.8741823420655642e-07, "logits/chosen": 3.3172030448913574, "logits/rejected": 3.2955126762390137, "logps/chosen": -661.6648559570312, "logps/rejected": -650.8112182617188, "loss": 0.4275, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8657820820808411, "rewards/margins": 1.2847552299499512, "rewards/rejected": -2.1505374908447266, "step": 3250 }, { "epoch": 3.54, "grad_norm": 4.223378935572041, "learning_rate": 1.8601790483059165e-07, "logits/chosen": 3.3851325511932373, "logits/rejected": 3.3895251750946045, "logps/chosen": -630.2635498046875, "logps/rejected": -625.9153442382812, "loss": 0.3994, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.9566653370857239, "rewards/margins": 1.256395697593689, "rewards/rejected": -2.2130610942840576, "step": 3260 }, { "epoch": 3.55, "grad_norm": 4.152887166652273, "learning_rate": 1.846197203070249e-07, "logits/chosen": 3.180065393447876, "logits/rejected": 3.228161573410034, "logps/chosen": -635.3770751953125, "logps/rejected": -615.8402099609375, "loss": 0.4244, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.1644928455352783, "rewards/margins": 1.269607663154602, "rewards/rejected": -2.43410062789917, "step": 3270 }, { "epoch": 3.56, "grad_norm": 5.405068734416844, "learning_rate": 1.8322372750677247e-07, "logits/chosen": 3.346701145172119, "logits/rejected": 3.2889461517333984, "logps/chosen": -687.7290649414062, "logps/rejected": -670.6973876953125, "loss": 0.4369, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.013751745223999, "rewards/margins": 1.1379069089889526, "rewards/rejected": -2.151658535003662, "step": 3280 }, { "epoch": 3.57, "grad_norm": 4.239245009187299, "learning_rate": 1.8182997322727828e-07, "logits/chosen": 3.3153834342956543, "logits/rejected": 3.370880603790283, "logps/chosen": -688.4138793945312, "logps/rejected": -635.6961669921875, "loss": 0.4318, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9035437703132629, "rewards/margins": 1.1481924057006836, "rewards/rejected": -2.0517361164093018, "step": 3290 }, { "epoch": 3.58, "grad_norm": 4.219791148370747, "learning_rate": 1.8043850419094478e-07, "logits/chosen": 3.151858329772949, "logits/rejected": 3.2416319847106934, "logps/chosen": -656.4683227539062, "logps/rejected": -635.33740234375, "loss": 0.4338, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9841516613960266, "rewards/margins": 1.0485494136810303, "rewards/rejected": -2.032701015472412, "step": 3300 }, { "epoch": 3.58, "eval_logits/chosen": 3.409619092941284, "eval_logits/rejected": 3.447129249572754, "eval_logps/chosen": -631.173095703125, "eval_logps/rejected": -608.6550903320312, "eval_loss": 0.47853177785873413, "eval_rewards/accuracies": 0.795634925365448, "eval_rewards/chosen": -0.878362238407135, "eval_rewards/margins": 0.967444121837616, "eval_rewards/rejected": -1.8458064794540405, "eval_runtime": 203.0003, "eval_samples_per_second": 9.852, "eval_steps_per_second": 0.31, "step": 3300 }, { "epoch": 3.59, "grad_norm": 3.502741163655858, "learning_rate": 1.7904936704356715e-07, "logits/chosen": 3.304037570953369, "logits/rejected": 3.3648738861083984, "logps/chosen": -668.6808471679688, "logps/rejected": -606.8001098632812, "loss": 0.4399, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7760350108146667, "rewards/margins": 0.9891276359558105, "rewards/rejected": -1.765162706375122, "step": 3310 }, { "epoch": 3.6, "grad_norm": 3.2909726228421965, "learning_rate": 1.7766260835276919e-07, "logits/chosen": 3.3433239459991455, "logits/rejected": 3.241948366165161, "logps/chosen": -656.866943359375, "logps/rejected": -676.34765625, "loss": 0.4217, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8042502403259277, "rewards/margins": 1.105515956878662, "rewards/rejected": -1.909766435623169, "step": 3320 }, { "epoch": 3.62, "grad_norm": 4.470996733143754, "learning_rate": 1.7627827460644256e-07, "logits/chosen": 3.405937671661377, "logits/rejected": 3.56890869140625, "logps/chosen": -654.4332885742188, "logps/rejected": -613.1698608398438, "loss": 0.4156, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8114805221557617, "rewards/margins": 1.1845698356628418, "rewards/rejected": -1.996050238609314, "step": 3330 }, { "epoch": 3.63, "grad_norm": 4.0251845812339155, "learning_rate": 1.7489641221118807e-07, "logits/chosen": 3.2184112071990967, "logits/rejected": 3.236548900604248, "logps/chosen": -621.3856201171875, "logps/rejected": -585.7027587890625, "loss": 0.4408, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9119928479194641, "rewards/margins": 1.1261918544769287, "rewards/rejected": -2.038184642791748, "step": 3340 }, { "epoch": 3.64, "grad_norm": 3.0931365871130216, "learning_rate": 1.7351706749076034e-07, "logits/chosen": 3.437945604324341, "logits/rejected": 3.340913772583008, "logps/chosen": -591.0047607421875, "logps/rejected": -596.6959228515625, "loss": 0.4213, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9432751536369324, "rewards/margins": 0.9771154522895813, "rewards/rejected": -1.9203904867172241, "step": 3350 }, { "epoch": 3.65, "grad_norm": 4.426085674307634, "learning_rate": 1.7214028668451463e-07, "logits/chosen": 3.311800003051758, "logits/rejected": 3.3910815715789795, "logps/chosen": -671.0889892578125, "logps/rejected": -613.1412353515625, "loss": 0.4266, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.7964714169502258, "rewards/margins": 1.3165581226348877, "rewards/rejected": -2.1130294799804688, "step": 3360 }, { "epoch": 3.66, "grad_norm": 5.41069681121707, "learning_rate": 1.707661159458569e-07, "logits/chosen": 3.3232052326202393, "logits/rejected": 3.3433849811553955, "logps/chosen": -629.1781005859375, "logps/rejected": -612.620849609375, "loss": 0.45, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8367223739624023, "rewards/margins": 0.9473745226860046, "rewards/rejected": -1.7840969562530518, "step": 3370 }, { "epoch": 3.67, "grad_norm": 3.743840034344002, "learning_rate": 1.693946013406967e-07, "logits/chosen": 3.320786714553833, "logits/rejected": 3.3713016510009766, "logps/chosen": -645.5264892578125, "logps/rejected": -607.998291015625, "loss": 0.421, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8214617967605591, "rewards/margins": 1.0396496057510376, "rewards/rejected": -1.8611112833023071, "step": 3380 }, { "epoch": 3.68, "grad_norm": 3.799964746211055, "learning_rate": 1.6802578884590266e-07, "logits/chosen": 3.4401779174804688, "logits/rejected": 3.4008584022521973, "logps/chosen": -610.3036499023438, "logps/rejected": -604.7216796875, "loss": 0.4236, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7632851004600525, "rewards/margins": 1.1178282499313354, "rewards/rejected": -1.8811134099960327, "step": 3390 }, { "epoch": 3.69, "grad_norm": 4.5967188360385585, "learning_rate": 1.6665972434776154e-07, "logits/chosen": 3.250286817550659, "logits/rejected": 3.3413288593292236, "logps/chosen": -614.2950439453125, "logps/rejected": -629.7451782226562, "loss": 0.4297, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.9370313882827759, "rewards/margins": 1.132611870765686, "rewards/rejected": -2.069643020629883, "step": 3400 }, { "epoch": 3.69, "eval_logits/chosen": 3.432600259780884, "eval_logits/rejected": 3.471015691757202, "eval_logps/chosen": -633.59619140625, "eval_logps/rejected": -613.3634033203125, "eval_loss": 0.477384477853775, "eval_rewards/accuracies": 0.795634925365448, "eval_rewards/chosen": -0.9025925397872925, "eval_rewards/margins": 0.990296483039856, "eval_rewards/rejected": -1.8928890228271484, "eval_runtime": 202.9773, "eval_samples_per_second": 9.853, "eval_steps_per_second": 0.31, "step": 3400 }, { "epoch": 3.7, "grad_norm": 3.5030890328455397, "learning_rate": 1.652964536404397e-07, "logits/chosen": 3.3207294940948486, "logits/rejected": 3.187941312789917, "logps/chosen": -633.8218994140625, "logps/rejected": -643.2042236328125, "loss": 0.4357, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.082877516746521, "rewards/margins": 0.9908887147903442, "rewards/rejected": -2.0737662315368652, "step": 3410 }, { "epoch": 3.71, "grad_norm": 4.5600807867940265, "learning_rate": 1.6393602242444826e-07, "logits/chosen": 3.4069080352783203, "logits/rejected": 3.401517868041992, "logps/chosen": -687.3406982421875, "logps/rejected": -671.100830078125, "loss": 0.4146, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.9297691583633423, "rewards/margins": 1.207360863685608, "rewards/rejected": -2.1371302604675293, "step": 3420 }, { "epoch": 3.72, "grad_norm": 3.7194151093988306, "learning_rate": 1.625784763051108e-07, "logits/chosen": 3.2494473457336426, "logits/rejected": 3.287376880645752, "logps/chosen": -664.7005615234375, "logps/rejected": -613.7723999023438, "loss": 0.4432, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0459792613983154, "rewards/margins": 1.1180553436279297, "rewards/rejected": -2.164034605026245, "step": 3430 }, { "epoch": 3.74, "grad_norm": 3.819169373247336, "learning_rate": 1.6122386079103466e-07, "logits/chosen": 3.338268280029297, "logits/rejected": 3.398308515548706, "logps/chosen": -608.5794067382812, "logps/rejected": -587.7130737304688, "loss": 0.4226, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9657732844352722, "rewards/margins": 0.9586571455001831, "rewards/rejected": -1.9244304895401, "step": 3440 }, { "epoch": 3.75, "grad_norm": 3.7443775545990237, "learning_rate": 1.5987222129258548e-07, "logits/chosen": 3.3651328086853027, "logits/rejected": 3.421379804611206, "logps/chosen": -688.8221435546875, "logps/rejected": -598.705322265625, "loss": 0.4318, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8808202743530273, "rewards/margins": 1.1631778478622437, "rewards/rejected": -2.0439980030059814, "step": 3450 }, { "epoch": 3.76, "grad_norm": 3.375910739290284, "learning_rate": 1.585236031203648e-07, "logits/chosen": 3.4626574516296387, "logits/rejected": 3.4364490509033203, "logps/chosen": -666.2822265625, "logps/rejected": -651.7742919921875, "loss": 0.4027, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9553236961364746, "rewards/margins": 1.0626842975616455, "rewards/rejected": -2.018007755279541, "step": 3460 }, { "epoch": 3.77, "grad_norm": 5.077748851404105, "learning_rate": 1.571780514836912e-07, "logits/chosen": 3.360103130340576, "logits/rejected": 3.424436569213867, "logps/chosen": -623.270751953125, "logps/rejected": -584.3328857421875, "loss": 0.4326, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0589148998260498, "rewards/margins": 0.9808281660079956, "rewards/rejected": -2.039742946624756, "step": 3470 }, { "epoch": 3.78, "grad_norm": 4.37910107687616, "learning_rate": 1.5583561148908456e-07, "logits/chosen": 3.3990330696105957, "logits/rejected": 3.3405818939208984, "logps/chosen": -691.7584228515625, "logps/rejected": -686.2445068359375, "loss": 0.4209, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9749807119369507, "rewards/margins": 1.1193406581878662, "rewards/rejected": -2.0943212509155273, "step": 3480 }, { "epoch": 3.79, "grad_norm": 5.237860985497682, "learning_rate": 1.5449632813875435e-07, "logits/chosen": 3.300654172897339, "logits/rejected": 3.2150654792785645, "logps/chosen": -606.1732788085938, "logps/rejected": -579.9727783203125, "loss": 0.4591, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0850937366485596, "rewards/margins": 0.8942669630050659, "rewards/rejected": -1.9793609380722046, "step": 3490 }, { "epoch": 3.8, "grad_norm": 3.343823323897354, "learning_rate": 1.531602463290906e-07, "logits/chosen": 3.3689708709716797, "logits/rejected": 3.375143051147461, "logps/chosen": -665.4837036132812, "logps/rejected": -624.3529052734375, "loss": 0.4133, "rewards/accuracies": 0.875, "rewards/chosen": -1.0091758966445923, "rewards/margins": 1.250361442565918, "rewards/rejected": -2.2595372200012207, "step": 3500 }, { "epoch": 3.8, "eval_logits/chosen": 3.4232242107391357, "eval_logits/rejected": 3.460996627807617, "eval_logps/chosen": -635.0674438476562, "eval_logps/rejected": -614.7964477539062, "eval_loss": 0.478471577167511, "eval_rewards/accuracies": 0.7936508059501648, "eval_rewards/chosen": -0.9173057079315186, "eval_rewards/margins": 0.9899141788482666, "eval_rewards/rejected": -1.9072200059890747, "eval_runtime": 202.9263, "eval_samples_per_second": 9.856, "eval_steps_per_second": 0.31, "step": 3500 }, { "epoch": 3.81, "grad_norm": 3.4095309561138643, "learning_rate": 1.5182741084915916e-07, "logits/chosen": 3.3521697521209717, "logits/rejected": 3.3369452953338623, "logps/chosen": -695.1788330078125, "logps/rejected": -645.53125, "loss": 0.4328, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.7928581833839417, "rewards/margins": 0.9856538772583008, "rewards/rejected": -1.7785122394561768, "step": 3510 }, { "epoch": 3.82, "grad_norm": 3.8735531647630554, "learning_rate": 1.5049786637920023e-07, "logits/chosen": 3.3590404987335205, "logits/rejected": 3.385971784591675, "logps/chosen": -658.2880249023438, "logps/rejected": -650.6922607421875, "loss": 0.4226, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.9198244214057922, "rewards/margins": 1.0969880819320679, "rewards/rejected": -2.016812562942505, "step": 3520 }, { "epoch": 3.83, "grad_norm": 3.9959550332302243, "learning_rate": 1.4917165748913027e-07, "logits/chosen": 3.255180835723877, "logits/rejected": 3.1931753158569336, "logps/chosen": -608.6309204101562, "logps/rejected": -671.03076171875, "loss": 0.4369, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8883172869682312, "rewards/margins": 1.0490520000457764, "rewards/rejected": -1.9373695850372314, "step": 3530 }, { "epoch": 3.84, "grad_norm": 5.368006978349376, "learning_rate": 1.4784882863704837e-07, "logits/chosen": 3.223203182220459, "logits/rejected": 3.1401946544647217, "logps/chosen": -638.2977905273438, "logps/rejected": -630.3391723632812, "loss": 0.4292, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0007436275482178, "rewards/margins": 1.2180492877960205, "rewards/rejected": -2.2187929153442383, "step": 3540 }, { "epoch": 3.85, "grad_norm": 5.818751422041991, "learning_rate": 1.4652942416774538e-07, "logits/chosen": 3.483919858932495, "logits/rejected": 3.515242338180542, "logps/chosen": -657.88818359375, "logps/rejected": -634.6720581054688, "loss": 0.4236, "rewards/accuracies": 0.8125, "rewards/chosen": -0.857334315776825, "rewards/margins": 1.2372469902038574, "rewards/rejected": -2.094581127166748, "step": 3550 }, { "epoch": 3.87, "grad_norm": 4.515749178887459, "learning_rate": 1.452134883112178e-07, "logits/chosen": 3.3508925437927246, "logits/rejected": 3.3791816234588623, "logps/chosen": -668.8388671875, "logps/rejected": -632.3463134765625, "loss": 0.4399, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8480029106140137, "rewards/margins": 1.2185410261154175, "rewards/rejected": -2.0665438175201416, "step": 3560 }, { "epoch": 3.88, "grad_norm": 4.067413866713914, "learning_rate": 1.4390106518118473e-07, "logits/chosen": 3.379002094268799, "logits/rejected": 3.3554329872131348, "logps/chosen": -599.6312255859375, "logps/rejected": -620.1608276367188, "loss": 0.439, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.9548500180244446, "rewards/margins": 1.1053146123886108, "rewards/rejected": -2.0601646900177, "step": 3570 }, { "epoch": 3.89, "grad_norm": 5.089219361186417, "learning_rate": 1.4259219877360934e-07, "logits/chosen": 3.4198906421661377, "logits/rejected": 3.3556511402130127, "logps/chosen": -652.1272583007812, "logps/rejected": -668.5350952148438, "loss": 0.4417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8929985761642456, "rewards/margins": 1.2331557273864746, "rewards/rejected": -2.1261544227600098, "step": 3580 }, { "epoch": 3.9, "grad_norm": 4.432906259186991, "learning_rate": 1.4128693296522364e-07, "logits/chosen": 3.2374885082244873, "logits/rejected": 3.130519390106201, "logps/chosen": -585.5250854492188, "logps/rejected": -623.3826293945312, "loss": 0.4117, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0717624425888062, "rewards/margins": 1.0361723899841309, "rewards/rejected": -2.1079349517822266, "step": 3590 }, { "epoch": 3.91, "grad_norm": 3.636654655309098, "learning_rate": 1.3998531151205805e-07, "logits/chosen": 3.287585496902466, "logits/rejected": 3.278582811355591, "logps/chosen": -686.5152587890625, "logps/rejected": -646.55908203125, "loss": 0.4275, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.1729423999786377, "rewards/margins": 1.310024619102478, "rewards/rejected": -2.482966899871826, "step": 3600 }, { "epoch": 3.91, "eval_logits/chosen": 3.422714948654175, "eval_logits/rejected": 3.4634690284729004, "eval_logps/chosen": -645.4227294921875, "eval_logps/rejected": -627.8748168945312, "eval_loss": 0.47942695021629333, "eval_rewards/accuracies": 0.783730149269104, "eval_rewards/chosen": -1.020858645439148, "eval_rewards/margins": 1.0171442031860352, "eval_rewards/rejected": -2.0380029678344727, "eval_runtime": 202.8836, "eval_samples_per_second": 9.858, "eval_steps_per_second": 0.311, "step": 3600 }, { "epoch": 3.92, "grad_norm": 4.160209491669761, "learning_rate": 1.3868737804797454e-07, "logits/chosen": 3.408064603805542, "logits/rejected": 3.399200439453125, "logps/chosen": -596.4379272460938, "logps/rejected": -607.2189331054688, "loss": 0.4449, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0274633169174194, "rewards/margins": 1.101192593574524, "rewards/rejected": -2.1286559104919434, "step": 3610 }, { "epoch": 3.93, "grad_norm": 3.7399365896773533, "learning_rate": 1.3739317608320317e-07, "logits/chosen": 3.3484718799591064, "logits/rejected": 3.4230563640594482, "logps/chosen": -610.9447021484375, "logps/rejected": -623.60986328125, "loss": 0.4218, "rewards/accuracies": 0.8125, "rewards/chosen": -0.981761634349823, "rewards/margins": 0.9789684414863586, "rewards/rejected": -1.9607301950454712, "step": 3620 }, { "epoch": 3.94, "grad_norm": 4.638442822469067, "learning_rate": 1.3610274900288465e-07, "logits/chosen": 3.3025145530700684, "logits/rejected": 3.3533711433410645, "logps/chosen": -610.4231567382812, "logps/rejected": -615.0877685546875, "loss": 0.4159, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0207935571670532, "rewards/margins": 1.1630699634552002, "rewards/rejected": -2.183863401412964, "step": 3630 }, { "epoch": 3.95, "grad_norm": 5.474794030062789, "learning_rate": 1.3481614006561518e-07, "logits/chosen": 3.3728854656219482, "logits/rejected": 3.40693736076355, "logps/chosen": -727.2763671875, "logps/rejected": -657.0306396484375, "loss": 0.4278, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9185264706611633, "rewards/margins": 1.1274440288543701, "rewards/rejected": -2.0459704399108887, "step": 3640 }, { "epoch": 3.96, "grad_norm": 6.250079676188916, "learning_rate": 1.3353339240199633e-07, "logits/chosen": 3.205505847930908, "logits/rejected": 3.2889225482940674, "logps/chosen": -590.033447265625, "logps/rejected": -564.714111328125, "loss": 0.4495, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1220542192459106, "rewards/margins": 0.9148964881896973, "rewards/rejected": -2.0369505882263184, "step": 3650 }, { "epoch": 3.97, "grad_norm": 4.8571334532279655, "learning_rate": 1.322545490131896e-07, "logits/chosen": 3.418250322341919, "logits/rejected": 3.4689507484436035, "logps/chosen": -655.3072509765625, "logps/rejected": -578.1636962890625, "loss": 0.4413, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8048042058944702, "rewards/margins": 1.078054428100586, "rewards/rejected": -1.8828586339950562, "step": 3660 }, { "epoch": 3.98, "grad_norm": 3.7546128361643647, "learning_rate": 1.309796527694746e-07, "logits/chosen": 3.2979984283447266, "logits/rejected": 3.276188611984253, "logps/chosen": -575.6425170898438, "logps/rejected": -578.5830078125, "loss": 0.4413, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.0049642324447632, "rewards/margins": 1.0463021993637085, "rewards/rejected": -2.0512664318084717, "step": 3670 }, { "epoch": 4.0, "grad_norm": 4.927909946213724, "learning_rate": 1.2970874640881205e-07, "logits/chosen": 3.393319606781006, "logits/rejected": 3.380615234375, "logps/chosen": -633.817626953125, "logps/rejected": -645.5632934570312, "loss": 0.4437, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8656120300292969, "rewards/margins": 1.1373414993286133, "rewards/rejected": -2.00295352935791, "step": 3680 }, { "epoch": 4.01, "grad_norm": 3.635090670963502, "learning_rate": 1.2844187253541081e-07, "logits/chosen": 3.307116985321045, "logits/rejected": 3.3995635509490967, "logps/chosen": -668.4981689453125, "logps/rejected": -595.920166015625, "loss": 0.4335, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8938275575637817, "rewards/margins": 1.0600321292877197, "rewards/rejected": -1.9538596868515015, "step": 3690 }, { "epoch": 4.02, "grad_norm": 4.379048669567367, "learning_rate": 1.271790736183001e-07, "logits/chosen": 3.482682704925537, "logits/rejected": 3.450831174850464, "logps/chosen": -650.8020629882812, "logps/rejected": -644.2947998046875, "loss": 0.4224, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8331559896469116, "rewards/margins": 1.0969077348709106, "rewards/rejected": -1.9300638437271118, "step": 3700 }, { "epoch": 4.02, "eval_logits/chosen": 3.4400007724761963, "eval_logits/rejected": 3.481160879135132, "eval_logps/chosen": -634.6395874023438, "eval_logps/rejected": -614.9320068359375, "eval_loss": 0.47838032245635986, "eval_rewards/accuracies": 0.7936508059501648, "eval_rewards/chosen": -0.9130271673202515, "eval_rewards/margins": 0.9955475926399231, "eval_rewards/rejected": -1.9085748195648193, "eval_runtime": 202.7333, "eval_samples_per_second": 9.865, "eval_steps_per_second": 0.311, "step": 3700 }, { "epoch": 4.03, "grad_norm": 4.744919850350896, "learning_rate": 1.2592039198990567e-07, "logits/chosen": 3.4300265312194824, "logits/rejected": 3.4588325023651123, "logps/chosen": -582.3802490234375, "logps/rejected": -574.881103515625, "loss": 0.4068, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9523553848266602, "rewards/margins": 1.1304038763046265, "rewards/rejected": -2.082759380340576, "step": 3710 }, { "epoch": 4.04, "grad_norm": 4.2781980433834175, "learning_rate": 1.2466586984463033e-07, "logits/chosen": 3.283041477203369, "logits/rejected": 3.3292555809020996, "logps/chosen": -655.4451904296875, "logps/rejected": -611.529296875, "loss": 0.4244, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.9574171304702759, "rewards/margins": 1.0868561267852783, "rewards/rejected": -2.0442731380462646, "step": 3720 }, { "epoch": 4.05, "grad_norm": 4.884052600454236, "learning_rate": 1.2341554923744007e-07, "logits/chosen": 3.3003993034362793, "logits/rejected": 3.383582353591919, "logps/chosen": -637.0278930664062, "logps/rejected": -624.9862060546875, "loss": 0.4397, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.011597990989685, "rewards/margins": 1.0633455514907837, "rewards/rejected": -2.074943780899048, "step": 3730 }, { "epoch": 4.06, "grad_norm": 3.652258541540437, "learning_rate": 1.2216947208245395e-07, "logits/chosen": 3.4180221557617188, "logits/rejected": 3.314668655395508, "logps/chosen": -626.7178955078125, "logps/rejected": -600.8445434570312, "loss": 0.4228, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0129839181900024, "rewards/margins": 1.0503250360488892, "rewards/rejected": -2.0633087158203125, "step": 3740 }, { "epoch": 4.07, "grad_norm": 4.5189078365069255, "learning_rate": 1.2092768015153913e-07, "logits/chosen": 3.302248477935791, "logits/rejected": 3.437277317047119, "logps/chosen": -627.9656372070312, "logps/rejected": -564.1021728515625, "loss": 0.4134, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.9830034375190735, "rewards/margins": 1.0382428169250488, "rewards/rejected": -2.0212459564208984, "step": 3750 }, { "epoch": 4.08, "grad_norm": 4.651438848297526, "learning_rate": 1.1969021507291018e-07, "logits/chosen": 3.34623384475708, "logits/rejected": 3.4822335243225098, "logps/chosen": -670.3914794921875, "logps/rejected": -595.1871948242188, "loss": 0.4345, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9710467457771301, "rewards/margins": 1.094674825668335, "rewards/rejected": -2.0657215118408203, "step": 3760 }, { "epoch": 4.09, "grad_norm": 4.831404139992892, "learning_rate": 1.1845711832973429e-07, "logits/chosen": 3.3657729625701904, "logits/rejected": 3.376382827758789, "logps/chosen": -630.7979736328125, "logps/rejected": -632.58154296875, "loss": 0.4056, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.9402076005935669, "rewards/margins": 1.0973269939422607, "rewards/rejected": -2.037534713745117, "step": 3770 }, { "epoch": 4.1, "grad_norm": 5.832474258246264, "learning_rate": 1.1722843125874016e-07, "logits/chosen": 3.3085269927978516, "logits/rejected": 3.3649120330810547, "logps/chosen": -656.54150390625, "logps/rejected": -626.4422607421875, "loss": 0.4198, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.949749767780304, "rewards/margins": 1.1984117031097412, "rewards/rejected": -2.1481614112854004, "step": 3780 }, { "epoch": 4.12, "grad_norm": 4.985892937566476, "learning_rate": 1.1600419504883215e-07, "logits/chosen": 3.4230434894561768, "logits/rejected": 3.4386258125305176, "logps/chosen": -670.7752685546875, "logps/rejected": -629.9710693359375, "loss": 0.4387, "rewards/accuracies": 0.78125, "rewards/chosen": -1.013933777809143, "rewards/margins": 1.1262916326522827, "rewards/rejected": -2.140225410461426, "step": 3790 }, { "epoch": 4.13, "grad_norm": 4.608938175733721, "learning_rate": 1.1478445073971007e-07, "logits/chosen": 3.133939743041992, "logits/rejected": 3.20147967338562, "logps/chosen": -592.5680541992188, "logps/rejected": -570.9287109375, "loss": 0.4101, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0050890445709229, "rewards/margins": 1.0093024969100952, "rewards/rejected": -2.0143914222717285, "step": 3800 }, { "epoch": 4.13, "eval_logits/chosen": 3.422454833984375, "eval_logits/rejected": 3.4569199085235596, "eval_logps/chosen": -638.0772094726562, "eval_logps/rejected": -619.7818603515625, "eval_loss": 0.47731995582580566, "eval_rewards/accuracies": 0.7876983880996704, "eval_rewards/chosen": -0.9474031925201416, "eval_rewards/margins": 1.0096713304519653, "eval_rewards/rejected": -1.957074522972107, "eval_runtime": 202.9426, "eval_samples_per_second": 9.855, "eval_steps_per_second": 0.31, "step": 3800 }, { "epoch": 4.14, "grad_norm": 4.266967058847925, "learning_rate": 1.1356923922049297e-07, "logits/chosen": 3.4540011882781982, "logits/rejected": 3.403491258621216, "logps/chosen": -628.2205810546875, "logps/rejected": -639.0848999023438, "loss": 0.4002, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9759773015975952, "rewards/margins": 1.1415761709213257, "rewards/rejected": -2.117553234100342, "step": 3810 }, { "epoch": 4.15, "grad_norm": 4.27011854566667, "learning_rate": 1.1235860122834858e-07, "logits/chosen": 3.3374085426330566, "logits/rejected": 3.3360908031463623, "logps/chosen": -704.3816528320312, "logps/rejected": -671.1971435546875, "loss": 0.4087, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0669662952423096, "rewards/margins": 1.1149975061416626, "rewards/rejected": -2.1819636821746826, "step": 3820 }, { "epoch": 4.16, "grad_norm": 4.340973016665936, "learning_rate": 1.1115257734712755e-07, "logits/chosen": 3.3176560401916504, "logits/rejected": 3.362886428833008, "logps/chosen": -719.64208984375, "logps/rejected": -633.7008056640625, "loss": 0.4215, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.0610309839248657, "rewards/margins": 1.3459486961364746, "rewards/rejected": -2.4069790840148926, "step": 3830 }, { "epoch": 4.17, "grad_norm": 3.5539401102691444, "learning_rate": 1.0995120800600322e-07, "logits/chosen": 3.295255661010742, "logits/rejected": 3.3017280101776123, "logps/chosen": -630.6492919921875, "logps/rejected": -632.7191772460938, "loss": 0.41, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0907437801361084, "rewards/margins": 1.179003357887268, "rewards/rejected": -2.269747257232666, "step": 3840 }, { "epoch": 4.18, "grad_norm": 5.480402411765085, "learning_rate": 1.0875453347811623e-07, "logits/chosen": 3.3341212272644043, "logits/rejected": 3.3940231800079346, "logps/chosen": -640.7610473632812, "logps/rejected": -615.4697265625, "loss": 0.4057, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9333788752555847, "rewards/margins": 1.2855052947998047, "rewards/rejected": -2.218884229660034, "step": 3850 }, { "epoch": 4.19, "grad_norm": 4.0247985991808335, "learning_rate": 1.0756259387922417e-07, "logits/chosen": 3.4757308959960938, "logits/rejected": 3.3736164569854736, "logps/chosen": -592.9967651367188, "logps/rejected": -604.2167358398438, "loss": 0.4186, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.9449540376663208, "rewards/margins": 1.1130799055099487, "rewards/rejected": -2.0580339431762695, "step": 3860 }, { "epoch": 4.2, "grad_norm": 3.808674396015461, "learning_rate": 1.0637542916635733e-07, "logits/chosen": 3.385357618331909, "logits/rejected": 3.3909316062927246, "logps/chosen": -614.0596313476562, "logps/rejected": -613.41015625, "loss": 0.4293, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7040565609931946, "rewards/margins": 1.097634196281433, "rewards/rejected": -1.801690697669983, "step": 3870 }, { "epoch": 4.21, "grad_norm": 3.7696858880428796, "learning_rate": 1.051930791364788e-07, "logits/chosen": 3.235701084136963, "logits/rejected": 3.214096784591675, "logps/chosen": -655.4987182617188, "logps/rejected": -564.7158813476562, "loss": 0.4096, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9135665893554688, "rewards/margins": 1.1304066181182861, "rewards/rejected": -2.043973445892334, "step": 3880 }, { "epoch": 4.22, "grad_norm": 6.8837413169779955, "learning_rate": 1.0401558342515063e-07, "logits/chosen": 3.31986927986145, "logits/rejected": 3.338214874267578, "logps/chosen": -690.4530029296875, "logps/rejected": -661.3316650390625, "loss": 0.4307, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.9339920282363892, "rewards/margins": 1.0488418340682983, "rewards/rejected": -1.9828341007232666, "step": 3890 }, { "epoch": 4.23, "grad_norm": 4.231560477523635, "learning_rate": 1.028429815052047e-07, "logits/chosen": 3.285670518875122, "logits/rejected": 3.300931215286255, "logps/chosen": -687.8453369140625, "logps/rejected": -601.5693969726562, "loss": 0.4295, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.9780045747756958, "rewards/margins": 1.2776046991348267, "rewards/rejected": -2.2556090354919434, "step": 3900 }, { "epoch": 4.23, "eval_logits/chosen": 3.3998217582702637, "eval_logits/rejected": 3.428964376449585, "eval_logps/chosen": -642.2666015625, "eval_logps/rejected": -625.0360717773438, "eval_loss": 0.47901660203933716, "eval_rewards/accuracies": 0.795634925365448, "eval_rewards/chosen": -0.9892975091934204, "eval_rewards/margins": 1.020318865776062, "eval_rewards/rejected": -2.0096163749694824, "eval_runtime": 202.8405, "eval_samples_per_second": 9.86, "eval_steps_per_second": 0.311, "step": 3900 }, { "epoch": 4.25, "grad_norm": 4.84330652049246, "learning_rate": 1.0167531268542026e-07, "logits/chosen": 3.4341633319854736, "logits/rejected": 3.386991024017334, "logps/chosen": -623.751708984375, "logps/rejected": -648.4140625, "loss": 0.4191, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9651447534561157, "rewards/margins": 1.2494183778762817, "rewards/rejected": -2.2145628929138184, "step": 3910 }, { "epoch": 4.26, "grad_norm": 4.525568517684105, "learning_rate": 1.005126161092053e-07, "logits/chosen": 3.2907967567443848, "logits/rejected": 3.3596577644348145, "logps/chosen": -630.5189208984375, "logps/rejected": -607.6754150390625, "loss": 0.4117, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0380223989486694, "rewards/margins": 1.147351622581482, "rewards/rejected": -2.1853740215301514, "step": 3920 }, { "epoch": 4.27, "grad_norm": 4.723979298330365, "learning_rate": 9.935493075328518e-08, "logits/chosen": 3.127185821533203, "logits/rejected": 3.149927854537964, "logps/chosen": -591.57177734375, "logps/rejected": -588.5655517578125, "loss": 0.4143, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.0499693155288696, "rewards/margins": 1.2105190753936768, "rewards/rejected": -2.260488510131836, "step": 3930 }, { "epoch": 4.28, "grad_norm": 4.2884489136695185, "learning_rate": 9.820229542639529e-08, "logits/chosen": 3.2507805824279785, "logits/rejected": 3.0884203910827637, "logps/chosen": -626.3121337890625, "logps/rejected": -596.10595703125, "loss": 0.4207, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.01030695438385, "rewards/margins": 1.1719428300857544, "rewards/rejected": -2.1822495460510254, "step": 3940 }, { "epoch": 4.29, "grad_norm": 3.9822564622794254, "learning_rate": 9.705474876798068e-08, "logits/chosen": 3.262510299682617, "logits/rejected": 3.3105883598327637, "logps/chosen": -607.6744384765625, "logps/rejected": -613.1331787109375, "loss": 0.4231, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.054054856300354, "rewards/margins": 1.0771335363388062, "rewards/rejected": -2.131188154220581, "step": 3950 }, { "epoch": 4.3, "grad_norm": 4.49091860837814, "learning_rate": 9.591232924690037e-08, "logits/chosen": 3.3651633262634277, "logits/rejected": 3.3389458656311035, "logps/chosen": -629.3226318359375, "logps/rejected": -607.5804443359375, "loss": 0.4161, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6730826497077942, "rewards/margins": 1.1576800346374512, "rewards/rejected": -1.8307626247406006, "step": 3960 }, { "epoch": 4.31, "grad_norm": 5.515041237259014, "learning_rate": 9.477507516013811e-08, "logits/chosen": 3.480274200439453, "logits/rejected": 3.4106287956237793, "logps/chosen": -641.4927978515625, "logps/rejected": -647.2362060546875, "loss": 0.4005, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.8588997721672058, "rewards/margins": 1.1452442407608032, "rewards/rejected": -2.0041441917419434, "step": 3970 }, { "epoch": 4.32, "grad_norm": 5.163689090047744, "learning_rate": 9.3643024631518e-08, "logits/chosen": 3.230050563812256, "logits/rejected": 3.2616755962371826, "logps/chosen": -614.7962646484375, "logps/rejected": -592.8779907226562, "loss": 0.4113, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.949069619178772, "rewards/margins": 1.0197830200195312, "rewards/rejected": -1.9688526391983032, "step": 3980 }, { "epoch": 4.33, "grad_norm": 4.902733937586537, "learning_rate": 9.251621561042716e-08, "logits/chosen": 3.240537643432617, "logits/rejected": 3.270631790161133, "logps/chosen": -628.7385864257812, "logps/rejected": -608.5638427734375, "loss": 0.4175, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9540501832962036, "rewards/margins": 1.1350681781768799, "rewards/rejected": -2.089118480682373, "step": 3990 }, { "epoch": 4.34, "grad_norm": 4.454837215674531, "learning_rate": 9.139468587054317e-08, "logits/chosen": 3.2771542072296143, "logits/rejected": 3.2822394371032715, "logps/chosen": -646.6663818359375, "logps/rejected": -622.8614501953125, "loss": 0.4162, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.061971664428711, "rewards/margins": 1.0516541004180908, "rewards/rejected": -2.1136257648468018, "step": 4000 }, { "epoch": 4.34, "eval_logits/chosen": 3.404010772705078, "eval_logits/rejected": 3.4341719150543213, "eval_logps/chosen": -640.1561889648438, "eval_logps/rejected": -623.0465087890625, "eval_loss": 0.47693389654159546, "eval_rewards/accuracies": 0.795634925365448, "eval_rewards/chosen": -0.9681926369667053, "eval_rewards/margins": 1.021527886390686, "eval_rewards/rejected": -1.989720344543457, "eval_runtime": 203.2457, "eval_samples_per_second": 9.84, "eval_steps_per_second": 0.31, "step": 4000 }, { "epoch": 4.35, "grad_norm": 4.998365579065085, "learning_rate": 9.027847300856769e-08, "logits/chosen": 3.236513137817383, "logits/rejected": 3.22514009475708, "logps/chosen": -609.8795776367188, "logps/rejected": -629.8914184570312, "loss": 0.4246, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.075141429901123, "rewards/margins": 1.0727479457855225, "rewards/rejected": -2.1478893756866455, "step": 4010 }, { "epoch": 4.36, "grad_norm": 5.078456843813655, "learning_rate": 8.91676144429665e-08, "logits/chosen": 3.372511386871338, "logits/rejected": 3.374516725540161, "logps/chosen": -650.2095336914062, "logps/rejected": -650.6064453125, "loss": 0.4155, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.0647146701812744, "rewards/margins": 1.0481324195861816, "rewards/rejected": -2.112846851348877, "step": 4020 }, { "epoch": 4.38, "grad_norm": 4.3609569043300045, "learning_rate": 8.806214741271483e-08, "logits/chosen": 3.3630530834198, "logits/rejected": 3.356684923171997, "logps/chosen": -674.0843505859375, "logps/rejected": -627.49169921875, "loss": 0.4247, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.8967161178588867, "rewards/margins": 1.1224383115768433, "rewards/rejected": -2.0191545486450195, "step": 4030 }, { "epoch": 4.39, "grad_norm": 3.7584393944277377, "learning_rate": 8.696210897604922e-08, "logits/chosen": 3.265223741531372, "logits/rejected": 3.300980806350708, "logps/chosen": -633.5123291015625, "logps/rejected": -619.6534423828125, "loss": 0.4159, "rewards/accuracies": 0.8125, "rewards/chosen": -1.03200364112854, "rewards/margins": 1.0300449132919312, "rewards/rejected": -2.0620484352111816, "step": 4040 }, { "epoch": 4.4, "grad_norm": 3.265323883185431, "learning_rate": 8.586753600922486e-08, "logits/chosen": 3.3313965797424316, "logits/rejected": 3.3457539081573486, "logps/chosen": -576.4733276367188, "logps/rejected": -584.6893310546875, "loss": 0.4309, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.8862001299858093, "rewards/margins": 1.171441912651062, "rewards/rejected": -2.0576419830322266, "step": 4050 }, { "epoch": 4.41, "grad_norm": 3.761267636922022, "learning_rate": 8.477846520527984e-08, "logits/chosen": 3.411687135696411, "logits/rejected": 3.3630423545837402, "logps/chosen": -665.51123046875, "logps/rejected": -620.060791015625, "loss": 0.4089, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8109720945358276, "rewards/margins": 1.247515082359314, "rewards/rejected": -2.0584874153137207, "step": 4060 }, { "epoch": 4.42, "grad_norm": 3.778177699748287, "learning_rate": 8.3694933072805e-08, "logits/chosen": 3.2529239654541016, "logits/rejected": 3.324174165725708, "logps/chosen": -619.845703125, "logps/rejected": -602.2738037109375, "loss": 0.4301, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.932303249835968, "rewards/margins": 1.0312929153442383, "rewards/rejected": -1.963595986366272, "step": 4070 }, { "epoch": 4.43, "grad_norm": 5.015487993856013, "learning_rate": 8.261697593471967e-08, "logits/chosen": 3.2463810443878174, "logits/rejected": 3.315427303314209, "logps/chosen": -588.807373046875, "logps/rejected": -591.37890625, "loss": 0.4299, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9638729095458984, "rewards/margins": 1.1048758029937744, "rewards/rejected": -2.068748950958252, "step": 4080 }, { "epoch": 4.44, "grad_norm": 4.276649164182125, "learning_rate": 8.154462992705454e-08, "logits/chosen": 3.343524932861328, "logits/rejected": 3.326464891433716, "logps/chosen": -631.861572265625, "logps/rejected": -643.6549072265625, "loss": 0.4306, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9578490257263184, "rewards/margins": 1.0083853006362915, "rewards/rejected": -1.9662344455718994, "step": 4090 }, { "epoch": 4.45, "grad_norm": 4.593391665388458, "learning_rate": 8.047793099774014e-08, "logits/chosen": 3.2465949058532715, "logits/rejected": 3.3000450134277344, "logps/chosen": -645.3646850585938, "logps/rejected": -608.7596435546875, "loss": 0.425, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.9739478826522827, "rewards/margins": 1.23960280418396, "rewards/rejected": -2.213550567626953, "step": 4100 }, { "epoch": 4.45, "eval_logits/chosen": 3.4237234592437744, "eval_logits/rejected": 3.4579787254333496, "eval_logps/chosen": -638.862060546875, "eval_logps/rejected": -621.9555053710938, "eval_loss": 0.47585567831993103, "eval_rewards/accuracies": 0.7916666865348816, "eval_rewards/chosen": -0.9552515745162964, "eval_rewards/margins": 1.0235581398010254, "eval_rewards/rejected": -1.9788098335266113, "eval_runtime": 202.8726, "eval_samples_per_second": 9.858, "eval_steps_per_second": 0.311, "step": 4100 }, { "epoch": 4.46, "grad_norm": 5.088982868472457, "learning_rate": 7.941691490540161e-08, "logits/chosen": 3.4403469562530518, "logits/rejected": 3.5343425273895264, "logps/chosen": -696.0791625976562, "logps/rejected": -612.7171630859375, "loss": 0.4457, "rewards/accuracies": 0.84375, "rewards/chosen": -1.076757550239563, "rewards/margins": 1.2494875192642212, "rewards/rejected": -2.326245069503784, "step": 4110 }, { "epoch": 4.47, "grad_norm": 3.843403335802077, "learning_rate": 7.836161721815992e-08, "logits/chosen": 3.3702099323272705, "logits/rejected": 3.3862807750701904, "logps/chosen": -700.8905029296875, "logps/rejected": -730.05126953125, "loss": 0.4042, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.902929961681366, "rewards/margins": 1.358778715133667, "rewards/rejected": -2.2617084980010986, "step": 4120 }, { "epoch": 4.48, "grad_norm": 4.310462247780051, "learning_rate": 7.731207331243992e-08, "logits/chosen": 3.2865593433380127, "logits/rejected": 3.242449998855591, "logps/chosen": -678.3230590820312, "logps/rejected": -682.6590576171875, "loss": 0.4111, "rewards/accuracies": 0.875, "rewards/chosen": -0.9124001264572144, "rewards/margins": 1.4731018543243408, "rewards/rejected": -2.3855018615722656, "step": 4130 }, { "epoch": 4.5, "grad_norm": 3.435121047348075, "learning_rate": 7.626831837178413e-08, "logits/chosen": 3.4061710834503174, "logits/rejected": 3.3974239826202393, "logps/chosen": -679.5182495117188, "logps/rejected": -665.0508422851562, "loss": 0.3845, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.8448168039321899, "rewards/margins": 1.3219852447509766, "rewards/rejected": -2.166801929473877, "step": 4140 }, { "epoch": 4.51, "grad_norm": 4.65878306914775, "learning_rate": 7.523038738567317e-08, "logits/chosen": 3.3584542274475098, "logits/rejected": 3.4139747619628906, "logps/chosen": -634.37255859375, "logps/rejected": -603.0714111328125, "loss": 0.4122, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.0318065881729126, "rewards/margins": 1.1386983394622803, "rewards/rejected": -2.1705050468444824, "step": 4150 }, { "epoch": 4.52, "grad_norm": 4.581289609864568, "learning_rate": 7.419831514835318e-08, "logits/chosen": 3.4580376148223877, "logits/rejected": 3.436858654022217, "logps/chosen": -698.56494140625, "logps/rejected": -683.8887939453125, "loss": 0.417, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9824358224868774, "rewards/margins": 1.047573208808899, "rewards/rejected": -2.0300090312957764, "step": 4160 }, { "epoch": 4.53, "grad_norm": 4.181278989327639, "learning_rate": 7.317213625766921e-08, "logits/chosen": 3.445160388946533, "logits/rejected": 3.3971214294433594, "logps/chosen": -610.2749633789062, "logps/rejected": -648.83984375, "loss": 0.4002, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9260603785514832, "rewards/margins": 1.2716095447540283, "rewards/rejected": -2.1976699829101562, "step": 4170 }, { "epoch": 4.54, "grad_norm": 5.517001823658448, "learning_rate": 7.215188511390549e-08, "logits/chosen": 3.3265254497528076, "logits/rejected": 3.3422646522521973, "logps/chosen": -642.5684204101562, "logps/rejected": -604.9553833007812, "loss": 0.4061, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0304888486862183, "rewards/margins": 1.15310800075531, "rewards/rejected": -2.1835970878601074, "step": 4180 }, { "epoch": 4.55, "grad_norm": 5.084928927565557, "learning_rate": 7.113759591863197e-08, "logits/chosen": 3.2932231426239014, "logits/rejected": 3.3378074169158936, "logps/chosen": -646.2611694335938, "logps/rejected": -613.41650390625, "loss": 0.415, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8827333450317383, "rewards/margins": 1.1417460441589355, "rewards/rejected": -2.024479389190674, "step": 4190 }, { "epoch": 4.56, "grad_norm": 6.4229927408586125, "learning_rate": 7.012930267355818e-08, "logits/chosen": 3.3931682109832764, "logits/rejected": 3.4705066680908203, "logps/chosen": -614.5667724609375, "logps/rejected": -627.6751708984375, "loss": 0.4155, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.155367136001587, "rewards/margins": 1.1422535181045532, "rewards/rejected": -2.2976207733154297, "step": 4200 }, { "epoch": 4.56, "eval_logits/chosen": 3.398123025894165, "eval_logits/rejected": 3.427708148956299, "eval_logps/chosen": -645.1696166992188, "eval_logps/rejected": -629.8077392578125, "eval_loss": 0.47777795791625977, "eval_rewards/accuracies": 0.7916666865348816, "eval_rewards/chosen": -1.0183273553848267, "eval_rewards/margins": 1.0390048027038574, "eval_rewards/rejected": -2.0573320388793945, "eval_runtime": 203.136, "eval_samples_per_second": 9.846, "eval_steps_per_second": 0.31, "step": 4200 }, { "epoch": 4.57, "grad_norm": 4.983938050066726, "learning_rate": 6.912703917939331e-08, "logits/chosen": 3.275489330291748, "logits/rejected": 3.266265869140625, "logps/chosen": -667.8606567382812, "logps/rejected": -658.889404296875, "loss": 0.4066, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.0209743976593018, "rewards/margins": 1.134408712387085, "rewards/rejected": -2.1553831100463867, "step": 4210 }, { "epoch": 4.58, "grad_norm": 5.492779356862019, "learning_rate": 6.81308390347127e-08, "logits/chosen": 3.339324951171875, "logits/rejected": 3.317880630493164, "logps/chosen": -618.5656127929688, "logps/rejected": -620.5933837890625, "loss": 0.407, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.9727081060409546, "rewards/margins": 1.3094838857650757, "rewards/rejected": -2.2821919918060303, "step": 4220 }, { "epoch": 4.59, "grad_norm": 4.854526188053133, "learning_rate": 6.714073563483221e-08, "logits/chosen": 3.2255160808563232, "logits/rejected": 3.220808506011963, "logps/chosen": -673.5921630859375, "logps/rejected": -639.7328491210938, "loss": 0.4328, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.1177629232406616, "rewards/margins": 1.1165602207183838, "rewards/rejected": -2.234323024749756, "step": 4230 }, { "epoch": 4.6, "grad_norm": 4.141750829062684, "learning_rate": 6.615676217068833e-08, "logits/chosen": 3.2563107013702393, "logits/rejected": 3.2750449180603027, "logps/chosen": -667.8575439453125, "logps/rejected": -636.2128295898438, "loss": 0.4224, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.1001965999603271, "rewards/margins": 1.0079988241195679, "rewards/rejected": -2.1081955432891846, "step": 4240 }, { "epoch": 4.61, "grad_norm": 4.088192065267914, "learning_rate": 6.517895162772538e-08, "logits/chosen": 3.3319525718688965, "logits/rejected": 3.2376513481140137, "logps/chosen": -616.2135009765625, "logps/rejected": -643.5526733398438, "loss": 0.4049, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.1921319961547852, "rewards/margins": 1.1689560413360596, "rewards/rejected": -2.3610877990722656, "step": 4250 }, { "epoch": 4.63, "grad_norm": 3.7531703944598815, "learning_rate": 6.420733678478995e-08, "logits/chosen": 3.4484024047851562, "logits/rejected": 3.348048448562622, "logps/chosen": -638.4662475585938, "logps/rejected": -627.6661376953125, "loss": 0.4192, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.9249661564826965, "rewards/margins": 1.3605735301971436, "rewards/rejected": -2.2855398654937744, "step": 4260 }, { "epoch": 4.64, "grad_norm": 6.846389928383541, "learning_rate": 6.324195021303225e-08, "logits/chosen": 3.4238860607147217, "logits/rejected": 3.410504102706909, "logps/chosen": -598.2091064453125, "logps/rejected": -618.4214477539062, "loss": 0.4063, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9846474528312683, "rewards/margins": 1.142899513244629, "rewards/rejected": -2.127547025680542, "step": 4270 }, { "epoch": 4.65, "grad_norm": 4.636272897331951, "learning_rate": 6.228282427481394e-08, "logits/chosen": 3.274707317352295, "logits/rejected": 3.250917434692383, "logps/chosen": -642.2122192382812, "logps/rejected": -667.5880737304688, "loss": 0.4132, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9661245346069336, "rewards/margins": 1.148542881011963, "rewards/rejected": -2.1146671772003174, "step": 4280 }, { "epoch": 4.66, "grad_norm": 5.207389231138979, "learning_rate": 6.132999112262339e-08, "logits/chosen": 3.3857948780059814, "logits/rejected": 3.430241107940674, "logps/chosen": -606.1727294921875, "logps/rejected": -611.9073486328125, "loss": 0.3993, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9053149223327637, "rewards/margins": 1.331397294998169, "rewards/rejected": -2.2367124557495117, "step": 4290 }, { "epoch": 4.67, "grad_norm": 4.231587027915136, "learning_rate": 6.038348269799764e-08, "logits/chosen": 3.3768208026885986, "logits/rejected": 3.30268931388855, "logps/chosen": -636.0169067382812, "logps/rejected": -650.2225341796875, "loss": 0.4311, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.9160051345825195, "rewards/margins": 1.140350103378296, "rewards/rejected": -2.0563549995422363, "step": 4300 }, { "epoch": 4.67, "eval_logits/chosen": 3.410733938217163, "eval_logits/rejected": 3.4412810802459717, "eval_logps/chosen": -640.4597778320312, "eval_logps/rejected": -624.7266235351562, "eval_loss": 0.47653162479400635, "eval_rewards/accuracies": 0.7896825671195984, "eval_rewards/chosen": -0.9712289571762085, "eval_rewards/margins": 1.0352927446365356, "eval_rewards/rejected": -2.006521701812744, "eval_runtime": 203.1315, "eval_samples_per_second": 9.846, "eval_steps_per_second": 0.31, "step": 4300 }, { "epoch": 4.68, "grad_norm": 4.973306644416572, "learning_rate": 5.944333073045205e-08, "logits/chosen": 3.374052047729492, "logits/rejected": 3.4084370136260986, "logps/chosen": -642.2147216796875, "logps/rejected": -634.2425537109375, "loss": 0.4347, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.986552357673645, "rewards/margins": 1.129396677017212, "rewards/rejected": -2.1159489154815674, "step": 4310 }, { "epoch": 4.69, "grad_norm": 4.926003781899123, "learning_rate": 5.85095667364163e-08, "logits/chosen": 3.329627513885498, "logits/rejected": 3.2988522052764893, "logps/chosen": -620.7818603515625, "logps/rejected": -593.2532958984375, "loss": 0.3997, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8992365002632141, "rewards/margins": 1.41867995262146, "rewards/rejected": -2.3179163932800293, "step": 4320 }, { "epoch": 4.7, "grad_norm": 4.949352813141063, "learning_rate": 5.758222201817786e-08, "logits/chosen": 3.2874362468719482, "logits/rejected": 3.2021355628967285, "logps/chosen": -618.588623046875, "logps/rejected": -636.9315185546875, "loss": 0.4267, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9578142166137695, "rewards/margins": 0.945949912071228, "rewards/rejected": -1.9037641286849976, "step": 4330 }, { "epoch": 4.71, "grad_norm": 4.199638045038265, "learning_rate": 5.666132766283291e-08, "logits/chosen": 3.2147376537323, "logits/rejected": 3.19602632522583, "logps/chosen": -635.8148803710938, "logps/rejected": -647.5721435546875, "loss": 0.4278, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9341497421264648, "rewards/margins": 1.1103743314743042, "rewards/rejected": -2.0445237159729004, "step": 4340 }, { "epoch": 4.72, "grad_norm": 4.845012036468722, "learning_rate": 5.574691454124397e-08, "logits/chosen": 3.2677807807922363, "logits/rejected": 3.245954990386963, "logps/chosen": -653.6508178710938, "logps/rejected": -608.2535400390625, "loss": 0.4263, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9203804135322571, "rewards/margins": 1.005350112915039, "rewards/rejected": -1.9257304668426514, "step": 4350 }, { "epoch": 4.73, "grad_norm": 4.434984640560382, "learning_rate": 5.48390133070053e-08, "logits/chosen": 3.148699998855591, "logits/rejected": 3.2259299755096436, "logps/chosen": -613.2357177734375, "logps/rejected": -614.7881469726562, "loss": 0.4233, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9823920130729675, "rewards/margins": 1.1206071376800537, "rewards/rejected": -2.102999210357666, "step": 4360 }, { "epoch": 4.74, "grad_norm": 5.431057249316014, "learning_rate": 5.393765439541481e-08, "logits/chosen": 3.2440972328186035, "logits/rejected": 3.2051806449890137, "logps/chosen": -566.9620971679688, "logps/rejected": -601.295654296875, "loss": 0.4347, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.1483008861541748, "rewards/margins": 1.140537977218628, "rewards/rejected": -2.2888388633728027, "step": 4370 }, { "epoch": 4.76, "grad_norm": 4.957593775169598, "learning_rate": 5.304286802245442e-08, "logits/chosen": 3.3060543537139893, "logits/rejected": 3.3490371704101562, "logps/chosen": -645.0230712890625, "logps/rejected": -681.2359008789062, "loss": 0.4001, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.8980600237846375, "rewards/margins": 1.297424077987671, "rewards/rejected": -2.195484161376953, "step": 4380 }, { "epoch": 4.77, "grad_norm": 4.846223069154338, "learning_rate": 5.2154684183776693e-08, "logits/chosen": 3.4804294109344482, "logits/rejected": 3.414719820022583, "logps/chosen": -627.5233154296875, "logps/rejected": -658.6403198242188, "loss": 0.4158, "rewards/accuracies": 0.84375, "rewards/chosen": -0.892591118812561, "rewards/margins": 1.0979639291763306, "rewards/rejected": -1.9905548095703125, "step": 4390 }, { "epoch": 4.78, "grad_norm": 3.9195408526050732, "learning_rate": 5.12731326536994e-08, "logits/chosen": 3.463097095489502, "logits/rejected": 3.391611099243164, "logps/chosen": -655.9578857421875, "logps/rejected": -643.9403686523438, "loss": 0.41, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7878724336624146, "rewards/margins": 1.3385916948318481, "rewards/rejected": -2.126464366912842, "step": 4400 }, { "epoch": 4.78, "eval_logits/chosen": 3.4080650806427, "eval_logits/rejected": 3.438655376434326, "eval_logps/chosen": -640.9733276367188, "eval_logps/rejected": -625.0818481445312, "eval_loss": 0.4768357574939728, "eval_rewards/accuracies": 0.7916666865348816, "eval_rewards/chosen": -0.9763648509979248, "eval_rewards/margins": 1.0337090492248535, "eval_rewards/rejected": -2.010073661804199, "eval_runtime": 203.0745, "eval_samples_per_second": 9.849, "eval_steps_per_second": 0.31, "step": 4400 }, { "epoch": 4.79, "grad_norm": 5.4524290398157955, "learning_rate": 5.0398242984207475e-08, "logits/chosen": 3.3209099769592285, "logits/rejected": 3.2791080474853516, "logps/chosen": -586.2724609375, "logps/rejected": -618.526123046875, "loss": 0.4101, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.0599873065948486, "rewards/margins": 1.16214919090271, "rewards/rejected": -2.2221364974975586, "step": 4410 }, { "epoch": 4.8, "grad_norm": 4.6020179571514275, "learning_rate": 4.953004450396239e-08, "logits/chosen": 3.284672260284424, "logits/rejected": 3.2334282398223877, "logps/chosen": -563.5467529296875, "logps/rejected": -631.9415283203125, "loss": 0.4228, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0434836149215698, "rewards/margins": 1.07113778591156, "rewards/rejected": -2.11462140083313, "step": 4420 }, { "epoch": 4.81, "grad_norm": 4.1241101144448145, "learning_rate": 4.866856631731889e-08, "logits/chosen": 3.443408250808716, "logits/rejected": 3.4631810188293457, "logps/chosen": -676.0413818359375, "logps/rejected": -657.0355224609375, "loss": 0.4116, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.0986618995666504, "rewards/margins": 1.040489912033081, "rewards/rejected": -2.1391518115997314, "step": 4430 }, { "epoch": 4.82, "grad_norm": 3.795819911330352, "learning_rate": 4.781383730334918e-08, "logits/chosen": 3.4550979137420654, "logits/rejected": 3.521261692047119, "logps/chosen": -607.6665649414062, "logps/rejected": -585.4976806640625, "loss": 0.3977, "rewards/accuracies": 0.84375, "rewards/chosen": -0.978421688079834, "rewards/margins": 1.056796669960022, "rewards/rejected": -2.0352184772491455, "step": 4440 }, { "epoch": 4.83, "grad_norm": 5.2898365415343624, "learning_rate": 4.696588611487517e-08, "logits/chosen": 3.3449108600616455, "logits/rejected": 3.3410325050354004, "logps/chosen": -651.4517822265625, "logps/rejected": -653.3697509765625, "loss": 0.4288, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9503029584884644, "rewards/margins": 1.1928989887237549, "rewards/rejected": -2.143202066421509, "step": 4450 }, { "epoch": 4.84, "grad_norm": 4.994699717782159, "learning_rate": 4.6124741177507835e-08, "logits/chosen": 3.423499584197998, "logits/rejected": 3.3942131996154785, "logps/chosen": -669.455078125, "logps/rejected": -642.244140625, "loss": 0.3964, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8707054257392883, "rewards/margins": 1.2696247100830078, "rewards/rejected": -2.1403300762176514, "step": 4460 }, { "epoch": 4.85, "grad_norm": 4.1727119144294615, "learning_rate": 4.5290430688693983e-08, "logits/chosen": 3.2331364154815674, "logits/rejected": 3.3367247581481934, "logps/chosen": -672.2711181640625, "logps/rejected": -612.0027465820312, "loss": 0.4116, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.1666219234466553, "rewards/margins": 1.0333144664764404, "rewards/rejected": -2.1999363899230957, "step": 4470 }, { "epoch": 4.86, "grad_norm": 4.3586986594794945, "learning_rate": 4.4462982616771534e-08, "logits/chosen": 3.2222061157226562, "logits/rejected": 3.2643065452575684, "logps/chosen": -666.2291259765625, "logps/rejected": -637.8343505859375, "loss": 0.4348, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.0683128833770752, "rewards/margins": 1.2867891788482666, "rewards/rejected": -2.355102062225342, "step": 4480 }, { "epoch": 4.88, "grad_norm": 6.229497687930593, "learning_rate": 4.364242470003154e-08, "logits/chosen": 3.3906490802764893, "logits/rejected": 3.3158926963806152, "logps/chosen": -636.111328125, "logps/rejected": -652.4210815429688, "loss": 0.4149, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8192951083183289, "rewards/margins": 1.3163602352142334, "rewards/rejected": -2.135655164718628, "step": 4490 }, { "epoch": 4.89, "grad_norm": 4.738889591633732, "learning_rate": 4.2828784445788666e-08, "logits/chosen": 3.2304110527038574, "logits/rejected": 3.284945011138916, "logps/chosen": -643.64501953125, "logps/rejected": -607.7906494140625, "loss": 0.4127, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9771002531051636, "rewards/margins": 1.2356865406036377, "rewards/rejected": -2.212786912918091, "step": 4500 }, { "epoch": 4.89, "eval_logits/chosen": 3.4159555435180664, "eval_logits/rejected": 3.4453370571136475, "eval_logps/chosen": -639.3276977539062, "eval_logps/rejected": -624.016845703125, "eval_loss": 0.47491776943206787, "eval_rewards/accuracies": 0.7936508059501648, "eval_rewards/chosen": -0.9599084854125977, "eval_rewards/margins": 1.0395152568817139, "eval_rewards/rejected": -1.9994237422943115, "eval_runtime": 203.3389, "eval_samples_per_second": 9.836, "eval_steps_per_second": 0.31, "step": 4500 }, { "epoch": 4.9, "grad_norm": 4.37751293010457, "learning_rate": 4.2022089129458566e-08, "logits/chosen": 3.273806095123291, "logits/rejected": 3.283414363861084, "logps/chosen": -653.6941528320312, "logps/rejected": -600.9439086914062, "loss": 0.4268, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0693585872650146, "rewards/margins": 1.042353868484497, "rewards/rejected": -2.111712694168091, "step": 4510 }, { "epoch": 4.91, "grad_norm": 3.7680861083504684, "learning_rate": 4.122236579364402e-08, "logits/chosen": 3.2637264728546143, "logits/rejected": 3.3021061420440674, "logps/chosen": -674.3392944335938, "logps/rejected": -635.2799072265625, "loss": 0.4382, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.9136451482772827, "rewards/margins": 1.1711009740829468, "rewards/rejected": -2.0847458839416504, "step": 4520 }, { "epoch": 4.92, "grad_norm": 4.402537573501737, "learning_rate": 4.042964124722834e-08, "logits/chosen": 3.159298896789551, "logits/rejected": 3.158567190170288, "logps/chosen": -611.3229370117188, "logps/rejected": -615.6229248046875, "loss": 0.4264, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0395512580871582, "rewards/margins": 1.0462052822113037, "rewards/rejected": -2.085756778717041, "step": 4530 }, { "epoch": 4.93, "grad_norm": 4.923340820706569, "learning_rate": 3.9643942064476216e-08, "logits/chosen": 3.3168938159942627, "logits/rejected": 3.452329158782959, "logps/chosen": -710.4453125, "logps/rejected": -664.5238647460938, "loss": 0.4232, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.89970862865448, "rewards/margins": 1.0521256923675537, "rewards/rejected": -1.9518343210220337, "step": 4540 }, { "epoch": 4.94, "grad_norm": 3.9167663179010104, "learning_rate": 3.8865294584143506e-08, "logits/chosen": 3.326674222946167, "logits/rejected": 3.3209240436553955, "logps/chosen": -633.3974609375, "logps/rejected": -638.9397583007812, "loss": 0.4162, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.8874519467353821, "rewards/margins": 1.1475107669830322, "rewards/rejected": -2.0349628925323486, "step": 4550 }, { "epoch": 4.95, "grad_norm": 4.708892942686557, "learning_rate": 3.809372490859381e-08, "logits/chosen": 3.2978458404541016, "logits/rejected": 3.2902169227600098, "logps/chosen": -591.3196411132812, "logps/rejected": -613.2948608398438, "loss": 0.4383, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9776461720466614, "rewards/margins": 0.9725853800773621, "rewards/rejected": -1.9502315521240234, "step": 4560 }, { "epoch": 4.96, "grad_norm": 4.1717624088625875, "learning_rate": 3.732925890292377e-08, "logits/chosen": 3.1886661052703857, "logits/rejected": 3.2254981994628906, "logps/chosen": -619.9031372070312, "logps/rejected": -610.3145751953125, "loss": 0.4331, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1668506860733032, "rewards/margins": 1.0246211290359497, "rewards/rejected": -2.191471576690674, "step": 4570 }, { "epoch": 4.97, "grad_norm": 4.518807027117185, "learning_rate": 3.657192219409566e-08, "logits/chosen": 3.3411731719970703, "logits/rejected": 3.373260498046875, "logps/chosen": -656.5994262695312, "logps/rejected": -639.821044921875, "loss": 0.4289, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0331714153289795, "rewards/margins": 0.9044809341430664, "rewards/rejected": -1.937652349472046, "step": 4580 }, { "epoch": 4.98, "grad_norm": 4.606342390009915, "learning_rate": 3.582174017007858e-08, "logits/chosen": 3.55220103263855, "logits/rejected": 3.4020888805389404, "logps/chosen": -676.1447143554688, "logps/rejected": -671.7203369140625, "loss": 0.4335, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.0054256916046143, "rewards/margins": 1.053364634513855, "rewards/rejected": -2.058790683746338, "step": 4590 }, { "epoch": 4.99, "grad_norm": 4.237458177176042, "learning_rate": 3.507873797899735e-08, "logits/chosen": 3.2579739093780518, "logits/rejected": 3.1894021034240723, "logps/chosen": -560.0206909179688, "logps/rejected": -589.6734619140625, "loss": 0.453, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8938484191894531, "rewards/margins": 1.065301537513733, "rewards/rejected": -1.959149956703186, "step": 4600 }, { "epoch": 4.99, "eval_logits/chosen": 3.414238214492798, "eval_logits/rejected": 3.444387674331665, "eval_logps/chosen": -635.6461791992188, "eval_logps/rejected": -619.3519287109375, "eval_loss": 0.4748367667198181, "eval_rewards/accuracies": 0.7916666865348816, "eval_rewards/chosen": -0.9230929613113403, "eval_rewards/margins": 1.029681921005249, "eval_rewards/rejected": -1.9527748823165894, "eval_runtime": 202.7994, "eval_samples_per_second": 9.862, "eval_steps_per_second": 0.311, "step": 4600 }, { "epoch": 5.01, "grad_norm": 4.820324892740955, "learning_rate": 3.434294052828945e-08, "logits/chosen": 3.439835786819458, "logits/rejected": 3.350839138031006, "logps/chosen": -619.0117797851562, "logps/rejected": -672.6756591796875, "loss": 0.4199, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.869764506816864, "rewards/margins": 1.084979772567749, "rewards/rejected": -1.9547443389892578, "step": 4610 }, { "epoch": 5.02, "grad_norm": 3.9650106546389403, "learning_rate": 3.361437248386983e-08, "logits/chosen": 3.4324920177459717, "logits/rejected": 3.456651210784912, "logps/chosen": -668.35888671875, "logps/rejected": -620.5419921875, "loss": 0.3776, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.955123245716095, "rewards/margins": 1.0994551181793213, "rewards/rejected": -2.0545783042907715, "step": 4620 }, { "epoch": 5.03, "grad_norm": 4.70688935810224, "learning_rate": 3.28930582693045e-08, "logits/chosen": 3.2283871173858643, "logits/rejected": 3.264172077178955, "logps/chosen": -729.6647338867188, "logps/rejected": -721.0824584960938, "loss": 0.3998, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.890113353729248, "rewards/margins": 1.3613433837890625, "rewards/rejected": -2.2514567375183105, "step": 4630 }, { "epoch": 5.04, "grad_norm": 5.0103647290306945, "learning_rate": 3.217902206499134e-08, "logits/chosen": 3.302089214324951, "logits/rejected": 3.3854782581329346, "logps/chosen": -672.3339233398438, "logps/rejected": -631.8421630859375, "loss": 0.4473, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8637478947639465, "rewards/margins": 1.0654820203781128, "rewards/rejected": -1.929229974746704, "step": 4640 }, { "epoch": 5.05, "grad_norm": 5.581096231339241, "learning_rate": 3.1472287807349865e-08, "logits/chosen": 3.2928035259246826, "logits/rejected": 3.206064224243164, "logps/chosen": -633.5398559570312, "logps/rejected": -630.5841064453125, "loss": 0.4163, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0908564329147339, "rewards/margins": 1.0193157196044922, "rewards/rejected": -2.1101725101470947, "step": 4650 }, { "epoch": 5.06, "grad_norm": 3.74186827957767, "learning_rate": 3.077287918801841e-08, "logits/chosen": 3.3158020973205566, "logits/rejected": 3.3574090003967285, "logps/chosen": -671.4949340820312, "logps/rejected": -608.8038330078125, "loss": 0.3968, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8246719241142273, "rewards/margins": 1.2937284708023071, "rewards/rejected": -2.1184000968933105, "step": 4660 }, { "epoch": 5.07, "grad_norm": 4.4512392019613545, "learning_rate": 3.0080819653060366e-08, "logits/chosen": 3.2338714599609375, "logits/rejected": 3.3237907886505127, "logps/chosen": -578.6688232421875, "logps/rejected": -599.2220458984375, "loss": 0.4339, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9173551797866821, "rewards/margins": 1.14029860496521, "rewards/rejected": -2.0576539039611816, "step": 4670 }, { "epoch": 5.08, "grad_norm": 4.469588696422973, "learning_rate": 2.9396132402177947e-08, "logits/chosen": 3.3419361114501953, "logits/rejected": 3.3482131958007812, "logps/chosen": -604.2589721679688, "logps/rejected": -582.9603271484375, "loss": 0.3904, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9833993911743164, "rewards/margins": 1.125213861465454, "rewards/rejected": -2.1086132526397705, "step": 4680 }, { "epoch": 5.09, "grad_norm": 4.661317641177597, "learning_rate": 2.871884038793443e-08, "logits/chosen": 3.3045921325683594, "logits/rejected": 3.301054000854492, "logps/chosen": -613.4208374023438, "logps/rejected": -600.6182250976562, "loss": 0.4014, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0003347396850586, "rewards/margins": 1.080404281616211, "rewards/rejected": -2.0807390213012695, "step": 4690 }, { "epoch": 5.1, "grad_norm": 4.278752725947728, "learning_rate": 2.804896631498488e-08, "logits/chosen": 3.3149044513702393, "logits/rejected": 3.3393523693084717, "logps/chosen": -632.0557861328125, "logps/rejected": -626.2003173828125, "loss": 0.4035, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.073968529701233, "rewards/margins": 1.1869663000106812, "rewards/rejected": -2.260934829711914, "step": 4700 }, { "epoch": 5.1, "eval_logits/chosen": 3.401919364929199, "eval_logits/rejected": 3.429266929626465, "eval_logps/chosen": -638.950439453125, "eval_logps/rejected": -623.7211303710938, "eval_loss": 0.47538915276527405, "eval_rewards/accuracies": 0.7896825671195984, "eval_rewards/chosen": -0.9561359286308289, "eval_rewards/margins": 1.0403298139572144, "eval_rewards/rejected": -1.9964655637741089, "eval_runtime": 203.0268, "eval_samples_per_second": 9.851, "eval_steps_per_second": 0.31, "step": 4700 }, { "epoch": 5.11, "grad_norm": 4.581330110290142, "learning_rate": 2.738653263931495e-08, "logits/chosen": 3.0435214042663574, "logits/rejected": 3.1650655269622803, "logps/chosen": -640.2958374023438, "logps/rejected": -609.2405395507812, "loss": 0.4067, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.0501011610031128, "rewards/margins": 1.3587573766708374, "rewards/rejected": -2.4088587760925293, "step": 4710 }, { "epoch": 5.12, "grad_norm": 4.244257969835405, "learning_rate": 2.6731561567488235e-08, "logits/chosen": 3.550856351852417, "logits/rejected": 3.5885555744171143, "logps/chosen": -620.6921997070312, "logps/rejected": -627.4866333007812, "loss": 0.4089, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0588308572769165, "rewards/margins": 1.1712490320205688, "rewards/rejected": -2.2300798892974854, "step": 4720 }, { "epoch": 5.14, "grad_norm": 4.661841778899971, "learning_rate": 2.6084075055901463e-08, "logits/chosen": 3.4233341217041016, "logits/rejected": 3.5295162200927734, "logps/chosen": -715.7772216796875, "logps/rejected": -646.1692504882812, "loss": 0.4086, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.7983964681625366, "rewards/margins": 1.1571332216262817, "rewards/rejected": -1.9555295705795288, "step": 4730 }, { "epoch": 5.15, "grad_norm": 4.6671555450162625, "learning_rate": 2.5444094810048888e-08, "logits/chosen": 3.316159725189209, "logits/rejected": 3.356541872024536, "logps/chosen": -733.1777954101562, "logps/rejected": -655.0715942382812, "loss": 0.4133, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9959171414375305, "rewards/margins": 1.1742465496063232, "rewards/rejected": -2.170163869857788, "step": 4740 }, { "epoch": 5.16, "grad_norm": 3.9437881445314713, "learning_rate": 2.4811642283794537e-08, "logits/chosen": 3.3583106994628906, "logits/rejected": 3.3618130683898926, "logps/chosen": -622.913330078125, "logps/rejected": -617.5716552734375, "loss": 0.3909, "rewards/accuracies": 0.78125, "rewards/chosen": -1.233416199684143, "rewards/margins": 1.1455744504928589, "rewards/rejected": -2.378990650177002, "step": 4750 }, { "epoch": 5.17, "grad_norm": 4.852586309277658, "learning_rate": 2.4186738678652786e-08, "logits/chosen": 3.3076260089874268, "logits/rejected": 3.292635440826416, "logps/chosen": -640.589111328125, "logps/rejected": -656.1749877929688, "loss": 0.4355, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9329697489738464, "rewards/margins": 1.0732852220535278, "rewards/rejected": -2.0062549114227295, "step": 4760 }, { "epoch": 5.18, "grad_norm": 4.243111776045957, "learning_rate": 2.356940494307799e-08, "logits/chosen": 3.3132290840148926, "logits/rejected": 3.3167660236358643, "logps/chosen": -589.9036254882812, "logps/rejected": -592.9352416992188, "loss": 0.3896, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.916020393371582, "rewards/margins": 1.243582844734192, "rewards/rejected": -2.1596033573150635, "step": 4770 }, { "epoch": 5.19, "grad_norm": 4.446088956070295, "learning_rate": 2.295966177176198e-08, "logits/chosen": 3.243967056274414, "logits/rejected": 3.2699360847473145, "logps/chosen": -604.6546630859375, "logps/rejected": -590.2599487304688, "loss": 0.4304, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.808563232421875, "rewards/margins": 1.1195275783538818, "rewards/rejected": -1.928091049194336, "step": 4780 }, { "epoch": 5.2, "grad_norm": 4.64754800625305, "learning_rate": 2.2357529604940445e-08, "logits/chosen": 3.391747236251831, "logits/rejected": 3.3919975757598877, "logps/chosen": -651.8663940429688, "logps/rejected": -638.8233642578125, "loss": 0.4051, "rewards/accuracies": 0.84375, "rewards/chosen": -1.012085199356079, "rewards/margins": 1.2385754585266113, "rewards/rejected": -2.2506606578826904, "step": 4790 }, { "epoch": 5.21, "grad_norm": 5.137836752532852, "learning_rate": 2.1763028627707596e-08, "logits/chosen": 3.2514262199401855, "logits/rejected": 3.2740256786346436, "logps/chosen": -732.9015502929688, "logps/rejected": -692.7354125976562, "loss": 0.4225, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9560949206352234, "rewards/margins": 1.128583312034607, "rewards/rejected": -2.0846781730651855, "step": 4800 }, { "epoch": 5.21, "eval_logits/chosen": 3.4077415466308594, "eval_logits/rejected": 3.4359259605407715, "eval_logps/chosen": -638.046142578125, "eval_logps/rejected": -622.6226196289062, "eval_loss": 0.4752858281135559, "eval_rewards/accuracies": 0.7876983880996704, "eval_rewards/chosen": -0.947092592716217, "eval_rewards/margins": 1.0383890867233276, "eval_rewards/rejected": -1.9854816198349, "eval_runtime": 203.1953, "eval_samples_per_second": 9.843, "eval_steps_per_second": 0.31, "step": 4800 }, { "epoch": 5.22, "grad_norm": 3.816159955183857, "learning_rate": 2.1176178769339635e-08, "logits/chosen": 3.4070944786071777, "logits/rejected": 3.5404582023620605, "logps/chosen": -644.9030151367188, "logps/rejected": -614.4904174804688, "loss": 0.408, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.9099332690238953, "rewards/margins": 1.2075811624526978, "rewards/rejected": -2.1175143718719482, "step": 4810 }, { "epoch": 5.23, "grad_norm": 3.9284751768775665, "learning_rate": 2.059699970262671e-08, "logits/chosen": 3.6015961170196533, "logits/rejected": 3.378958225250244, "logps/chosen": -631.582763671875, "logps/rejected": -653.2015991210938, "loss": 0.4156, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8876656293869019, "rewards/margins": 1.1182324886322021, "rewards/rejected": -2.0058982372283936, "step": 4820 }, { "epoch": 5.24, "grad_norm": 5.252330936247836, "learning_rate": 2.0025510843213132e-08, "logits/chosen": 3.382859706878662, "logits/rejected": 3.3588058948516846, "logps/chosen": -595.603515625, "logps/rejected": -621.556640625, "loss": 0.4164, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.1256474256515503, "rewards/margins": 0.9402295351028442, "rewards/rejected": -2.0658769607543945, "step": 4830 }, { "epoch": 5.26, "grad_norm": 4.475785672412417, "learning_rate": 1.946173134894691e-08, "logits/chosen": 3.3111343383789062, "logits/rejected": 3.285496473312378, "logps/chosen": -645.1478881835938, "logps/rejected": -625.9896240234375, "loss": 0.4121, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9211744070053101, "rewards/margins": 1.1798975467681885, "rewards/rejected": -2.101071834564209, "step": 4840 }, { "epoch": 5.27, "grad_norm": 4.0620936735328135, "learning_rate": 1.8905680119237292e-08, "logits/chosen": 3.2482478618621826, "logits/rejected": 3.231915235519409, "logps/chosen": -581.0772705078125, "logps/rejected": -619.1720581054688, "loss": 0.4024, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.9992374181747437, "rewards/margins": 1.2866462469100952, "rewards/rejected": -2.285883665084839, "step": 4850 }, { "epoch": 5.28, "grad_norm": 3.6728399438555686, "learning_rate": 1.835737579442126e-08, "logits/chosen": 3.308389186859131, "logits/rejected": 3.2529404163360596, "logps/chosen": -613.5067138671875, "logps/rejected": -592.02685546875, "loss": 0.427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0827772617340088, "rewards/margins": 1.0043256282806396, "rewards/rejected": -2.0871028900146484, "step": 4860 }, { "epoch": 5.29, "grad_norm": 3.7106435347709823, "learning_rate": 1.7816836755138535e-08, "logits/chosen": 3.2648491859436035, "logits/rejected": 3.3202967643737793, "logps/chosen": -621.5859375, "logps/rejected": -588.76953125, "loss": 0.4198, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0197430849075317, "rewards/margins": 1.1170375347137451, "rewards/rejected": -2.1367805004119873, "step": 4870 }, { "epoch": 5.3, "grad_norm": 4.8771679617057835, "learning_rate": 1.72840811217157e-08, "logits/chosen": 3.3346686363220215, "logits/rejected": 3.3177542686462402, "logps/chosen": -655.8094482421875, "logps/rejected": -650.5350341796875, "loss": 0.4209, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0607974529266357, "rewards/margins": 1.1251922845840454, "rewards/rejected": -2.1859896183013916, "step": 4880 }, { "epoch": 5.31, "grad_norm": 4.407800753858192, "learning_rate": 1.6759126753558506e-08, "logits/chosen": 3.4082882404327393, "logits/rejected": 3.2606024742126465, "logps/chosen": -608.5733642578125, "logps/rejected": -660.5091552734375, "loss": 0.3973, "rewards/accuracies": 0.875, "rewards/chosen": -0.8485631942749023, "rewards/margins": 1.4242725372314453, "rewards/rejected": -2.2728357315063477, "step": 4890 }, { "epoch": 5.32, "grad_norm": 5.549196075308517, "learning_rate": 1.6241991248553217e-08, "logits/chosen": 3.202904462814331, "logits/rejected": 3.173356533050537, "logps/chosen": -636.3018798828125, "logps/rejected": -643.2816162109375, "loss": 0.3941, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.9654437899589539, "rewards/margins": 1.382318377494812, "rewards/rejected": -2.3477623462677, "step": 4900 }, { "epoch": 5.32, "eval_logits/chosen": 3.4012038707733154, "eval_logits/rejected": 3.4281721115112305, "eval_logps/chosen": -639.1229858398438, "eval_logps/rejected": -623.8593139648438, "eval_loss": 0.47542309761047363, "eval_rewards/accuracies": 0.7896825671195984, "eval_rewards/chosen": -0.957861602306366, "eval_rewards/margins": 1.0399866104125977, "eval_rewards/rejected": -1.9978482723236084, "eval_runtime": 202.6421, "eval_samples_per_second": 9.87, "eval_steps_per_second": 0.311, "step": 4900 }, { "epoch": 5.33, "grad_norm": 3.8445878799427184, "learning_rate": 1.5732691942476673e-08, "logits/chosen": 3.3608036041259766, "logits/rejected": 3.2699153423309326, "logps/chosen": -624.7760009765625, "logps/rejected": -650.7325439453125, "loss": 0.3724, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.889731764793396, "rewards/margins": 1.1686939001083374, "rewards/rejected": -2.0584254264831543, "step": 4910 }, { "epoch": 5.34, "grad_norm": 3.9165052944586756, "learning_rate": 1.5231245908415348e-08, "logits/chosen": 3.2369396686553955, "logits/rejected": 3.2419559955596924, "logps/chosen": -619.8662719726562, "logps/rejected": -603.00439453125, "loss": 0.4447, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1576344966888428, "rewards/margins": 1.0836381912231445, "rewards/rejected": -2.2412729263305664, "step": 4920 }, { "epoch": 5.35, "grad_norm": 5.056300731125499, "learning_rate": 1.4737669956192745e-08, "logits/chosen": 3.3968377113342285, "logits/rejected": 3.513598918914795, "logps/chosen": -640.3262329101562, "logps/rejected": -651.488037109375, "loss": 0.42, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8613258600234985, "rewards/margins": 1.3728233575820923, "rewards/rejected": -2.234149217605591, "step": 4930 }, { "epoch": 5.36, "grad_norm": 5.087084081341506, "learning_rate": 1.425198063180602e-08, "logits/chosen": 3.325380802154541, "logits/rejected": 3.2402496337890625, "logps/chosen": -595.4832763671875, "logps/rejected": -580.9246215820312, "loss": 0.4394, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.8831828832626343, "rewards/margins": 1.0617479085922241, "rewards/rejected": -1.9449306726455688, "step": 4940 }, { "epoch": 5.37, "grad_norm": 5.132526160430227, "learning_rate": 1.377419421687126e-08, "logits/chosen": 3.284151554107666, "logits/rejected": 3.3109688758850098, "logps/chosen": -661.6206665039062, "logps/rejected": -648.8699951171875, "loss": 0.4299, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0141069889068604, "rewards/margins": 1.0557873249053955, "rewards/rejected": -2.069894313812256, "step": 4950 }, { "epoch": 5.39, "grad_norm": 5.215006545274992, "learning_rate": 1.3304326728077797e-08, "logits/chosen": 3.396632671356201, "logits/rejected": 3.3906586170196533, "logps/chosen": -671.5709228515625, "logps/rejected": -654.5450439453125, "loss": 0.4483, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.8753288388252258, "rewards/margins": 1.0657485723495483, "rewards/rejected": -1.941077470779419, "step": 4960 }, { "epoch": 5.4, "grad_norm": 3.6777621619640075, "learning_rate": 1.284239391665115e-08, "logits/chosen": 3.4247264862060547, "logits/rejected": 3.4094860553741455, "logps/chosen": -633.9702758789062, "logps/rejected": -615.48974609375, "loss": 0.3872, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.8330886960029602, "rewards/margins": 1.09920334815979, "rewards/rejected": -1.9322922229766846, "step": 4970 }, { "epoch": 5.41, "grad_norm": 4.692737717148633, "learning_rate": 1.2388411267825e-08, "logits/chosen": 3.251021146774292, "logits/rejected": 3.275005340576172, "logps/chosen": -623.6749267578125, "logps/rejected": -604.2633056640625, "loss": 0.4187, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.025691032409668, "rewards/margins": 1.01606285572052, "rewards/rejected": -2.0417537689208984, "step": 4980 }, { "epoch": 5.42, "grad_norm": 4.3474023538090885, "learning_rate": 1.1942394000322209e-08, "logits/chosen": 3.247284412384033, "logits/rejected": 3.3002495765686035, "logps/chosen": -631.767333984375, "logps/rejected": -658.4265747070312, "loss": 0.4207, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9438480138778687, "rewards/margins": 1.0788995027542114, "rewards/rejected": -2.022747755050659, "step": 4990 }, { "epoch": 5.43, "grad_norm": 4.410015757828728, "learning_rate": 1.1504357065844572e-08, "logits/chosen": 3.3900043964385986, "logits/rejected": 3.348998546600342, "logps/chosen": -643.7843017578125, "logps/rejected": -624.371826171875, "loss": 0.4093, "rewards/accuracies": 0.875, "rewards/chosen": -0.9818245768547058, "rewards/margins": 1.2492727041244507, "rewards/rejected": -2.231097459793091, "step": 5000 }, { "epoch": 5.43, "eval_logits/chosen": 3.4052021503448486, "eval_logits/rejected": 3.43179988861084, "eval_logps/chosen": -634.6866455078125, "eval_logps/rejected": -618.552978515625, "eval_loss": 0.4747713804244995, "eval_rewards/accuracies": 0.7936508059501648, "eval_rewards/chosen": -0.913497269153595, "eval_rewards/margins": 1.0312875509262085, "eval_rewards/rejected": -1.9447849988937378, "eval_runtime": 203.57, "eval_samples_per_second": 9.825, "eval_steps_per_second": 0.309, "step": 5000 }, { "epoch": 5.44, "grad_norm": 4.4119058794666, "learning_rate": 1.1074315148571544e-08, "logits/chosen": 3.1892237663269043, "logits/rejected": 3.2578811645507812, "logps/chosen": -572.1991577148438, "logps/rejected": -589.6619873046875, "loss": 0.397, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8902987241744995, "rewards/margins": 1.1038495302200317, "rewards/rejected": -1.9941484928131104, "step": 5010 }, { "epoch": 5.45, "grad_norm": 4.2989341794003595, "learning_rate": 1.0652282664668083e-08, "logits/chosen": 3.5411269664764404, "logits/rejected": 3.5817456245422363, "logps/chosen": -635.8283081054688, "logps/rejected": -612.7503662109375, "loss": 0.3968, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.8479970097541809, "rewards/margins": 1.1505476236343384, "rewards/rejected": -1.998544692993164, "step": 5020 }, { "epoch": 5.46, "grad_norm": 4.459380676676112, "learning_rate": 1.0238273761801335e-08, "logits/chosen": 3.439701795578003, "logits/rejected": 3.438943862915039, "logps/chosen": -617.8238525390625, "logps/rejected": -602.0964965820312, "loss": 0.4133, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9042927622795105, "rewards/margins": 1.271222472190857, "rewards/rejected": -2.1755154132843018, "step": 5030 }, { "epoch": 5.47, "grad_norm": 4.9493254412504015, "learning_rate": 9.832302318666358e-09, "logits/chosen": 3.413435459136963, "logits/rejected": 3.3809356689453125, "logps/chosen": -650.2511596679688, "logps/rejected": -673.4157104492188, "loss": 0.421, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9549147486686707, "rewards/margins": 1.304351568222046, "rewards/rejected": -2.2592661380767822, "step": 5040 }, { "epoch": 5.48, "grad_norm": 4.422796240560526, "learning_rate": 9.434381944520842e-09, "logits/chosen": 3.1826424598693848, "logits/rejected": 3.2024574279785156, "logps/chosen": -614.8028564453125, "logps/rejected": -593.225341796875, "loss": 0.4338, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0104892253875732, "rewards/margins": 1.1413373947143555, "rewards/rejected": -2.1518266201019287, "step": 5050 }, { "epoch": 5.49, "grad_norm": 4.878774841516686, "learning_rate": 9.044525978729011e-09, "logits/chosen": 3.4412167072296143, "logits/rejected": 3.4535269737243652, "logps/chosen": -593.5527954101562, "logps/rejected": -622.8233642578125, "loss": 0.3978, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9767769575119019, "rewards/margins": 1.0934401750564575, "rewards/rejected": -2.0702171325683594, "step": 5060 }, { "epoch": 5.5, "grad_norm": 5.416226405990332, "learning_rate": 8.662747490314342e-09, "logits/chosen": 3.3210251331329346, "logits/rejected": 3.4503540992736816, "logps/chosen": -612.9114990234375, "logps/rejected": -611.572998046875, "loss": 0.4238, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8853181600570679, "rewards/margins": 1.1313741207122803, "rewards/rejected": -2.0166923999786377, "step": 5070 }, { "epoch": 5.52, "grad_norm": 5.385179795149814, "learning_rate": 8.289059277521466e-09, "logits/chosen": 3.242987871170044, "logits/rejected": 3.254490613937378, "logps/chosen": -655.1529541015625, "logps/rejected": -627.9263916015625, "loss": 0.4181, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9421852231025696, "rewards/margins": 1.1894053220748901, "rewards/rejected": -2.1315903663635254, "step": 5080 }, { "epoch": 5.53, "grad_norm": 3.8870803841539363, "learning_rate": 7.92347386738712e-09, "logits/chosen": 3.2488110065460205, "logits/rejected": 3.3222861289978027, "logps/chosen": -662.3355102539062, "logps/rejected": -615.2122802734375, "loss": 0.4185, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9915229082107544, "rewards/margins": 0.9766005277633667, "rewards/rejected": -1.9681236743927002, "step": 5090 }, { "epoch": 5.54, "grad_norm": 11.508896027445495, "learning_rate": 7.566003515320302e-09, "logits/chosen": 3.4032371044158936, "logits/rejected": 3.3691341876983643, "logps/chosen": -664.739990234375, "logps/rejected": -707.6671142578125, "loss": 0.3902, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.9979314804077148, "rewards/margins": 1.2934248447418213, "rewards/rejected": -2.291356086730957, "step": 5100 }, { "epoch": 5.54, "eval_logits/chosen": 3.401404857635498, "eval_logits/rejected": 3.4280807971954346, "eval_logps/chosen": -637.9056396484375, "eval_logps/rejected": -622.2273559570312, "eval_loss": 0.4754364490509033, "eval_rewards/accuracies": 0.795634925365448, "eval_rewards/chosen": -0.9456868171691895, "eval_rewards/margins": 1.035841941833496, "eval_rewards/rejected": -1.981528878211975, "eval_runtime": 203.0612, "eval_samples_per_second": 9.849, "eval_steps_per_second": 0.31, "step": 5100 }, { "epoch": 5.55, "grad_norm": 3.9243176229170555, "learning_rate": 7.216660204691416e-09, "logits/chosen": 3.2955546379089355, "logits/rejected": 3.2030303478240967, "logps/chosen": -619.5804443359375, "logps/rejected": -612.2360229492188, "loss": 0.4362, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.972115695476532, "rewards/margins": 1.0014656782150269, "rewards/rejected": -1.973581314086914, "step": 5110 }, { "epoch": 5.56, "grad_norm": 4.090871284685779, "learning_rate": 6.875455646430356e-09, "logits/chosen": 3.300230026245117, "logits/rejected": 3.301499843597412, "logps/chosen": -650.2235717773438, "logps/rejected": -623.32666015625, "loss": 0.401, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9880973696708679, "rewards/margins": 1.0679004192352295, "rewards/rejected": -2.055997848510742, "step": 5120 }, { "epoch": 5.57, "grad_norm": 4.390689543358374, "learning_rate": 6.542401278634258e-09, "logits/chosen": 3.258399248123169, "logits/rejected": 3.2140533924102783, "logps/chosen": -591.4810180664062, "logps/rejected": -590.0782470703125, "loss": 0.4106, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0686167478561401, "rewards/margins": 0.9639756083488464, "rewards/rejected": -2.032592296600342, "step": 5130 }, { "epoch": 5.58, "grad_norm": 5.840226359389912, "learning_rate": 6.217508266183807e-09, "logits/chosen": 3.26456880569458, "logits/rejected": 3.2853972911834717, "logps/chosen": -637.7308349609375, "logps/rejected": -636.2374267578125, "loss": 0.4236, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.0634821653366089, "rewards/margins": 1.077043890953064, "rewards/rejected": -2.140526056289673, "step": 5140 }, { "epoch": 5.59, "grad_norm": 4.482036145529995, "learning_rate": 5.90078750036907e-09, "logits/chosen": 3.2838358879089355, "logits/rejected": 3.3020870685577393, "logps/chosen": -621.7236328125, "logps/rejected": -611.62939453125, "loss": 0.4298, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9451899528503418, "rewards/margins": 1.2082992792129517, "rewards/rejected": -2.153489112854004, "step": 5150 }, { "epoch": 5.6, "grad_norm": 5.640361865517916, "learning_rate": 5.592249598524307e-09, "logits/chosen": 3.2278237342834473, "logits/rejected": 3.272254228591919, "logps/chosen": -655.6765747070312, "logps/rejected": -622.3482055664062, "loss": 0.4081, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9868273735046387, "rewards/margins": 0.9926432371139526, "rewards/rejected": -1.9794708490371704, "step": 5160 }, { "epoch": 5.61, "grad_norm": 4.4146771606215705, "learning_rate": 5.291904903672179e-09, "logits/chosen": 3.2950425148010254, "logits/rejected": 3.319478988647461, "logps/chosen": -650.0654907226562, "logps/rejected": -599.9808349609375, "loss": 0.4234, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9846261739730835, "rewards/margins": 1.0575975179672241, "rewards/rejected": -2.0422236919403076, "step": 5170 }, { "epoch": 5.62, "grad_norm": 4.054755418416655, "learning_rate": 4.999763484176966e-09, "logits/chosen": 3.3758158683776855, "logits/rejected": 3.3496575355529785, "logps/chosen": -616.5472412109375, "logps/rejected": -596.1931762695312, "loss": 0.383, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8751519322395325, "rewards/margins": 1.1981254816055298, "rewards/rejected": -2.073277473449707, "step": 5180 }, { "epoch": 5.64, "grad_norm": 4.483028057554726, "learning_rate": 4.715835133406926e-09, "logits/chosen": 3.383812427520752, "logits/rejected": 3.3428051471710205, "logps/chosen": -606.5137939453125, "logps/rejected": -649.4478759765625, "loss": 0.4252, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9688395261764526, "rewards/margins": 0.9940358996391296, "rewards/rejected": -1.9628756046295166, "step": 5190 }, { "epoch": 5.65, "grad_norm": 4.83914530022394, "learning_rate": 4.440129369406215e-09, "logits/chosen": 3.3034191131591797, "logits/rejected": 3.2730090618133545, "logps/chosen": -618.3013916015625, "logps/rejected": -642.0859375, "loss": 0.3795, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.8667186498641968, "rewards/margins": 1.3520619869232178, "rewards/rejected": -2.218780755996704, "step": 5200 }, { "epoch": 5.65, "eval_logits/chosen": 3.3988428115844727, "eval_logits/rejected": 3.425305128097534, "eval_logps/chosen": -638.17236328125, "eval_logps/rejected": -622.5895385742188, "eval_loss": 0.47534114122390747, "eval_rewards/accuracies": 0.7896825671195984, "eval_rewards/chosen": -0.9483546018600464, "eval_rewards/margins": 1.0367964506149292, "eval_rewards/rejected": -1.9851511716842651, "eval_runtime": 202.6985, "eval_samples_per_second": 9.867, "eval_steps_per_second": 0.311, "step": 5200 }, { "epoch": 5.66, "grad_norm": 4.001249737907504, "learning_rate": 4.1726554345756505e-09, "logits/chosen": 3.338737964630127, "logits/rejected": 3.3700027465820312, "logps/chosen": -684.7225341796875, "logps/rejected": -670.1957397460938, "loss": 0.426, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8920892477035522, "rewards/margins": 1.1326582431793213, "rewards/rejected": -2.024747133255005, "step": 5210 }, { "epoch": 5.67, "grad_norm": 4.683512865364909, "learning_rate": 3.913422295362928e-09, "logits/chosen": 3.2599501609802246, "logits/rejected": 3.282784938812256, "logps/chosen": -633.5653076171875, "logps/rejected": -591.6630859375, "loss": 0.3939, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.9538997411727905, "rewards/margins": 1.2915997505187988, "rewards/rejected": -2.2454991340637207, "step": 5220 }, { "epoch": 5.68, "grad_norm": 4.409498023734784, "learning_rate": 3.6624386419620846e-09, "logits/chosen": 3.321446657180786, "logits/rejected": 3.3035056591033936, "logps/chosen": -581.6798706054688, "logps/rejected": -596.5265502929688, "loss": 0.4364, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9379063844680786, "rewards/margins": 1.1741756200790405, "rewards/rejected": -2.112082004547119, "step": 5230 }, { "epoch": 5.69, "grad_norm": 4.111871972409331, "learning_rate": 3.4197128880220637e-09, "logits/chosen": 3.383150577545166, "logits/rejected": 3.3710696697235107, "logps/chosen": -663.54150390625, "logps/rejected": -652.0343627929688, "loss": 0.4036, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9313344955444336, "rewards/margins": 1.1616183519363403, "rewards/rejected": -2.0929527282714844, "step": 5240 }, { "epoch": 5.7, "grad_norm": 4.876890172442885, "learning_rate": 3.1852531703648323e-09, "logits/chosen": 3.3027126789093018, "logits/rejected": 3.3122799396514893, "logps/chosen": -612.0169677734375, "logps/rejected": -627.9977416992188, "loss": 0.4266, "rewards/accuracies": 0.75, "rewards/chosen": -1.095849871635437, "rewards/margins": 1.2259938716888428, "rewards/rejected": -2.3218436241149902, "step": 5250 }, { "epoch": 5.71, "grad_norm": 4.734855558953547, "learning_rate": 2.959067348712513e-09, "logits/chosen": 3.279170513153076, "logits/rejected": 3.231475353240967, "logps/chosen": -681.5413818359375, "logps/rejected": -652.7421264648438, "loss": 0.4143, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.8554766774177551, "rewards/margins": 1.369996428489685, "rewards/rejected": -2.225472927093506, "step": 5260 }, { "epoch": 5.72, "grad_norm": 5.193642170778163, "learning_rate": 2.7411630054238466e-09, "logits/chosen": 3.3132317066192627, "logits/rejected": 3.260838031768799, "logps/chosen": -661.933837890625, "logps/rejected": -686.833251953125, "loss": 0.4191, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.8667977452278137, "rewards/margins": 1.304004430770874, "rewards/rejected": -2.170802354812622, "step": 5270 }, { "epoch": 5.73, "grad_norm": 3.6470268383830295, "learning_rate": 2.531547445240201e-09, "logits/chosen": 3.302412748336792, "logits/rejected": 3.3322739601135254, "logps/chosen": -643.552490234375, "logps/rejected": -647.7770385742188, "loss": 0.4123, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9946368932723999, "rewards/margins": 1.1000826358795166, "rewards/rejected": -2.094719648361206, "step": 5280 }, { "epoch": 5.74, "grad_norm": 4.645085263982338, "learning_rate": 2.3302276950404875e-09, "logits/chosen": 3.258387804031372, "logits/rejected": 3.3486316204071045, "logps/chosen": -694.1226806640625, "logps/rejected": -622.4017944335938, "loss": 0.3881, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.7868496179580688, "rewards/margins": 1.3848803043365479, "rewards/rejected": -2.171729803085327, "step": 5290 }, { "epoch": 5.75, "grad_norm": 4.645244283472372, "learning_rate": 2.1372105036056853e-09, "logits/chosen": 3.4053711891174316, "logits/rejected": 3.450652599334717, "logps/chosen": -641.9345092773438, "logps/rejected": -635.5320434570312, "loss": 0.3915, "rewards/accuracies": 0.75, "rewards/chosen": -1.0863783359527588, "rewards/margins": 1.034938097000122, "rewards/rejected": -2.121316432952881, "step": 5300 }, { "epoch": 5.75, "eval_logits/chosen": 3.397886037826538, "eval_logits/rejected": 3.424150228500366, "eval_logps/chosen": -639.0426635742188, "eval_logps/rejected": -623.64501953125, "eval_loss": 0.47541338205337524, "eval_rewards/accuracies": 0.795634925365448, "eval_rewards/chosen": -0.957057774066925, "eval_rewards/margins": 1.038646936416626, "eval_rewards/rejected": -1.9957046508789062, "eval_runtime": 203.1189, "eval_samples_per_second": 9.846, "eval_steps_per_second": 0.31, "step": 5300 }, { "epoch": 5.77, "grad_norm": 4.0104722995112825, "learning_rate": 1.9525023413926865e-09, "logits/chosen": 3.277747392654419, "logits/rejected": 3.2706780433654785, "logps/chosen": -615.3922119140625, "logps/rejected": -617.8639526367188, "loss": 0.388, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9219114184379578, "rewards/margins": 1.417864203453064, "rewards/rejected": -2.339775562286377, "step": 5310 }, { "epoch": 5.78, "grad_norm": 6.182466662472027, "learning_rate": 1.7761094003172217e-09, "logits/chosen": 3.153205394744873, "logits/rejected": 3.126901388168335, "logps/chosen": -612.4298095703125, "logps/rejected": -585.1417846679688, "loss": 0.439, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0855505466461182, "rewards/margins": 1.1274497509002686, "rewards/rejected": -2.2130002975463867, "step": 5320 }, { "epoch": 5.79, "grad_norm": 3.890323205103855, "learning_rate": 1.608037593546524e-09, "logits/chosen": 3.424553632736206, "logits/rejected": 3.4162402153015137, "logps/chosen": -617.7884521484375, "logps/rejected": -594.5033569335938, "loss": 0.4127, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0019267797470093, "rewards/margins": 1.031286597251892, "rewards/rejected": -2.0332133769989014, "step": 5330 }, { "epoch": 5.8, "grad_norm": 4.193059525281177, "learning_rate": 1.4482925553007675e-09, "logits/chosen": 3.4022529125213623, "logits/rejected": 3.352123975753784, "logps/chosen": -618.6912231445312, "logps/rejected": -686.6834106445312, "loss": 0.4067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8951805233955383, "rewards/margins": 1.1654999256134033, "rewards/rejected": -2.060680627822876, "step": 5340 }, { "epoch": 5.81, "grad_norm": 3.814761111530502, "learning_rate": 1.2968796406645222e-09, "logits/chosen": 3.3192951679229736, "logits/rejected": 3.4117462635040283, "logps/chosen": -636.7406005859375, "logps/rejected": -570.1781005859375, "loss": 0.4208, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9999561309814453, "rewards/margins": 0.9623527526855469, "rewards/rejected": -1.9623088836669922, "step": 5350 }, { "epoch": 5.82, "grad_norm": 4.427513499352976, "learning_rate": 1.1538039254070931e-09, "logits/chosen": 3.3473079204559326, "logits/rejected": 3.3865928649902344, "logps/chosen": -665.1278076171875, "logps/rejected": -651.0011596679688, "loss": 0.4232, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0296939611434937, "rewards/margins": 1.1393468379974365, "rewards/rejected": -2.1690409183502197, "step": 5360 }, { "epoch": 5.83, "grad_norm": 4.009969797074028, "learning_rate": 1.0190702058123224e-09, "logits/chosen": 3.324174404144287, "logits/rejected": 3.387791156768799, "logps/chosen": -624.931640625, "logps/rejected": -565.3156127929688, "loss": 0.384, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8337820172309875, "rewards/margins": 1.1643704175949097, "rewards/rejected": -1.9981523752212524, "step": 5370 }, { "epoch": 5.84, "grad_norm": 4.291144155209391, "learning_rate": 8.926829985178852e-10, "logits/chosen": 3.262115001678467, "logits/rejected": 3.3393502235412598, "logps/chosen": -615.2139282226562, "logps/rejected": -603.3374633789062, "loss": 0.4397, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.9387454986572266, "rewards/margins": 1.1294466257095337, "rewards/rejected": -2.06819224357605, "step": 5380 }, { "epoch": 5.85, "grad_norm": 3.5725693541645502, "learning_rate": 7.746465403638824e-10, "logits/chosen": 3.344813585281372, "logits/rejected": 3.3891005516052246, "logps/chosen": -600.55126953125, "logps/rejected": -580.0817260742188, "loss": 0.4058, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9538615942001343, "rewards/margins": 1.2148098945617676, "rewards/rejected": -2.1686716079711914, "step": 5390 }, { "epoch": 5.86, "grad_norm": 4.234295721747548, "learning_rate": 6.649647882507048e-10, "logits/chosen": 3.3508033752441406, "logits/rejected": 3.3046011924743652, "logps/chosen": -684.0999145507812, "logps/rejected": -679.2061767578125, "loss": 0.4075, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.9212953448295593, "rewards/margins": 1.2048547267913818, "rewards/rejected": -2.126150131225586, "step": 5400 }, { "epoch": 5.86, "eval_logits/chosen": 3.3962159156799316, "eval_logits/rejected": 3.422123670578003, "eval_logps/chosen": -638.9973754882812, "eval_logps/rejected": -623.5674438476562, "eval_loss": 0.4755534529685974, "eval_rewards/accuracies": 0.7876983880996704, "eval_rewards/chosen": -0.9566047787666321, "eval_rewards/margins": 1.0383244752883911, "eval_rewards/rejected": -1.9949294328689575, "eval_runtime": 203.7259, "eval_samples_per_second": 9.817, "eval_steps_per_second": 0.309, "step": 5400 }, { "epoch": 5.87, "grad_norm": 4.2379320026605525, "learning_rate": 5.636414190065275e-10, "logits/chosen": 3.383838176727295, "logits/rejected": 3.4722800254821777, "logps/chosen": -656.2587890625, "logps/rejected": -619.7723388671875, "loss": 0.4205, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9455728530883789, "rewards/margins": 1.1223266124725342, "rewards/rejected": -2.067899465560913, "step": 5410 }, { "epoch": 5.88, "grad_norm": 5.0826248897642925, "learning_rate": 4.706798292639647e-10, "logits/chosen": 3.1631453037261963, "logits/rejected": 3.2290291786193848, "logps/chosen": -642.7736206054688, "logps/rejected": -565.1218872070312, "loss": 0.4278, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.017159104347229, "rewards/margins": 1.0324034690856934, "rewards/rejected": -2.049562454223633, "step": 5420 }, { "epoch": 5.9, "grad_norm": 4.506571712558853, "learning_rate": 3.8608313534627123e-10, "logits/chosen": 3.4478847980499268, "logits/rejected": 3.4374423027038574, "logps/chosen": -635.98486328125, "logps/rejected": -633.1765747070312, "loss": 0.4308, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0356416702270508, "rewards/margins": 1.0700197219848633, "rewards/rejected": -2.105661392211914, "step": 5430 }, { "epoch": 5.91, "grad_norm": 4.114087218850236, "learning_rate": 3.0985417316273245e-10, "logits/chosen": 3.251316547393799, "logits/rejected": 3.267460346221924, "logps/chosen": -664.8427124023438, "logps/rejected": -629.3033447265625, "loss": 0.3975, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8826308250427246, "rewards/margins": 1.3063408136367798, "rewards/rejected": -2.188971757888794, "step": 5440 }, { "epoch": 5.92, "grad_norm": 5.291501525662143, "learning_rate": 2.419954981138228e-10, "logits/chosen": 3.3314995765686035, "logits/rejected": 3.2727248668670654, "logps/chosen": -593.9716796875, "logps/rejected": -667.9520263671875, "loss": 0.4233, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0151082277297974, "rewards/margins": 1.1485967636108398, "rewards/rejected": -2.1637048721313477, "step": 5450 }, { "epoch": 5.93, "grad_norm": 4.317501097756795, "learning_rate": 1.8250938500530277e-10, "logits/chosen": 3.2977919578552246, "logits/rejected": 3.280268430709839, "logps/chosen": -579.6351928710938, "logps/rejected": -610.71044921875, "loss": 0.4145, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.1247862577438354, "rewards/margins": 1.258470892906189, "rewards/rejected": -2.3832573890686035, "step": 5460 }, { "epoch": 5.94, "grad_norm": 5.746691069828419, "learning_rate": 1.3139782797216836e-10, "logits/chosen": 3.352397918701172, "logits/rejected": 3.2617995738983154, "logps/chosen": -612.7765502929688, "logps/rejected": -599.3033447265625, "loss": 0.4059, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9014495611190796, "rewards/margins": 1.2235716581344604, "rewards/rejected": -2.125021457672119, "step": 5470 }, { "epoch": 5.95, "grad_norm": 4.492518364835113, "learning_rate": 8.866254041164922e-11, "logits/chosen": 3.4882349967956543, "logits/rejected": 3.378361463546753, "logps/chosen": -639.9263916015625, "logps/rejected": -671.6799926757812, "loss": 0.4086, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8735315203666687, "rewards/margins": 1.053206205368042, "rewards/rejected": -1.9267380237579346, "step": 5480 }, { "epoch": 5.96, "grad_norm": 5.077523007074683, "learning_rate": 5.4304954925810106e-11, "logits/chosen": 3.406404972076416, "logits/rejected": 3.4217746257781982, "logps/chosen": -643.9853515625, "logps/rejected": -631.0284423828125, "loss": 0.4089, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.0984253883361816, "rewards/margins": 1.0895438194274902, "rewards/rejected": -2.187969446182251, "step": 5490 }, { "epoch": 5.97, "grad_norm": 4.752208507522833, "learning_rate": 2.8326223273644757e-11, "logits/chosen": 3.398366928100586, "logits/rejected": 3.468355655670166, "logps/chosen": -616.122802734375, "logps/rejected": -599.9295654296875, "loss": 0.4293, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9166873097419739, "rewards/margins": 1.245667815208435, "rewards/rejected": -2.1623549461364746, "step": 5500 }, { "epoch": 5.97, "eval_logits/chosen": 3.396428108215332, "eval_logits/rejected": 3.422976016998291, "eval_logps/chosen": -639.0445556640625, "eval_logps/rejected": -623.5548095703125, "eval_loss": 0.47562962770462036, "eval_rewards/accuracies": 0.7896825671195984, "eval_rewards/chosen": -0.9570770263671875, "eval_rewards/margins": 1.0377260446548462, "eval_rewards/rejected": -1.9948030710220337, "eval_runtime": 203.0418, "eval_samples_per_second": 9.85, "eval_steps_per_second": 0.31, "step": 5500 }, { "epoch": 5.98, "grad_norm": 4.5019522271063, "learning_rate": 1.0727216332273581e-11, "logits/chosen": 3.3246712684631348, "logits/rejected": 3.3786635398864746, "logps/chosen": -625.7957763671875, "logps/rejected": -603.4461669921875, "loss": 0.4024, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.7635302543640137, "rewards/margins": 1.106597900390625, "rewards/rejected": -1.8701282739639282, "step": 5510 }, { "epoch": 5.99, "grad_norm": 4.531423424387871, "learning_rate": 1.508524067855843e-12, "logits/chosen": 3.2934257984161377, "logits/rejected": 3.3364462852478027, "logps/chosen": -673.7186279296875, "logps/rejected": -613.6826782226562, "loss": 0.4039, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.0638152360916138, "rewards/margins": 1.1539263725280762, "rewards/rejected": -2.2177414894104004, "step": 5520 }, { "epoch": 6.0, "step": 5526, "total_flos": 0.0, "train_loss": 0.4762616708060343, "train_runtime": 91379.3933, "train_samples_per_second": 3.869, "train_steps_per_second": 0.06 } ], "logging_steps": 10, "max_steps": 5526, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }