{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.20013254392163232, "eval_steps": 5000, "global_step": 1900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001053329178534907, "grad_norm": 217.68748474121094, "learning_rate": 5.263157894736842e-06, "logits/chosen": -2.8389856815338135, "logits/rejected": -2.8396875858306885, "logps/chosen": -5.6633830070495605, "logps/rejected": -6.1086931228637695, "loss": 5.7387, "odds_ratio_loss": 12.78534984588623, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": -0.5663383603096008, "rewards/margins": 0.04453102499246597, "rewards/rejected": -0.6108693480491638, "sft_loss": 4.460188388824463, "step": 10 }, { "epoch": 0.002106658357069814, "grad_norm": 59.99885177612305, "learning_rate": 1.0526315789473684e-05, "logits/chosen": -3.0334506034851074, "logits/rejected": -3.034069299697876, "logps/chosen": -2.313845634460449, "logps/rejected": -3.318277597427368, "loss": 2.3687, "odds_ratio_loss": 3.849175214767456, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.23138457536697388, "rewards/margins": 0.10044321417808533, "rewards/rejected": -0.3318277597427368, "sft_loss": 1.9837348461151123, "step": 20 }, { "epoch": 0.0031599875356047207, "grad_norm": 21.754234313964844, "learning_rate": 1.5789473684210526e-05, "logits/chosen": -3.226034641265869, "logits/rejected": -3.2263522148132324, "logps/chosen": -1.0712947845458984, "logps/rejected": -2.3490335941314697, "loss": 1.1116, "odds_ratio_loss": 2.9476191997528076, "rewards/accuracies": 0.8083333373069763, "rewards/chosen": -0.10712946206331253, "rewards/margins": 0.12777391076087952, "rewards/rejected": -0.23490336537361145, "sft_loss": 0.816817581653595, "step": 30 }, { "epoch": 0.004213316714139628, "grad_norm": 7.782005310058594, "learning_rate": 2.105263157894737e-05, "logits/chosen": -3.4438676834106445, "logits/rejected": -3.4442031383514404, "logps/chosen": -0.8733291625976562, "logps/rejected": -2.5004069805145264, "loss": 0.914, "odds_ratio_loss": 3.1110053062438965, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08733292669057846, "rewards/margins": 0.16270779073238373, "rewards/rejected": -0.2500407099723816, "sft_loss": 0.6028769016265869, "step": 40 }, { "epoch": 0.0052666458926745345, "grad_norm": 9.219217300415039, "learning_rate": 2.631578947368421e-05, "logits/chosen": -3.4509921073913574, "logits/rejected": -3.451155662536621, "logps/chosen": -0.7789952158927917, "logps/rejected": -2.3700900077819824, "loss": 0.8189, "odds_ratio_loss": 2.722813367843628, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.0778995230793953, "rewards/margins": 0.1591094583272934, "rewards/rejected": -0.23700900375843048, "sft_loss": 0.5466489195823669, "step": 50 }, { "epoch": 0.006319975071209441, "grad_norm": 7.359012603759766, "learning_rate": 3.157894736842105e-05, "logits/chosen": -3.5323359966278076, "logits/rejected": -3.532480001449585, "logps/chosen": -0.7335668802261353, "logps/rejected": -2.5415773391723633, "loss": 0.7747, "odds_ratio_loss": 2.7141709327697754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07335668057203293, "rewards/margins": 0.18080104887485504, "rewards/rejected": -0.2541577219963074, "sft_loss": 0.5032956600189209, "step": 60 }, { "epoch": 0.007373304249744348, "grad_norm": 16.543472290039062, "learning_rate": 3.6842105263157895e-05, "logits/chosen": -3.2315316200256348, "logits/rejected": -3.2316715717315674, "logps/chosen": -0.6829439997673035, "logps/rejected": -2.482815980911255, "loss": 0.7205, "odds_ratio_loss": 2.6182825565338135, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.06829439848661423, "rewards/margins": 0.17998717725276947, "rewards/rejected": -0.2482815831899643, "sft_loss": 0.4587023854255676, "step": 70 }, { "epoch": 0.008426633428279255, "grad_norm": 5.6111907958984375, "learning_rate": 4.210526315789474e-05, "logits/chosen": -2.9154586791992188, "logits/rejected": -2.9156744480133057, "logps/chosen": -0.7533618807792664, "logps/rejected": -2.669802665710449, "loss": 0.7912, "odds_ratio_loss": 2.688143730163574, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.07533618807792664, "rewards/margins": 0.19164405763149261, "rewards/rejected": -0.26698026061058044, "sft_loss": 0.5223438739776611, "step": 80 }, { "epoch": 0.009479962606814161, "grad_norm": 3.4394092559814453, "learning_rate": 4.736842105263158e-05, "logits/chosen": -3.0740275382995605, "logits/rejected": -3.0744776725769043, "logps/chosen": -0.6786951422691345, "logps/rejected": -3.1611907482147217, "loss": 0.7145, "odds_ratio_loss": 2.766305446624756, "rewards/accuracies": 0.8270833492279053, "rewards/chosen": -0.06786951422691345, "rewards/margins": 0.2482495754957199, "rewards/rejected": -0.31611910462379456, "sft_loss": 0.4378434419631958, "step": 90 }, { "epoch": 0.010533291785349069, "grad_norm": 9.483699798583984, "learning_rate": 5.263157894736842e-05, "logits/chosen": -3.197289228439331, "logits/rejected": -3.19758677482605, "logps/chosen": -0.6806226372718811, "logps/rejected": -3.0961530208587646, "loss": 0.7172, "odds_ratio_loss": 2.700289726257324, "rewards/accuracies": 0.84375, "rewards/chosen": -0.06806226074695587, "rewards/margins": 0.24155308306217194, "rewards/rejected": -0.3096153140068054, "sft_loss": 0.44717150926589966, "step": 100 }, { "epoch": 0.011586620963883975, "grad_norm": 7.847203731536865, "learning_rate": 5.789473684210527e-05, "logits/chosen": -3.01198673248291, "logits/rejected": -3.0121328830718994, "logps/chosen": -0.7313061356544495, "logps/rejected": -3.1134066581726074, "loss": 0.7701, "odds_ratio_loss": 2.80288028717041, "rewards/accuracies": 0.8416666388511658, "rewards/chosen": -0.07313062250614166, "rewards/margins": 0.2382100373506546, "rewards/rejected": -0.31134068965911865, "sft_loss": 0.489812970161438, "step": 110 }, { "epoch": 0.012639950142418883, "grad_norm": 9.357610702514648, "learning_rate": 6.31578947368421e-05, "logits/chosen": -2.793389320373535, "logits/rejected": -2.793593406677246, "logps/chosen": -0.744411826133728, "logps/rejected": -3.2574427127838135, "loss": 0.781, "odds_ratio_loss": 2.4383304119110107, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.07444119453430176, "rewards/margins": 0.251303106546402, "rewards/rejected": -0.32574430108070374, "sft_loss": 0.5372061133384705, "step": 120 }, { "epoch": 0.013693279320953789, "grad_norm": 4.926955699920654, "learning_rate": 6.842105263157896e-05, "logits/chosen": -2.829728603363037, "logits/rejected": -2.829943895339966, "logps/chosen": -0.5513553619384766, "logps/rejected": -2.877117872238159, "loss": 0.5817, "odds_ratio_loss": 2.001354694366455, "rewards/accuracies": 0.875, "rewards/chosen": -0.055135536938905716, "rewards/margins": 0.23257622122764587, "rewards/rejected": -0.2877117693424225, "sft_loss": 0.38156595826148987, "step": 130 }, { "epoch": 0.014746608499488697, "grad_norm": 8.740342140197754, "learning_rate": 7.368421052631579e-05, "logits/chosen": -2.9767470359802246, "logits/rejected": -2.9770071506500244, "logps/chosen": -0.5641220211982727, "logps/rejected": -4.819352149963379, "loss": 0.589, "odds_ratio_loss": 2.0714950561523438, "rewards/accuracies": 0.8833333253860474, "rewards/chosen": -0.05641220510005951, "rewards/margins": 0.4255230724811554, "rewards/rejected": -0.48193517327308655, "sft_loss": 0.3818718194961548, "step": 140 }, { "epoch": 0.015799937678023603, "grad_norm": 21.316011428833008, "learning_rate": 7.894736842105263e-05, "logits/chosen": -2.9424824714660645, "logits/rejected": -2.942646026611328, "logps/chosen": -0.6596536040306091, "logps/rejected": -5.643968105316162, "loss": 0.6873, "odds_ratio_loss": 2.2379026412963867, "rewards/accuracies": 0.8833333253860474, "rewards/chosen": -0.06596536934375763, "rewards/margins": 0.4984314739704132, "rewards/rejected": -0.5643967986106873, "sft_loss": 0.46346980333328247, "step": 150 }, { "epoch": 0.01685326685655851, "grad_norm": 8.542074203491211, "learning_rate": 8.421052631578948e-05, "logits/chosen": -2.788573741912842, "logits/rejected": -2.7887728214263916, "logps/chosen": -0.5694094300270081, "logps/rejected": -4.549073219299316, "loss": 0.5953, "odds_ratio_loss": 2.08962082862854, "rewards/accuracies": 0.9083333611488342, "rewards/chosen": -0.05694093927741051, "rewards/margins": 0.3979664146900177, "rewards/rejected": -0.4549073278903961, "sft_loss": 0.3863413631916046, "step": 160 }, { "epoch": 0.017906596035093418, "grad_norm": 10.086228370666504, "learning_rate": 8.947368421052632e-05, "logits/chosen": -2.747136354446411, "logits/rejected": -2.7473561763763428, "logps/chosen": -0.5511812567710876, "logps/rejected": -4.04136848449707, "loss": 0.5864, "odds_ratio_loss": 2.00999116897583, "rewards/accuracies": 0.8479166626930237, "rewards/chosen": -0.0551181361079216, "rewards/margins": 0.34901875257492065, "rewards/rejected": -0.40413692593574524, "sft_loss": 0.3853704035282135, "step": 170 }, { "epoch": 0.018959925213628322, "grad_norm": 6.374107837677002, "learning_rate": 9.473684210526316e-05, "logits/chosen": -2.6502909660339355, "logits/rejected": -2.650648593902588, "logps/chosen": -0.5040014982223511, "logps/rejected": -4.205493450164795, "loss": 0.5236, "odds_ratio_loss": 1.6211795806884766, "rewards/accuracies": 0.9208333492279053, "rewards/chosen": -0.050400152802467346, "rewards/margins": 0.3701492249965668, "rewards/rejected": -0.4205494225025177, "sft_loss": 0.3614722490310669, "step": 180 }, { "epoch": 0.02001325439216323, "grad_norm": 2.717724323272705, "learning_rate": 0.0001, "logits/chosen": -2.7677037715911865, "logits/rejected": -2.7680699825286865, "logps/chosen": -0.5834678411483765, "logps/rejected": -4.945567607879639, "loss": 0.6101, "odds_ratio_loss": 2.1636292934417725, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.05834679678082466, "rewards/margins": 0.4362100064754486, "rewards/rejected": -0.49455681443214417, "sft_loss": 0.39376381039619446, "step": 190 }, { "epoch": 0.021066583570698138, "grad_norm": 6.805058479309082, "learning_rate": 0.00010526315789473683, "logits/chosen": -2.722885847091675, "logits/rejected": -2.723130226135254, "logps/chosen": -0.6606806516647339, "logps/rejected": -4.050003528594971, "loss": 0.6941, "odds_ratio_loss": 2.383312940597534, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.06606806069612503, "rewards/margins": 0.3389323055744171, "rewards/rejected": -0.4050002992153168, "sft_loss": 0.4557226896286011, "step": 200 }, { "epoch": 0.022119912749233046, "grad_norm": 5.377903938293457, "learning_rate": 0.00011052631578947368, "logits/chosen": -2.777975559234619, "logits/rejected": -2.778069257736206, "logps/chosen": -0.663443922996521, "logps/rejected": -3.8639333248138428, "loss": 0.69, "odds_ratio_loss": 1.9626834392547607, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.06634439527988434, "rewards/margins": 0.3200489580631256, "rewards/rejected": -0.38639336824417114, "sft_loss": 0.4936945140361786, "step": 210 }, { "epoch": 0.02317324192776795, "grad_norm": 11.14415454864502, "learning_rate": 0.00011578947368421053, "logits/chosen": -2.69752836227417, "logits/rejected": -2.6975817680358887, "logps/chosen": -0.7092010378837585, "logps/rejected": -3.6659300327301025, "loss": 0.7441, "odds_ratio_loss": 2.298079013824463, "rewards/accuracies": 0.8354166746139526, "rewards/chosen": -0.07092010229825974, "rewards/margins": 0.29567286372184753, "rewards/rejected": -0.36659297347068787, "sft_loss": 0.5143173933029175, "step": 220 }, { "epoch": 0.024226571106302858, "grad_norm": 17.70037269592285, "learning_rate": 0.00012105263157894738, "logits/chosen": -2.8756678104400635, "logits/rejected": -2.875657796859741, "logps/chosen": -0.639348566532135, "logps/rejected": -4.930140018463135, "loss": 0.6653, "odds_ratio_loss": 2.042109489440918, "rewards/accuracies": 0.8958333134651184, "rewards/chosen": -0.06393485516309738, "rewards/margins": 0.42907920479774475, "rewards/rejected": -0.49301406741142273, "sft_loss": 0.4610413908958435, "step": 230 }, { "epoch": 0.025279900284837765, "grad_norm": 22.880346298217773, "learning_rate": 0.0001263157894736842, "logits/chosen": -3.3157336711883545, "logits/rejected": -3.3157567977905273, "logps/chosen": -1.2456268072128296, "logps/rejected": -5.491827964782715, "loss": 1.3155, "odds_ratio_loss": 4.847614765167236, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.12456268817186356, "rewards/margins": 0.4246201515197754, "rewards/rejected": -0.5491827726364136, "sft_loss": 0.8306990265846252, "step": 240 }, { "epoch": 0.026333229463372673, "grad_norm": 5.729049205780029, "learning_rate": 0.00013157894736842105, "logits/chosen": -2.5473344326019287, "logits/rejected": -2.5469789505004883, "logps/chosen": -0.749646008014679, "logps/rejected": -6.155911445617676, "loss": 0.7811, "odds_ratio_loss": 2.4923181533813477, "rewards/accuracies": 0.8645833134651184, "rewards/chosen": -0.07496459782123566, "rewards/margins": 0.5406264662742615, "rewards/rejected": -0.6155910491943359, "sft_loss": 0.5319061875343323, "step": 250 }, { "epoch": 0.027386558641907578, "grad_norm": 11.868535995483398, "learning_rate": 0.00013684210526315792, "logits/chosen": -2.9207849502563477, "logits/rejected": -2.9205822944641113, "logps/chosen": -0.8412100076675415, "logps/rejected": -5.425389289855957, "loss": 0.8748, "odds_ratio_loss": 3.032189130783081, "rewards/accuracies": 0.8479166626930237, "rewards/chosen": -0.08412099629640579, "rewards/margins": 0.4584178924560547, "rewards/rejected": -0.5425389409065247, "sft_loss": 0.5715639591217041, "step": 260 }, { "epoch": 0.028439887820442485, "grad_norm": 9.387285232543945, "learning_rate": 0.00014210526315789474, "logits/chosen": -3.1293420791625977, "logits/rejected": -3.129204273223877, "logps/chosen": -0.7858380079269409, "logps/rejected": -4.375970363616943, "loss": 0.8281, "odds_ratio_loss": 2.820624351501465, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.07858379930257797, "rewards/margins": 0.35901322960853577, "rewards/rejected": -0.43759700655937195, "sft_loss": 0.5460221171379089, "step": 270 }, { "epoch": 0.029493216998977393, "grad_norm": 5.87777042388916, "learning_rate": 0.00014736842105263158, "logits/chosen": -2.9154765605926514, "logits/rejected": -2.91546368598938, "logps/chosen": -0.595488965511322, "logps/rejected": -3.7737627029418945, "loss": 0.6283, "odds_ratio_loss": 2.1527862548828125, "rewards/accuracies": 0.8604166507720947, "rewards/chosen": -0.05954889953136444, "rewards/margins": 0.31782734394073486, "rewards/rejected": -0.3773762583732605, "sft_loss": 0.4130483567714691, "step": 280 }, { "epoch": 0.0305465461775123, "grad_norm": 10.7889986038208, "learning_rate": 0.00015263157894736842, "logits/chosen": -3.0789036750793457, "logits/rejected": -3.079068899154663, "logps/chosen": -0.5851417779922485, "logps/rejected": -3.892369031906128, "loss": 0.6135, "odds_ratio_loss": 2.1990480422973633, "rewards/accuracies": 0.8791666626930237, "rewards/chosen": -0.05851416662335396, "rewards/margins": 0.33072274923324585, "rewards/rejected": -0.3892369568347931, "sft_loss": 0.3936450183391571, "step": 290 }, { "epoch": 0.031599875356047205, "grad_norm": 10.757394790649414, "learning_rate": 0.00015789473684210527, "logits/chosen": -3.1641581058502197, "logits/rejected": -3.1641595363616943, "logps/chosen": -0.625824511051178, "logps/rejected": -4.4615254402160645, "loss": 0.6551, "odds_ratio_loss": 2.074664354324341, "rewards/accuracies": 0.8708333373069763, "rewards/chosen": -0.06258244812488556, "rewards/margins": 0.3835701644420624, "rewards/rejected": -0.44615259766578674, "sft_loss": 0.44765299558639526, "step": 300 }, { "epoch": 0.03265320453458211, "grad_norm": 8.260001182556152, "learning_rate": 0.0001631578947368421, "logits/chosen": -3.1696364879608154, "logits/rejected": -3.169647693634033, "logps/chosen": -0.6334646940231323, "logps/rejected": -4.125209331512451, "loss": 0.6653, "odds_ratio_loss": 2.147244691848755, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.06334646791219711, "rewards/margins": 0.34917446970939636, "rewards/rejected": -0.4125209450721741, "sft_loss": 0.45062482357025146, "step": 310 }, { "epoch": 0.03370653371311702, "grad_norm": 17.23076057434082, "learning_rate": 0.00016842105263157895, "logits/chosen": -3.175907611846924, "logits/rejected": -3.175673007965088, "logps/chosen": -0.8329946994781494, "logps/rejected": -4.718179702758789, "loss": 0.8649, "odds_ratio_loss": 2.6120009422302246, "rewards/accuracies": 0.8729166388511658, "rewards/chosen": -0.08329946547746658, "rewards/margins": 0.3885185122489929, "rewards/rejected": -0.4718180298805237, "sft_loss": 0.6036695837974548, "step": 320 }, { "epoch": 0.03475986289165193, "grad_norm": 8.57013988494873, "learning_rate": 0.0001736842105263158, "logits/chosen": -2.885397434234619, "logits/rejected": -2.8853094577789307, "logps/chosen": -0.7634103298187256, "logps/rejected": -3.892472982406616, "loss": 0.7919, "odds_ratio_loss": 2.4835994243621826, "rewards/accuracies": 0.8770833611488342, "rewards/chosen": -0.07634103298187256, "rewards/margins": 0.31290626525878906, "rewards/rejected": -0.389247328042984, "sft_loss": 0.5435259938240051, "step": 330 }, { "epoch": 0.035813192070186836, "grad_norm": 6.161098957061768, "learning_rate": 0.00017894736842105264, "logits/chosen": -2.7474565505981445, "logits/rejected": -2.747159004211426, "logps/chosen": -0.6635507345199585, "logps/rejected": -4.673018455505371, "loss": 0.6934, "odds_ratio_loss": 2.4976134300231934, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.06635507941246033, "rewards/margins": 0.4009467363357544, "rewards/rejected": -0.4673018753528595, "sft_loss": 0.44364967942237854, "step": 340 }, { "epoch": 0.036866521248721744, "grad_norm": 6.6998372077941895, "learning_rate": 0.00018421052631578948, "logits/chosen": -3.3279902935028076, "logits/rejected": -3.327728509902954, "logps/chosen": -0.7464654445648193, "logps/rejected": -4.424903869628906, "loss": 0.7811, "odds_ratio_loss": 2.4744033813476562, "rewards/accuracies": 0.8270833492279053, "rewards/chosen": -0.07464654743671417, "rewards/margins": 0.3678438365459442, "rewards/rejected": -0.4424903988838196, "sft_loss": 0.5336239337921143, "step": 350 }, { "epoch": 0.037919850427256645, "grad_norm": 4.3592400550842285, "learning_rate": 0.00018947368421052632, "logits/chosen": -3.632689952850342, "logits/rejected": -3.6323180198669434, "logps/chosen": -0.6197668313980103, "logps/rejected": -5.021815299987793, "loss": 0.647, "odds_ratio_loss": 2.0902163982391357, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.061976686120033264, "rewards/margins": 0.44020482897758484, "rewards/rejected": -0.5021815299987793, "sft_loss": 0.43794170022010803, "step": 360 }, { "epoch": 0.03897317960579155, "grad_norm": 9.133977890014648, "learning_rate": 0.00019473684210526317, "logits/chosen": -3.6677591800689697, "logits/rejected": -3.667369842529297, "logps/chosen": -0.6474730372428894, "logps/rejected": -5.439915180206299, "loss": 0.6779, "odds_ratio_loss": 2.1945674419403076, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -0.06474730372428894, "rewards/margins": 0.4792442321777344, "rewards/rejected": -0.5439915060997009, "sft_loss": 0.45841965079307556, "step": 370 }, { "epoch": 0.04002650878432646, "grad_norm": 9.33304214477539, "learning_rate": 0.0002, "logits/chosen": -3.5276687145233154, "logits/rejected": -3.527397632598877, "logps/chosen": -0.6519566178321838, "logps/rejected": -4.4450178146362305, "loss": 0.6857, "odds_ratio_loss": 2.411801338195801, "rewards/accuracies": 0.84375, "rewards/chosen": -0.06519566476345062, "rewards/margins": 0.3793061375617981, "rewards/rejected": -0.4445018172264099, "sft_loss": 0.44453203678131104, "step": 380 }, { "epoch": 0.04107983796286137, "grad_norm": 8.269370079040527, "learning_rate": 0.00020526315789473685, "logits/chosen": -3.345468282699585, "logits/rejected": -3.3452601432800293, "logps/chosen": -0.7482808232307434, "logps/rejected": -4.133052349090576, "loss": 0.7849, "odds_ratio_loss": 2.473043918609619, "rewards/accuracies": 0.8395833373069763, "rewards/chosen": -0.07482809573411942, "rewards/margins": 0.33847716450691223, "rewards/rejected": -0.41330528259277344, "sft_loss": 0.5376084446907043, "step": 390 }, { "epoch": 0.042133167141396276, "grad_norm": 3.1917130947113037, "learning_rate": 0.00021052631578947367, "logits/chosen": -3.498554229736328, "logits/rejected": -3.4982807636260986, "logps/chosen": -0.6910140514373779, "logps/rejected": -4.305008888244629, "loss": 0.7229, "odds_ratio_loss": 2.5834543704986572, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.06910141557455063, "rewards/margins": 0.36139950156211853, "rewards/rejected": -0.43050095438957214, "sft_loss": 0.46452444791793823, "step": 400 }, { "epoch": 0.043186496319931184, "grad_norm": 8.981714248657227, "learning_rate": 0.00021578947368421054, "logits/chosen": -3.482508420944214, "logits/rejected": -3.482311725616455, "logps/chosen": -0.666519045829773, "logps/rejected": -3.838574171066284, "loss": 0.7004, "odds_ratio_loss": 2.59932017326355, "rewards/accuracies": 0.84375, "rewards/chosen": -0.06665190309286118, "rewards/margins": 0.3172055184841156, "rewards/rejected": -0.3838574290275574, "sft_loss": 0.4405144453048706, "step": 410 }, { "epoch": 0.04423982549846609, "grad_norm": 5.946087837219238, "learning_rate": 0.00022105263157894735, "logits/chosen": -3.5680463314056396, "logits/rejected": -3.5679588317871094, "logps/chosen": -0.6861178874969482, "logps/rejected": -3.294382333755493, "loss": 0.7209, "odds_ratio_loss": 2.4581611156463623, "rewards/accuracies": 0.8520833253860474, "rewards/chosen": -0.06861178576946259, "rewards/margins": 0.26082643866539, "rewards/rejected": -0.3294382095336914, "sft_loss": 0.4750979244709015, "step": 420 }, { "epoch": 0.045293154677001, "grad_norm": 16.975387573242188, "learning_rate": 0.00022631578947368422, "logits/chosen": -3.8716492652893066, "logits/rejected": -3.871539831161499, "logps/chosen": -0.7186715602874756, "logps/rejected": -3.2046849727630615, "loss": 0.7559, "odds_ratio_loss": 2.6692261695861816, "rewards/accuracies": 0.8458333611488342, "rewards/chosen": -0.07186715304851532, "rewards/margins": 0.24860134720802307, "rewards/rejected": -0.3204684853553772, "sft_loss": 0.48894843459129333, "step": 430 }, { "epoch": 0.0463464838555359, "grad_norm": 3.843916416168213, "learning_rate": 0.00023157894736842107, "logits/chosen": -3.794214963912964, "logits/rejected": -3.794062852859497, "logps/chosen": -0.6966003179550171, "logps/rejected": -3.6082844734191895, "loss": 0.7316, "odds_ratio_loss": 2.6561334133148193, "rewards/accuracies": 0.8479166626930237, "rewards/chosen": -0.06966003775596619, "rewards/margins": 0.2911684215068817, "rewards/rejected": -0.3608284592628479, "sft_loss": 0.4659655690193176, "step": 440 }, { "epoch": 0.04739981303407081, "grad_norm": 14.617210388183594, "learning_rate": 0.00023684210526315788, "logits/chosen": -3.84993052482605, "logits/rejected": -3.8500382900238037, "logps/chosen": -0.7132828831672668, "logps/rejected": -3.116370916366577, "loss": 0.7449, "odds_ratio_loss": 2.349879264831543, "rewards/accuracies": 0.8583333492279053, "rewards/chosen": -0.0713282972574234, "rewards/margins": 0.24030880630016327, "rewards/rejected": -0.31163710355758667, "sft_loss": 0.5099204182624817, "step": 450 }, { "epoch": 0.048453142212605715, "grad_norm": 15.630524635314941, "learning_rate": 0.00024210526315789475, "logits/chosen": -4.3313679695129395, "logits/rejected": -4.331648349761963, "logps/chosen": -0.7833544611930847, "logps/rejected": -2.8526246547698975, "loss": 0.8191, "odds_ratio_loss": 2.5849623680114746, "rewards/accuracies": 0.8354166746139526, "rewards/chosen": -0.07833544164896011, "rewards/margins": 0.20692706108093262, "rewards/rejected": -0.28526249527931213, "sft_loss": 0.5606356263160706, "step": 460 }, { "epoch": 0.04950647139114062, "grad_norm": 4.825496196746826, "learning_rate": 0.0002473684210526316, "logits/chosen": -4.020305156707764, "logits/rejected": -4.020514965057373, "logps/chosen": -0.7084909677505493, "logps/rejected": -2.901973009109497, "loss": 0.745, "odds_ratio_loss": 2.5733158588409424, "rewards/accuracies": 0.8479166626930237, "rewards/chosen": -0.07084909081459045, "rewards/margins": 0.21934820711612701, "rewards/rejected": -0.29019731283187866, "sft_loss": 0.48766499757766724, "step": 470 }, { "epoch": 0.05055980056967553, "grad_norm": 6.267645835876465, "learning_rate": 0.0002526315789473684, "logits/chosen": -3.936088800430298, "logits/rejected": -3.936236619949341, "logps/chosen": -0.7358769774436951, "logps/rejected": -2.6652441024780273, "loss": 0.7689, "odds_ratio_loss": 2.5294764041900635, "rewards/accuracies": 0.8541666865348816, "rewards/chosen": -0.07358769327402115, "rewards/margins": 0.1929367184638977, "rewards/rejected": -0.26652440428733826, "sft_loss": 0.5159851312637329, "step": 480 }, { "epoch": 0.05161312974821044, "grad_norm": 7.438229084014893, "learning_rate": 0.0002578947368421053, "logits/chosen": -4.008545875549316, "logits/rejected": -4.008641242980957, "logps/chosen": -0.7306921482086182, "logps/rejected": -2.7273244857788086, "loss": 0.7645, "odds_ratio_loss": 2.6694443225860596, "rewards/accuracies": 0.8291666507720947, "rewards/chosen": -0.07306921482086182, "rewards/margins": 0.1996632218360901, "rewards/rejected": -0.2727324366569519, "sft_loss": 0.49753716588020325, "step": 490 }, { "epoch": 0.052666458926745346, "grad_norm": 5.6936469078063965, "learning_rate": 0.0002631578947368421, "logits/chosen": -3.8969054222106934, "logits/rejected": -3.896923303604126, "logps/chosen": -0.7155380249023438, "logps/rejected": -3.1710591316223145, "loss": 0.7467, "odds_ratio_loss": 2.616429567337036, "rewards/accuracies": 0.8729166388511658, "rewards/chosen": -0.07155381143093109, "rewards/margins": 0.24555210769176483, "rewards/rejected": -0.3171059191226959, "sft_loss": 0.48508700728416443, "step": 500 }, { "epoch": 0.053719788105280254, "grad_norm": 4.272115230560303, "learning_rate": 0.00026842105263157897, "logits/chosen": -3.8689732551574707, "logits/rejected": -3.868974208831787, "logps/chosen": -0.6541014313697815, "logps/rejected": -3.1265270709991455, "loss": 0.6821, "odds_ratio_loss": 2.2681777477264404, "rewards/accuracies": 0.8854166865348816, "rewards/chosen": -0.06541014462709427, "rewards/margins": 0.2472425401210785, "rewards/rejected": -0.31265270709991455, "sft_loss": 0.4552646279335022, "step": 510 }, { "epoch": 0.054773117283815155, "grad_norm": 5.510837078094482, "learning_rate": 0.00027368421052631584, "logits/chosen": -3.8620245456695557, "logits/rejected": -3.862044334411621, "logps/chosen": -0.6386537551879883, "logps/rejected": -3.322967767715454, "loss": 0.6712, "odds_ratio_loss": 2.311323881149292, "rewards/accuracies": 0.8583333492279053, "rewards/chosen": -0.06386537849903107, "rewards/margins": 0.2684313654899597, "rewards/rejected": -0.33229681849479675, "sft_loss": 0.4400910437107086, "step": 520 }, { "epoch": 0.05582644646235006, "grad_norm": 12.38877010345459, "learning_rate": 0.0002789473684210526, "logits/chosen": -4.60584020614624, "logits/rejected": -4.6057939529418945, "logps/chosen": -0.7113536596298218, "logps/rejected": -3.330021381378174, "loss": 0.7496, "odds_ratio_loss": 2.427239179611206, "rewards/accuracies": 0.8354166746139526, "rewards/chosen": -0.07113537192344666, "rewards/margins": 0.2618667781352997, "rewards/rejected": -0.33300215005874634, "sft_loss": 0.5068832635879517, "step": 530 }, { "epoch": 0.05687977564088497, "grad_norm": 2.2653727531433105, "learning_rate": 0.00028421052631578947, "logits/chosen": -5.099688529968262, "logits/rejected": -5.09957218170166, "logps/chosen": -0.6874160170555115, "logps/rejected": -3.100078582763672, "loss": 0.7231, "odds_ratio_loss": 2.652129650115967, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.06874160468578339, "rewards/margins": 0.24126628041267395, "rewards/rejected": -0.31000787019729614, "sft_loss": 0.45785781741142273, "step": 540 }, { "epoch": 0.05793310481941988, "grad_norm": 6.484382152557373, "learning_rate": 0.00028947368421052634, "logits/chosen": -4.181884288787842, "logits/rejected": -4.181893348693848, "logps/chosen": -0.7184228897094727, "logps/rejected": -3.2252790927886963, "loss": 0.7563, "odds_ratio_loss": 2.4872865676879883, "rewards/accuracies": 0.8458333611488342, "rewards/chosen": -0.07184228301048279, "rewards/margins": 0.25068560242652893, "rewards/rejected": -0.3225278854370117, "sft_loss": 0.5075890421867371, "step": 550 }, { "epoch": 0.058986433997954786, "grad_norm": 6.237575531005859, "learning_rate": 0.00029473684210526316, "logits/chosen": -4.042048931121826, "logits/rejected": -4.042147159576416, "logps/chosen": -0.6820612549781799, "logps/rejected": -2.6241307258605957, "loss": 0.7178, "odds_ratio_loss": 2.6058013439178467, "rewards/accuracies": 0.8604166507720947, "rewards/chosen": -0.06820613890886307, "rewards/margins": 0.19420695304870605, "rewards/rejected": -0.26241305470466614, "sft_loss": 0.45718762278556824, "step": 560 }, { "epoch": 0.060039763176489694, "grad_norm": 5.729897499084473, "learning_rate": 0.0003, "logits/chosen": -3.9665284156799316, "logits/rejected": -3.966668128967285, "logps/chosen": -0.7161160111427307, "logps/rejected": -2.8060250282287598, "loss": 0.7529, "odds_ratio_loss": 2.4470136165618896, "rewards/accuracies": 0.8520833253860474, "rewards/chosen": -0.07161159813404083, "rewards/margins": 0.20899087190628052, "rewards/rejected": -0.28060245513916016, "sft_loss": 0.5082017779350281, "step": 570 }, { "epoch": 0.0610930923550246, "grad_norm": 5.065602779388428, "learning_rate": 0.00030526315789473684, "logits/chosen": -3.9091081619262695, "logits/rejected": -3.9092376232147217, "logps/chosen": -0.6755971908569336, "logps/rejected": -2.8741674423217773, "loss": 0.7078, "odds_ratio_loss": 2.4316818714141846, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.06755972653627396, "rewards/margins": 0.21985705196857452, "rewards/rejected": -0.2874167859554291, "sft_loss": 0.4646414816379547, "step": 580 }, { "epoch": 0.06214642153355951, "grad_norm": 2.45158314704895, "learning_rate": 0.0003105263157894737, "logits/chosen": -3.9886550903320312, "logits/rejected": -3.9887642860412598, "logps/chosen": -0.649567186832428, "logps/rejected": -3.0265567302703857, "loss": 0.6791, "odds_ratio_loss": 2.3147711753845215, "rewards/accuracies": 0.8729166388511658, "rewards/chosen": -0.06495673209428787, "rewards/margins": 0.2376989722251892, "rewards/rejected": -0.3026556670665741, "sft_loss": 0.4476209580898285, "step": 590 }, { "epoch": 0.06319975071209441, "grad_norm": 15.312357902526855, "learning_rate": 0.00031578947368421053, "logits/chosen": -3.9752919673919678, "logits/rejected": -3.9754388332366943, "logps/chosen": -0.696826159954071, "logps/rejected": -3.0343518257141113, "loss": 0.732, "odds_ratio_loss": 2.443164587020874, "rewards/accuracies": 0.8479166626930237, "rewards/chosen": -0.06968262046575546, "rewards/margins": 0.23375259339809418, "rewards/rejected": -0.30343523621559143, "sft_loss": 0.48767518997192383, "step": 600 }, { "epoch": 0.06425307989062932, "grad_norm": 9.758230209350586, "learning_rate": 0.0003210526315789474, "logits/chosen": -3.76411509513855, "logits/rejected": -3.763446092605591, "logps/chosen": -0.7242849469184875, "logps/rejected": -5.800142288208008, "loss": 0.7592, "odds_ratio_loss": 2.657700300216675, "rewards/accuracies": 0.8458333611488342, "rewards/chosen": -0.07242848724126816, "rewards/margins": 0.5075857639312744, "rewards/rejected": -0.5800142884254456, "sft_loss": 0.49342209100723267, "step": 610 }, { "epoch": 0.06530640906916423, "grad_norm": 7.555414199829102, "learning_rate": 0.0003263157894736842, "logits/chosen": -4.165302753448486, "logits/rejected": -4.1650519371032715, "logps/chosen": -0.7384843230247498, "logps/rejected": -3.710164785385132, "loss": 0.7768, "odds_ratio_loss": 2.5719528198242188, "rewards/accuracies": 0.8520833253860474, "rewards/chosen": -0.0738484337925911, "rewards/margins": 0.297168105840683, "rewards/rejected": -0.3710165023803711, "sft_loss": 0.5196101665496826, "step": 620 }, { "epoch": 0.06635973824769914, "grad_norm": 4.273881435394287, "learning_rate": 0.00033157894736842103, "logits/chosen": -4.187811374664307, "logits/rejected": -4.186800479888916, "logps/chosen": -0.6481006145477295, "logps/rejected": -5.178854942321777, "loss": 0.6803, "odds_ratio_loss": 2.3059561252593994, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.06481005996465683, "rewards/margins": 0.45307546854019165, "rewards/rejected": -0.5178855061531067, "sft_loss": 0.4497505724430084, "step": 630 }, { "epoch": 0.06741306742623404, "grad_norm": 6.665101528167725, "learning_rate": 0.0003368421052631579, "logits/chosen": -4.168524265289307, "logits/rejected": -4.1669230461120605, "logps/chosen": -0.658748984336853, "logps/rejected": -6.14946174621582, "loss": 0.6936, "odds_ratio_loss": 2.615469455718994, "rewards/accuracies": 0.8520833253860474, "rewards/chosen": -0.06587490439414978, "rewards/margins": 0.5490713715553284, "rewards/rejected": -0.614946186542511, "sft_loss": 0.432014137506485, "step": 640 }, { "epoch": 0.06846639660476894, "grad_norm": 5.859743118286133, "learning_rate": 0.00034210526315789477, "logits/chosen": -4.12244176864624, "logits/rejected": -4.120962619781494, "logps/chosen": -0.703795850276947, "logps/rejected": -5.927857875823975, "loss": 0.739, "odds_ratio_loss": 2.5700552463531494, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.0703795999288559, "rewards/margins": 0.5224061608314514, "rewards/rejected": -0.5927857756614685, "sft_loss": 0.48198458552360535, "step": 650 }, { "epoch": 0.06951972578330386, "grad_norm": 3.937659502029419, "learning_rate": 0.0003473684210526316, "logits/chosen": -4.144687175750732, "logits/rejected": -4.143020153045654, "logps/chosen": -0.6716140508651733, "logps/rejected": -6.169389247894287, "loss": 0.704, "odds_ratio_loss": 2.5956499576568604, "rewards/accuracies": 0.8583333492279053, "rewards/chosen": -0.06716141104698181, "rewards/margins": 0.549777626991272, "rewards/rejected": -0.6169389486312866, "sft_loss": 0.44441157579421997, "step": 660 }, { "epoch": 0.07057305496183876, "grad_norm": 4.0990681648254395, "learning_rate": 0.0003526315789473684, "logits/chosen": -4.208241939544678, "logits/rejected": -4.206976413726807, "logps/chosen": -0.6446244120597839, "logps/rejected": -5.427404403686523, "loss": 0.6791, "odds_ratio_loss": 2.412100315093994, "rewards/accuracies": 0.8520833253860474, "rewards/chosen": -0.06446244567632675, "rewards/margins": 0.4782780110836029, "rewards/rejected": -0.5427404642105103, "sft_loss": 0.4378568232059479, "step": 670 }, { "epoch": 0.07162638414037367, "grad_norm": 4.258831977844238, "learning_rate": 0.0003578947368421053, "logits/chosen": -4.341937065124512, "logits/rejected": -4.341104984283447, "logps/chosen": -0.7450679540634155, "logps/rejected": -4.367857933044434, "loss": 0.7874, "odds_ratio_loss": 2.708036184310913, "rewards/accuracies": 0.8083333373069763, "rewards/chosen": -0.07450678944587708, "rewards/margins": 0.3622789978981018, "rewards/rejected": -0.4367857873439789, "sft_loss": 0.5165507793426514, "step": 680 }, { "epoch": 0.07267971331890857, "grad_norm": 10.723002433776855, "learning_rate": 0.00036315789473684214, "logits/chosen": -4.344449996948242, "logits/rejected": -4.344136714935303, "logps/chosen": -0.8118324279785156, "logps/rejected": -3.4473140239715576, "loss": 0.852, "odds_ratio_loss": 2.871811628341675, "rewards/accuracies": 0.8104166388511658, "rewards/chosen": -0.08118324726819992, "rewards/margins": 0.26354819536209106, "rewards/rejected": -0.3447313904762268, "sft_loss": 0.5648209452629089, "step": 690 }, { "epoch": 0.07373304249744349, "grad_norm": 5.821114540100098, "learning_rate": 0.00036842105263157896, "logits/chosen": -4.07045316696167, "logits/rejected": -4.069707870483398, "logps/chosen": -0.8850536942481995, "logps/rejected": -5.450161933898926, "loss": 0.9181, "odds_ratio_loss": 3.165844678878784, "rewards/accuracies": 0.8416666388511658, "rewards/chosen": -0.08850537240505219, "rewards/margins": 0.4565107524394989, "rewards/rejected": -0.5450161695480347, "sft_loss": 0.6015486121177673, "step": 700 }, { "epoch": 0.07478637167597839, "grad_norm": 2.219165563583374, "learning_rate": 0.0003736842105263158, "logits/chosen": -3.7920498847961426, "logits/rejected": -3.7913458347320557, "logps/chosen": -0.7324831485748291, "logps/rejected": -4.996405601501465, "loss": 0.7707, "odds_ratio_loss": 2.7621920108795166, "rewards/accuracies": 0.8041666746139526, "rewards/chosen": -0.07324830442667007, "rewards/margins": 0.42639225721359253, "rewards/rejected": -0.499640554189682, "sft_loss": 0.49450069665908813, "step": 710 }, { "epoch": 0.07583970085451329, "grad_norm": 5.435701370239258, "learning_rate": 0.00037894736842105265, "logits/chosen": -4.781533718109131, "logits/rejected": -4.781356334686279, "logps/chosen": -0.6917392611503601, "logps/rejected": -4.243617057800293, "loss": 0.7244, "odds_ratio_loss": 2.5724165439605713, "rewards/accuracies": 0.8645833134651184, "rewards/chosen": -0.06917393207550049, "rewards/margins": 0.3551878333091736, "rewards/rejected": -0.4243617355823517, "sft_loss": 0.4671470522880554, "step": 720 }, { "epoch": 0.0768930300330482, "grad_norm": 4.722170352935791, "learning_rate": 0.00038421052631578946, "logits/chosen": -4.822556495666504, "logits/rejected": -4.822704792022705, "logps/chosen": -0.6918298006057739, "logps/rejected": -3.5728962421417236, "loss": 0.7267, "odds_ratio_loss": 2.5833892822265625, "rewards/accuracies": 0.8416666388511658, "rewards/chosen": -0.06918298453092575, "rewards/margins": 0.28810662031173706, "rewards/rejected": -0.3572896420955658, "sft_loss": 0.4683450758457184, "step": 730 }, { "epoch": 0.0779463592115831, "grad_norm": 2.800881862640381, "learning_rate": 0.00038947368421052633, "logits/chosen": -4.779958248138428, "logits/rejected": -4.780096530914307, "logps/chosen": -0.6186120510101318, "logps/rejected": -3.642204761505127, "loss": 0.6549, "odds_ratio_loss": 2.427229881286621, "rewards/accuracies": 0.8583333492279053, "rewards/chosen": -0.06186120584607124, "rewards/margins": 0.3023592531681061, "rewards/rejected": -0.3642204701900482, "sft_loss": 0.41214191913604736, "step": 740 }, { "epoch": 0.07899968839011802, "grad_norm": 5.068697452545166, "learning_rate": 0.00039473684210526315, "logits/chosen": -4.596142768859863, "logits/rejected": -4.595941066741943, "logps/chosen": -0.7380008697509766, "logps/rejected": -4.162142276763916, "loss": 0.7749, "odds_ratio_loss": 2.4714393615722656, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.07380008697509766, "rewards/margins": 0.34241411089897156, "rewards/rejected": -0.4162141978740692, "sft_loss": 0.527804970741272, "step": 750 }, { "epoch": 0.08005301756865292, "grad_norm": 3.4697628021240234, "learning_rate": 0.0004, "logits/chosen": -4.708858013153076, "logits/rejected": -4.708543300628662, "logps/chosen": -0.675973653793335, "logps/rejected": -4.2291083335876465, "loss": 0.7074, "odds_ratio_loss": 2.3750479221343994, "rewards/accuracies": 0.8416666388511658, "rewards/chosen": -0.06759736686944962, "rewards/margins": 0.3553134799003601, "rewards/rejected": -0.4229108393192291, "sft_loss": 0.4698618948459625, "step": 760 }, { "epoch": 0.08110634674718784, "grad_norm": 11.160131454467773, "learning_rate": 0.00040526315789473684, "logits/chosen": -5.051191329956055, "logits/rejected": -5.050747871398926, "logps/chosen": -0.7793533802032471, "logps/rejected": -5.09091854095459, "loss": 0.8153, "odds_ratio_loss": 2.829737901687622, "rewards/accuracies": 0.8291666507720947, "rewards/chosen": -0.07793533802032471, "rewards/margins": 0.43115654587745667, "rewards/rejected": -0.509091854095459, "sft_loss": 0.5323660969734192, "step": 770 }, { "epoch": 0.08215967592572274, "grad_norm": 3.8492166996002197, "learning_rate": 0.0004105263157894737, "logits/chosen": -4.681753158569336, "logits/rejected": -4.681027889251709, "logps/chosen": -0.67795729637146, "logps/rejected": -5.4289870262146, "loss": 0.7104, "odds_ratio_loss": 2.6001367568969727, "rewards/accuracies": 0.8520833253860474, "rewards/chosen": -0.06779572367668152, "rewards/margins": 0.4751029908657074, "rewards/rejected": -0.5428987145423889, "sft_loss": 0.45040473341941833, "step": 780 }, { "epoch": 0.08321300510425765, "grad_norm": 4.350924491882324, "learning_rate": 0.0004157894736842106, "logits/chosen": -5.090719699859619, "logits/rejected": -5.0898871421813965, "logps/chosen": -0.6309987902641296, "logps/rejected": -6.083089828491211, "loss": 0.6608, "odds_ratio_loss": 2.363413095474243, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.06309988349676132, "rewards/margins": 0.5452090501785278, "rewards/rejected": -0.6083090305328369, "sft_loss": 0.4244639277458191, "step": 790 }, { "epoch": 0.08426633428279255, "grad_norm": 4.629517078399658, "learning_rate": 0.00042105263157894734, "logits/chosen": -5.171376705169678, "logits/rejected": -5.170820713043213, "logps/chosen": -0.7821296453475952, "logps/rejected": -4.942056655883789, "loss": 0.8229, "odds_ratio_loss": 2.6525399684906006, "rewards/accuracies": 0.8208333253860474, "rewards/chosen": -0.07821296900510788, "rewards/margins": 0.41599270701408386, "rewards/rejected": -0.49420568346977234, "sft_loss": 0.557674765586853, "step": 800 }, { "epoch": 0.08531966346132745, "grad_norm": 5.111828327178955, "learning_rate": 0.0004263157894736842, "logits/chosen": -4.80694580078125, "logits/rejected": -4.806312561035156, "logps/chosen": -0.7565589547157288, "logps/rejected": -5.1447882652282715, "loss": 0.791, "odds_ratio_loss": 2.5595006942749023, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.07565589994192123, "rewards/margins": 0.4388229250907898, "rewards/rejected": -0.514478862285614, "sft_loss": 0.5350964665412903, "step": 810 }, { "epoch": 0.08637299263986237, "grad_norm": 4.786795616149902, "learning_rate": 0.0004315789473684211, "logits/chosen": -4.826291561126709, "logits/rejected": -4.825577259063721, "logps/chosen": -0.6888704895973206, "logps/rejected": -5.567938804626465, "loss": 0.7202, "odds_ratio_loss": 2.4763171672821045, "rewards/accuracies": 0.8541666865348816, "rewards/chosen": -0.06888704746961594, "rewards/margins": 0.487906813621521, "rewards/rejected": -0.5567939281463623, "sft_loss": 0.472540020942688, "step": 820 }, { "epoch": 0.08742632181839727, "grad_norm": 7.63191556930542, "learning_rate": 0.00043684210526315795, "logits/chosen": -4.981302738189697, "logits/rejected": -4.980616569519043, "logps/chosen": -0.7095519304275513, "logps/rejected": -5.726536273956299, "loss": 0.7455, "odds_ratio_loss": 2.7154994010925293, "rewards/accuracies": 0.8458333611488342, "rewards/chosen": -0.07095518708229065, "rewards/margins": 0.5016984343528748, "rewards/rejected": -0.5726536512374878, "sft_loss": 0.4739212989807129, "step": 830 }, { "epoch": 0.08847965099693218, "grad_norm": 3.9395651817321777, "learning_rate": 0.0004421052631578947, "logits/chosen": -5.055291652679443, "logits/rejected": -5.054480075836182, "logps/chosen": -0.6979976892471313, "logps/rejected": -5.470063209533691, "loss": 0.7345, "odds_ratio_loss": 2.5650830268859863, "rewards/accuracies": 0.8354166746139526, "rewards/chosen": -0.0697997659444809, "rewards/margins": 0.4772065579891205, "rewards/rejected": -0.547006368637085, "sft_loss": 0.477975457906723, "step": 840 }, { "epoch": 0.08953298017546708, "grad_norm": 9.380526542663574, "learning_rate": 0.0004473684210526316, "logits/chosen": -4.9736008644104, "logits/rejected": -4.972882270812988, "logps/chosen": -0.6816462874412537, "logps/rejected": -5.888026714324951, "loss": 0.715, "odds_ratio_loss": 2.5402190685272217, "rewards/accuracies": 0.8583333492279053, "rewards/chosen": -0.0681646317243576, "rewards/margins": 0.520638108253479, "rewards/rejected": -0.588802695274353, "sft_loss": 0.46102267503738403, "step": 850 }, { "epoch": 0.090586309354002, "grad_norm": 3.032940626144409, "learning_rate": 0.00045263157894736845, "logits/chosen": -4.86979866027832, "logits/rejected": -4.869546413421631, "logps/chosen": -0.8415181636810303, "logps/rejected": -4.487551689147949, "loss": 0.8781, "odds_ratio_loss": 2.631443977355957, "rewards/accuracies": 0.8229166865348816, "rewards/chosen": -0.08415181934833527, "rewards/margins": 0.3646034300327301, "rewards/rejected": -0.44875526428222656, "sft_loss": 0.6149870157241821, "step": 860 }, { "epoch": 0.0916396385325369, "grad_norm": 5.457218170166016, "learning_rate": 0.00045789473684210527, "logits/chosen": -4.5640482902526855, "logits/rejected": -4.5640788078308105, "logps/chosen": -0.7579687833786011, "logps/rejected": -3.3095312118530273, "loss": 0.7946, "odds_ratio_loss": 2.6955649852752686, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.07579687237739563, "rewards/margins": 0.2551562488079071, "rewards/rejected": -0.33095312118530273, "sft_loss": 0.5250447988510132, "step": 870 }, { "epoch": 0.0926929677110718, "grad_norm": 4.475607872009277, "learning_rate": 0.00046315789473684214, "logits/chosen": -4.721373558044434, "logits/rejected": -4.7214860916137695, "logps/chosen": -0.7569971680641174, "logps/rejected": -3.347615957260132, "loss": 0.7905, "odds_ratio_loss": 2.5438392162323, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.07569971680641174, "rewards/margins": 0.25906190276145935, "rewards/rejected": -0.3347616195678711, "sft_loss": 0.536092221736908, "step": 880 }, { "epoch": 0.09374629688960671, "grad_norm": 31.67135238647461, "learning_rate": 0.00046842105263157895, "logits/chosen": -4.7270989418029785, "logits/rejected": -4.727247714996338, "logps/chosen": -0.7946822047233582, "logps/rejected": -3.15295147895813, "loss": 0.829, "odds_ratio_loss": 2.443174123764038, "rewards/accuracies": 0.8520833253860474, "rewards/chosen": -0.07946821302175522, "rewards/margins": 0.23582692444324493, "rewards/rejected": -0.31529513001441956, "sft_loss": 0.5846543908119202, "step": 890 }, { "epoch": 0.09479962606814162, "grad_norm": 3.2339320182800293, "learning_rate": 0.00047368421052631577, "logits/chosen": -5.0870866775512695, "logits/rejected": -5.087241172790527, "logps/chosen": -0.6878632307052612, "logps/rejected": -2.8736085891723633, "loss": 0.7248, "odds_ratio_loss": 2.5279555320739746, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.06878631561994553, "rewards/margins": 0.21857453882694244, "rewards/rejected": -0.2873608469963074, "sft_loss": 0.47198787331581116, "step": 900 }, { "epoch": 0.09585295524667653, "grad_norm": 2.4642884731292725, "learning_rate": 0.00047894736842105264, "logits/chosen": -5.1205644607543945, "logits/rejected": -5.120718479156494, "logps/chosen": -0.6843523383140564, "logps/rejected": -3.210320472717285, "loss": 0.7126, "odds_ratio_loss": 2.3976242542266846, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.06843523681163788, "rewards/margins": 0.25259679555892944, "rewards/rejected": -0.3210320770740509, "sft_loss": 0.47283756732940674, "step": 910 }, { "epoch": 0.09690628442521143, "grad_norm": 5.920956611633301, "learning_rate": 0.0004842105263157895, "logits/chosen": -5.090200424194336, "logits/rejected": -5.090282917022705, "logps/chosen": -0.6722908616065979, "logps/rejected": -3.273742198944092, "loss": 0.7063, "odds_ratio_loss": 2.530402421951294, "rewards/accuracies": 0.84375, "rewards/chosen": -0.06722908467054367, "rewards/margins": 0.2601451575756073, "rewards/rejected": -0.3273741900920868, "sft_loss": 0.45327988266944885, "step": 920 }, { "epoch": 0.09795961360374635, "grad_norm": 4.960987567901611, "learning_rate": 0.0004894736842105264, "logits/chosen": -5.087591648101807, "logits/rejected": -5.087601661682129, "logps/chosen": -0.7244377136230469, "logps/rejected": -3.0884244441986084, "loss": 0.7571, "odds_ratio_loss": 2.6207199096679688, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.07244376838207245, "rewards/margins": 0.23639869689941406, "rewards/rejected": -0.3088424801826477, "sft_loss": 0.4949897825717926, "step": 930 }, { "epoch": 0.09901294278228125, "grad_norm": 5.539714336395264, "learning_rate": 0.0004947368421052632, "logits/chosen": -5.29771614074707, "logits/rejected": -5.297707557678223, "logps/chosen": -0.6988152265548706, "logps/rejected": -3.315657615661621, "loss": 0.7308, "odds_ratio_loss": 2.3126912117004395, "rewards/accuracies": 0.8645833134651184, "rewards/chosen": -0.06988153606653214, "rewards/margins": 0.26168423891067505, "rewards/rejected": -0.3315657675266266, "sft_loss": 0.4995124638080597, "step": 940 }, { "epoch": 0.10006627196081616, "grad_norm": 6.586909294128418, "learning_rate": 0.0005, "logits/chosen": -5.285855293273926, "logits/rejected": -5.2858428955078125, "logps/chosen": -0.6867055296897888, "logps/rejected": -3.43638277053833, "loss": 0.7161, "odds_ratio_loss": 2.5117852687835693, "rewards/accuracies": 0.8791666626930237, "rewards/chosen": -0.06867055594921112, "rewards/margins": 0.2749677002429962, "rewards/rejected": -0.34363824129104614, "sft_loss": 0.4649271070957184, "step": 950 }, { "epoch": 0.10111960113935106, "grad_norm": 6.756776809692383, "learning_rate": 0.0004999983096040005, "logits/chosen": -5.550118923187256, "logits/rejected": -5.5501179695129395, "logps/chosen": -0.7224279642105103, "logps/rejected": -3.376439332962036, "loss": 0.7555, "odds_ratio_loss": 2.5149753093719482, "rewards/accuracies": 0.8604166507720947, "rewards/chosen": -0.07224280387163162, "rewards/margins": 0.2654011845588684, "rewards/rejected": -0.3376440107822418, "sft_loss": 0.5040432214736938, "step": 960 }, { "epoch": 0.10217293031788596, "grad_norm": 161.19760131835938, "learning_rate": 0.0004999932384388613, "logits/chosen": -5.225105285644531, "logits/rejected": -5.225213527679443, "logps/chosen": -1.5466647148132324, "logps/rejected": -3.935602903366089, "loss": 1.5839, "odds_ratio_loss": 3.059565782546997, "rewards/accuracies": 0.8604166507720947, "rewards/chosen": -0.154666468501091, "rewards/margins": 0.2388937920331955, "rewards/rejected": -0.3935602605342865, "sft_loss": 1.277909755706787, "step": 970 }, { "epoch": 0.10322625949642088, "grad_norm": 4.612696170806885, "learning_rate": 0.000499984786573161, "logits/chosen": -5.253983020782471, "logits/rejected": -5.254087448120117, "logps/chosen": -0.6865792870521545, "logps/rejected": -3.219088077545166, "loss": 0.7214, "odds_ratio_loss": 2.43359375, "rewards/accuracies": 0.8166666626930237, "rewards/chosen": -0.06865792721509933, "rewards/margins": 0.2532508671283722, "rewards/rejected": -0.3219088315963745, "sft_loss": 0.4780765473842621, "step": 980 }, { "epoch": 0.10427958867495578, "grad_norm": 4.7582621574401855, "learning_rate": 0.0004999729541211952, "logits/chosen": -5.1987152099609375, "logits/rejected": -5.198534965515137, "logps/chosen": -0.8549334406852722, "logps/rejected": -4.291516304016113, "loss": 0.8956, "odds_ratio_loss": 2.6601970195770264, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08549333363771439, "rewards/margins": 0.3436582684516907, "rewards/rejected": -0.42915162444114685, "sft_loss": 0.6296234726905823, "step": 990 }, { "epoch": 0.10533291785349069, "grad_norm": 5.924813747406006, "learning_rate": 0.0004999577412429764, "logits/chosen": -5.115817070007324, "logits/rejected": -5.115783214569092, "logps/chosen": -0.6959132552146912, "logps/rejected": -3.6701784133911133, "loss": 0.7282, "odds_ratio_loss": 2.448162078857422, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.06959132105112076, "rewards/margins": 0.29742658138275146, "rewards/rejected": -0.36701786518096924, "sft_loss": 0.48338958621025085, "step": 1000 }, { "epoch": 0.1063862470320256, "grad_norm": 3.775995969772339, "learning_rate": 0.0004999391481442307, "logits/chosen": -5.038882732391357, "logits/rejected": -5.039083003997803, "logps/chosen": -0.6505192518234253, "logps/rejected": -2.7321646213531494, "loss": 0.686, "odds_ratio_loss": 2.5259392261505127, "rewards/accuracies": 0.8395833373069763, "rewards/chosen": -0.06505192071199417, "rewards/margins": 0.2081645280122757, "rewards/rejected": -0.2732164263725281, "sft_loss": 0.43344053626060486, "step": 1010 }, { "epoch": 0.10743957621056051, "grad_norm": 3.577996253967285, "learning_rate": 0.0004999171750763959, "logits/chosen": -4.925287246704102, "logits/rejected": -4.925505638122559, "logps/chosen": -0.6491485238075256, "logps/rejected": -2.673661947250366, "loss": 0.683, "odds_ratio_loss": 2.427180528640747, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.06491485238075256, "rewards/margins": 0.20245136320590973, "rewards/rejected": -0.2673662006855011, "sft_loss": 0.44026196002960205, "step": 1020 }, { "epoch": 0.10849290538909541, "grad_norm": 4.37986946105957, "learning_rate": 0.0004998918223366173, "logits/chosen": -5.010101318359375, "logits/rejected": -5.010295391082764, "logps/chosen": -0.7345671057701111, "logps/rejected": -2.6488146781921387, "loss": 0.7723, "odds_ratio_loss": 2.661365509033203, "rewards/accuracies": 0.8291666507720947, "rewards/chosen": -0.07345671951770782, "rewards/margins": 0.19142475724220276, "rewards/rejected": -0.2648814618587494, "sft_loss": 0.5061719417572021, "step": 1030 }, { "epoch": 0.10954623456763031, "grad_norm": 8.366073608398438, "learning_rate": 0.0004998630902677444, "logits/chosen": -5.033807277679443, "logits/rejected": -5.034041881561279, "logps/chosen": -0.7631211876869202, "logps/rejected": -2.8217647075653076, "loss": 0.8007, "odds_ratio_loss": 2.807353973388672, "rewards/accuracies": 0.84375, "rewards/chosen": -0.0763121098279953, "rewards/margins": 0.2058643400669098, "rewards/rejected": -0.2821764647960663, "sft_loss": 0.5199962854385376, "step": 1040 }, { "epoch": 0.11059956374616522, "grad_norm": 3.063091993331909, "learning_rate": 0.0004998309792583257, "logits/chosen": -5.056707859039307, "logits/rejected": -5.056928634643555, "logps/chosen": -0.7930384278297424, "logps/rejected": -3.048496723175049, "loss": 0.8372, "odds_ratio_loss": 2.895695209503174, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.07930383831262589, "rewards/margins": 0.22554583847522736, "rewards/rejected": -0.30484965443611145, "sft_loss": 0.5476340651512146, "step": 1050 }, { "epoch": 0.11165289292470013, "grad_norm": 4.035704612731934, "learning_rate": 0.0004997954897426039, "logits/chosen": -4.731100559234619, "logits/rejected": -4.731292247772217, "logps/chosen": -0.7024089694023132, "logps/rejected": -3.2537832260131836, "loss": 0.7407, "odds_ratio_loss": 2.768791675567627, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.07024088501930237, "rewards/margins": 0.25513747334480286, "rewards/rejected": -0.3253783583641052, "sft_loss": 0.46386364102363586, "step": 1060 }, { "epoch": 0.11270622210323504, "grad_norm": 5.625176906585693, "learning_rate": 0.0004997566222005095, "logits/chosen": -5.09245491027832, "logits/rejected": -5.0925984382629395, "logps/chosen": -0.7186325192451477, "logps/rejected": -3.2234888076782227, "loss": 0.7539, "odds_ratio_loss": 2.5938258171081543, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.07186325639486313, "rewards/margins": 0.2504856288433075, "rewards/rejected": -0.3223489224910736, "sft_loss": 0.4945569932460785, "step": 1070 }, { "epoch": 0.11375955128176994, "grad_norm": 8.753211975097656, "learning_rate": 0.0004997143771576551, "logits/chosen": -5.339606761932373, "logits/rejected": -5.339755058288574, "logps/chosen": -0.6703072190284729, "logps/rejected": -3.089592933654785, "loss": 0.706, "odds_ratio_loss": 2.5606048107147217, "rewards/accuracies": 0.8208333253860474, "rewards/chosen": -0.06703073531389236, "rewards/margins": 0.2419285774230957, "rewards/rejected": -0.30895933508872986, "sft_loss": 0.44997820258140564, "step": 1080 }, { "epoch": 0.11481288046030486, "grad_norm": 7.652149677276611, "learning_rate": 0.0004996687551853271, "logits/chosen": -5.191481113433838, "logits/rejected": -5.191573619842529, "logps/chosen": -0.7030736207962036, "logps/rejected": -3.310757875442505, "loss": 0.7353, "odds_ratio_loss": 2.5654568672180176, "rewards/accuracies": 0.8645833134651184, "rewards/chosen": -0.07030736654996872, "rewards/margins": 0.26076844334602356, "rewards/rejected": -0.3310757875442505, "sft_loss": 0.47873547673225403, "step": 1090 }, { "epoch": 0.11586620963883976, "grad_norm": 4.607894420623779, "learning_rate": 0.0004996197569004794, "logits/chosen": -4.798411846160889, "logits/rejected": -4.798523426055908, "logps/chosen": -0.6611460447311401, "logps/rejected": -2.983532428741455, "loss": 0.6988, "odds_ratio_loss": 2.426917314529419, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.06611461192369461, "rewards/margins": 0.23223866522312164, "rewards/rejected": -0.29835325479507446, "sft_loss": 0.4561263918876648, "step": 1100 }, { "epoch": 0.11691953881737467, "grad_norm": 6.003838539123535, "learning_rate": 0.000499567382965724, "logits/chosen": -4.689076900482178, "logits/rejected": -4.689194202423096, "logps/chosen": -0.6927405595779419, "logps/rejected": -3.072282552719116, "loss": 0.7266, "odds_ratio_loss": 2.494777202606201, "rewards/accuracies": 0.8520833253860474, "rewards/chosen": -0.06927405297756195, "rewards/margins": 0.23795419931411743, "rewards/rejected": -0.3072282373905182, "sft_loss": 0.477167546749115, "step": 1110 }, { "epoch": 0.11797286799590957, "grad_norm": 3.377394676208496, "learning_rate": 0.0004995116340893223, "logits/chosen": -4.666645526885986, "logits/rejected": -4.666871547698975, "logps/chosen": -0.6164705157279968, "logps/rejected": -3.2369017601013184, "loss": 0.6451, "odds_ratio_loss": 2.411543369293213, "rewards/accuracies": 0.8645833134651184, "rewards/chosen": -0.061647046357393265, "rewards/margins": 0.26204314827919006, "rewards/rejected": -0.3236902058124542, "sft_loss": 0.40390655398368835, "step": 1120 }, { "epoch": 0.11902619717444447, "grad_norm": 3.650819778442383, "learning_rate": 0.0004994525110251759, "logits/chosen": -5.011782169342041, "logits/rejected": -5.012125492095947, "logps/chosen": -0.8005008101463318, "logps/rejected": -3.1773478984832764, "loss": 0.8349, "odds_ratio_loss": 2.8024778366088867, "rewards/accuracies": 0.8354166746139526, "rewards/chosen": -0.08005008101463318, "rewards/margins": 0.2376846969127655, "rewards/rejected": -0.3177347779273987, "sft_loss": 0.5546395182609558, "step": 1130 }, { "epoch": 0.12007952635297939, "grad_norm": 5.192956447601318, "learning_rate": 0.0004993900145728157, "logits/chosen": -5.2029805183410645, "logits/rejected": -5.203344821929932, "logps/chosen": -0.7520886063575745, "logps/rejected": -3.3132715225219727, "loss": 0.7852, "odds_ratio_loss": 2.5903329849243164, "rewards/accuracies": 0.84375, "rewards/chosen": -0.07520885765552521, "rewards/margins": 0.2561182677745819, "rewards/rejected": -0.3313271403312683, "sft_loss": 0.5261538624763489, "step": 1140 }, { "epoch": 0.12113285553151429, "grad_norm": 7.4040985107421875, "learning_rate": 0.0004993241455773918, "logits/chosen": -5.165520191192627, "logits/rejected": -5.165550708770752, "logps/chosen": -0.7126467227935791, "logps/rejected": -3.5545897483825684, "loss": 0.748, "odds_ratio_loss": 2.594726800918579, "rewards/accuracies": 0.8354166746139526, "rewards/chosen": -0.07126467674970627, "rewards/margins": 0.28419435024261475, "rewards/rejected": -0.3554590046405792, "sft_loss": 0.48852840065956116, "step": 1150 }, { "epoch": 0.1221861847100492, "grad_norm": 4.905418872833252, "learning_rate": 0.0004992549049296619, "logits/chosen": -5.190316200256348, "logits/rejected": -5.190418720245361, "logps/chosen": -0.6702563762664795, "logps/rejected": -3.4835519790649414, "loss": 0.7052, "odds_ratio_loss": 2.497361183166504, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.06702563911676407, "rewards/margins": 0.28132954239845276, "rewards/rejected": -0.3483552038669586, "sft_loss": 0.4554961621761322, "step": 1160 }, { "epoch": 0.1232395138885841, "grad_norm": 4.1152238845825195, "learning_rate": 0.0004991822935659786, "logits/chosen": -5.477373123168945, "logits/rejected": -5.477328300476074, "logps/chosen": -0.8472169637680054, "logps/rejected": -3.917269229888916, "loss": 0.8817, "odds_ratio_loss": 3.221430540084839, "rewards/accuracies": 0.8416666388511658, "rewards/chosen": -0.08472169190645218, "rewards/margins": 0.30700525641441345, "rewards/rejected": -0.39172691106796265, "sft_loss": 0.5595788955688477, "step": 1170 }, { "epoch": 0.12429284306711902, "grad_norm": 5.008730888366699, "learning_rate": 0.0004991063124682778, "logits/chosen": -5.323733329772949, "logits/rejected": -5.323288917541504, "logps/chosen": -0.757644534111023, "logps/rejected": -5.853792190551758, "loss": 0.7925, "odds_ratio_loss": 2.670191526412964, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.07576445490121841, "rewards/margins": 0.5096147656440735, "rewards/rejected": -0.5853793025016785, "sft_loss": 0.5255211591720581, "step": 1180 }, { "epoch": 0.12534617224565392, "grad_norm": 31.133501052856445, "learning_rate": 0.0004990269626640645, "logits/chosen": -5.64047384262085, "logits/rejected": -5.64005184173584, "logps/chosen": -0.768204391002655, "logps/rejected": -5.164267063140869, "loss": 0.8058, "odds_ratio_loss": 2.815326452255249, "rewards/accuracies": 0.8354166746139526, "rewards/chosen": -0.07682044059038162, "rewards/margins": 0.43960627913475037, "rewards/rejected": -0.516426682472229, "sft_loss": 0.5242764949798584, "step": 1190 }, { "epoch": 0.12639950142418882, "grad_norm": 6.6835222244262695, "learning_rate": 0.0004989442452263996, "logits/chosen": -5.151331424713135, "logits/rejected": -5.1510162353515625, "logps/chosen": -0.8430954217910767, "logps/rejected": -4.105188369750977, "loss": 0.8846, "odds_ratio_loss": 3.0806498527526855, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.08430954068899155, "rewards/margins": 0.32620927691459656, "rewards/rejected": -0.4105188250541687, "sft_loss": 0.5764933228492737, "step": 1200 }, { "epoch": 0.12745283060272372, "grad_norm": 6.060980796813965, "learning_rate": 0.0004988581612738847, "logits/chosen": -5.232106685638428, "logits/rejected": -5.2317962646484375, "logps/chosen": -0.7580560445785522, "logps/rejected": -3.9481935501098633, "loss": 0.7955, "odds_ratio_loss": 2.8234329223632812, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.07580561190843582, "rewards/margins": 0.319013774394989, "rewards/rejected": -0.39481934905052185, "sft_loss": 0.5131634473800659, "step": 1210 }, { "epoch": 0.12850615978125865, "grad_norm": 4.154183864593506, "learning_rate": 0.0004987687119706477, "logits/chosen": -5.385165214538574, "logits/rejected": -5.385090351104736, "logps/chosen": -0.7838355302810669, "logps/rejected": -3.81575608253479, "loss": 0.8198, "odds_ratio_loss": 2.7132718563079834, "rewards/accuracies": 0.8479166626930237, "rewards/chosen": -0.07838355004787445, "rewards/margins": 0.30319201946258545, "rewards/rejected": -0.3815755844116211, "sft_loss": 0.5484524965286255, "step": 1220 }, { "epoch": 0.12955948895979355, "grad_norm": 4.047881603240967, "learning_rate": 0.0004986758985263265, "logits/chosen": -5.267871379852295, "logits/rejected": -5.267872333526611, "logps/chosen": -0.7567169666290283, "logps/rejected": -2.9766650199890137, "loss": 0.7939, "odds_ratio_loss": 2.620311975479126, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.07567168772220612, "rewards/margins": 0.2219947874546051, "rewards/rejected": -0.2976664900779724, "sft_loss": 0.5318555235862732, "step": 1230 }, { "epoch": 0.13061281813832845, "grad_norm": 2.7552692890167236, "learning_rate": 0.0004985797221960529, "logits/chosen": -5.264489650726318, "logits/rejected": -5.264598369598389, "logps/chosen": -0.7095692753791809, "logps/rejected": -3.2434911727905273, "loss": 0.7445, "odds_ratio_loss": 2.7399401664733887, "rewards/accuracies": 0.8458333611488342, "rewards/chosen": -0.07095693051815033, "rewards/margins": 0.25339218974113464, "rewards/rejected": -0.3243491053581238, "sft_loss": 0.47051993012428284, "step": 1240 }, { "epoch": 0.13166614731686335, "grad_norm": 3.3210740089416504, "learning_rate": 0.0004984801842804357, "logits/chosen": -5.190452575683594, "logits/rejected": -5.190454959869385, "logps/chosen": -0.6608388423919678, "logps/rejected": -3.0926926136016846, "loss": 0.6955, "odds_ratio_loss": 2.5877368450164795, "rewards/accuracies": 0.8541666865348816, "rewards/chosen": -0.0660838857293129, "rewards/margins": 0.2431853860616684, "rewards/rejected": -0.3092692792415619, "sft_loss": 0.43669426441192627, "step": 1250 }, { "epoch": 0.13271947649539828, "grad_norm": 2.113543748855591, "learning_rate": 0.0004983772861255426, "logits/chosen": -5.355245113372803, "logits/rejected": -5.3552327156066895, "logps/chosen": -0.7177728414535522, "logps/rejected": -3.0373685359954834, "loss": 0.7558, "odds_ratio_loss": 2.9420340061187744, "rewards/accuracies": 0.8020833134651184, "rewards/chosen": -0.07177729159593582, "rewards/margins": 0.23195955157279968, "rewards/rejected": -0.3037368357181549, "sft_loss": 0.4615623354911804, "step": 1260 }, { "epoch": 0.13377280567393318, "grad_norm": 2.6125612258911133, "learning_rate": 0.0004982710291228828, "logits/chosen": -5.5682549476623535, "logits/rejected": -5.568284034729004, "logps/chosen": -0.7032897472381592, "logps/rejected": -3.1011478900909424, "loss": 0.7454, "odds_ratio_loss": 2.6539957523345947, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.0703289657831192, "rewards/margins": 0.23978586494922638, "rewards/rejected": -0.3101148307323456, "sft_loss": 0.4799610376358032, "step": 1270 }, { "epoch": 0.13482613485246808, "grad_norm": 4.396284580230713, "learning_rate": 0.0004981614147093875, "logits/chosen": -5.787298202514648, "logits/rejected": -5.787331581115723, "logps/chosen": -0.7411842942237854, "logps/rejected": -3.3715906143188477, "loss": 0.7797, "odds_ratio_loss": 2.705798625946045, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.07411842048168182, "rewards/margins": 0.2630406320095062, "rewards/rejected": -0.33715906739234924, "sft_loss": 0.5091153979301453, "step": 1280 }, { "epoch": 0.13587946403100298, "grad_norm": 5.317102909088135, "learning_rate": 0.000498048444367391, "logits/chosen": -5.471971035003662, "logits/rejected": -5.472008228302002, "logps/chosen": -0.7457516193389893, "logps/rejected": -3.6155009269714355, "loss": 0.7782, "odds_ratio_loss": 2.801055908203125, "rewards/accuracies": 0.8583333492279053, "rewards/chosen": -0.07457517087459564, "rewards/margins": 0.2869749367237091, "rewards/rejected": -0.36155006289482117, "sft_loss": 0.49809518456459045, "step": 1290 }, { "epoch": 0.13693279320953788, "grad_norm": 4.3287506103515625, "learning_rate": 0.00049793211962461, "logits/chosen": -5.262281894683838, "logits/rejected": -5.262419700622559, "logps/chosen": -0.7028716802597046, "logps/rejected": -2.8387629985809326, "loss": 0.7409, "odds_ratio_loss": 2.748593807220459, "rewards/accuracies": 0.8208333253860474, "rewards/chosen": -0.07028716057538986, "rewards/margins": 0.2135891616344452, "rewards/rejected": -0.28387632966041565, "sft_loss": 0.4660036265850067, "step": 1300 }, { "epoch": 0.1379861223880728, "grad_norm": 1.7063987255096436, "learning_rate": 0.0004978124420541238, "logits/chosen": -5.180874347686768, "logits/rejected": -5.181042671203613, "logps/chosen": -0.7432348132133484, "logps/rejected": -2.8490519523620605, "loss": 0.7859, "odds_ratio_loss": 2.679919958114624, "rewards/accuracies": 0.8229166865348816, "rewards/chosen": -0.07432348281145096, "rewards/margins": 0.2105817198753357, "rewards/rejected": -0.28490516543388367, "sft_loss": 0.5179334282875061, "step": 1310 }, { "epoch": 0.1390394515666077, "grad_norm": 2.908730983734131, "learning_rate": 0.0004976894132743521, "logits/chosen": -5.495538234710693, "logits/rejected": -5.49543571472168, "logps/chosen": -0.696144700050354, "logps/rejected": -3.018319845199585, "loss": 0.7357, "odds_ratio_loss": 3.030001163482666, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.0696144625544548, "rewards/margins": 0.23221756517887115, "rewards/rejected": -0.30183205008506775, "sft_loss": 0.4326884150505066, "step": 1320 }, { "epoch": 0.14009278074514261, "grad_norm": 3.4632487297058105, "learning_rate": 0.0004975630349490338, "logits/chosen": -5.525754928588867, "logits/rejected": -5.52562141418457, "logps/chosen": -0.7210444808006287, "logps/rejected": -3.5407607555389404, "loss": 0.7551, "odds_ratio_loss": 2.435321092605591, "rewards/accuracies": 0.8229166865348816, "rewards/chosen": -0.07210444658994675, "rewards/margins": 0.28197160363197327, "rewards/rejected": -0.3540760576725006, "sft_loss": 0.5115490555763245, "step": 1330 }, { "epoch": 0.14114610992367752, "grad_norm": 6.958703517913818, "learning_rate": 0.0004974333087872041, "logits/chosen": -5.362403869628906, "logits/rejected": -5.362163066864014, "logps/chosen": -0.732792854309082, "logps/rejected": -4.104527950286865, "loss": 0.7646, "odds_ratio_loss": 2.719597339630127, "rewards/accuracies": 0.8479166626930237, "rewards/chosen": -0.07327928394079208, "rewards/margins": 0.3371734619140625, "rewards/rejected": -0.4104527533054352, "sft_loss": 0.4926711320877075, "step": 1340 }, { "epoch": 0.14219943910221244, "grad_norm": 2.859614849090576, "learning_rate": 0.0004973002365431719, "logits/chosen": -5.708899021148682, "logits/rejected": -5.708664894104004, "logps/chosen": -0.6528833508491516, "logps/rejected": -4.244791030883789, "loss": 0.6818, "odds_ratio_loss": 2.3199055194854736, "rewards/accuracies": 0.8770833611488342, "rewards/chosen": -0.06528832763433456, "rewards/margins": 0.35919085144996643, "rewards/rejected": -0.4244791567325592, "sft_loss": 0.44977909326553345, "step": 1350 }, { "epoch": 0.14325276828074734, "grad_norm": 4.355047702789307, "learning_rate": 0.0004971638200164954, "logits/chosen": -6.141923427581787, "logits/rejected": -6.141890525817871, "logps/chosen": -0.7438533902168274, "logps/rejected": -3.756343126296997, "loss": 0.7845, "odds_ratio_loss": 2.7992961406707764, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.07438533753156662, "rewards/margins": 0.3012489676475525, "rewards/rejected": -0.3756342828273773, "sft_loss": 0.5046018362045288, "step": 1360 }, { "epoch": 0.14430609745928225, "grad_norm": 2.1691360473632812, "learning_rate": 0.0004970240610519582, "logits/chosen": -5.513346195220947, "logits/rejected": -5.513165473937988, "logps/chosen": -0.7122442722320557, "logps/rejected": -4.005341529846191, "loss": 0.7452, "odds_ratio_loss": 2.3780596256256104, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.07122442126274109, "rewards/margins": 0.32930976152420044, "rewards/rejected": -0.40053418278694153, "sft_loss": 0.5073560476303101, "step": 1370 }, { "epoch": 0.14535942663781715, "grad_norm": 8.108336448669434, "learning_rate": 0.0004968809615395443, "logits/chosen": -5.834110736846924, "logits/rejected": -5.834160327911377, "logps/chosen": -0.7287462949752808, "logps/rejected": -3.322920560836792, "loss": 0.7683, "odds_ratio_loss": 2.7808871269226074, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.07287462800741196, "rewards/margins": 0.25941744446754456, "rewards/rejected": -0.3322920799255371, "sft_loss": 0.4901922941207886, "step": 1380 }, { "epoch": 0.14641275581635205, "grad_norm": 4.8621392250061035, "learning_rate": 0.0004967345234144125, "logits/chosen": -5.492813587188721, "logits/rejected": -5.492822647094727, "logps/chosen": -0.6987211108207703, "logps/rejected": -3.542156934738159, "loss": 0.7353, "odds_ratio_loss": 2.4634013175964355, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.06987211108207703, "rewards/margins": 0.2843436002731323, "rewards/rejected": -0.35421568155288696, "sft_loss": 0.48899564146995544, "step": 1390 }, { "epoch": 0.14746608499488698, "grad_norm": 4.177161693572998, "learning_rate": 0.00049658474865687, "logits/chosen": -5.457418441772461, "logits/rejected": -5.457381248474121, "logps/chosen": -0.6720997095108032, "logps/rejected": -3.794857978820801, "loss": 0.705, "odds_ratio_loss": 2.515441417694092, "rewards/accuracies": 0.8520833253860474, "rewards/chosen": -0.06720996648073196, "rewards/margins": 0.31227582693099976, "rewards/rejected": -0.3794857859611511, "sft_loss": 0.4534277021884918, "step": 1400 }, { "epoch": 0.14851941417342188, "grad_norm": 6.939654350280762, "learning_rate": 0.000496431639292346, "logits/chosen": -5.591572284698486, "logits/rejected": -5.591520309448242, "logps/chosen": -0.6898292899131775, "logps/rejected": -3.8443169593811035, "loss": 0.7205, "odds_ratio_loss": 2.573276996612549, "rewards/accuracies": 0.8583333492279053, "rewards/chosen": -0.06898292899131775, "rewards/margins": 0.3154487609863281, "rewards/rejected": -0.3844316899776459, "sft_loss": 0.46317487955093384, "step": 1410 }, { "epoch": 0.14957274335195678, "grad_norm": 3.3419246673583984, "learning_rate": 0.0004962751973913644, "logits/chosen": -5.660191059112549, "logits/rejected": -5.6601338386535645, "logps/chosen": -0.7214117050170898, "logps/rejected": -3.800171375274658, "loss": 0.7539, "odds_ratio_loss": 2.2002346515655518, "rewards/accuracies": 0.8354166746139526, "rewards/chosen": -0.07214117050170898, "rewards/margins": 0.30787599086761475, "rewards/rejected": -0.38001713156700134, "sft_loss": 0.5338917374610901, "step": 1420 }, { "epoch": 0.15062607253049168, "grad_norm": 6.627447128295898, "learning_rate": 0.0004961154250695152, "logits/chosen": -5.646604537963867, "logits/rejected": -5.646506309509277, "logps/chosen": -0.6789618730545044, "logps/rejected": -3.7556862831115723, "loss": 0.7131, "odds_ratio_loss": 2.4358396530151367, "rewards/accuracies": 0.8395833373069763, "rewards/chosen": -0.06789619475603104, "rewards/margins": 0.3076724112033844, "rewards/rejected": -0.3755686581134796, "sft_loss": 0.46949687600135803, "step": 1430 }, { "epoch": 0.15167940170902658, "grad_norm": 7.264041900634766, "learning_rate": 0.0004959523244874262, "logits/chosen": -5.661590576171875, "logits/rejected": -5.661508560180664, "logps/chosen": -0.6989915370941162, "logps/rejected": -3.6030194759368896, "loss": 0.7324, "odds_ratio_loss": 2.595428943634033, "rewards/accuracies": 0.8458333611488342, "rewards/chosen": -0.06989916414022446, "rewards/margins": 0.2904028296470642, "rewards/rejected": -0.3603019714355469, "sft_loss": 0.4728315472602844, "step": 1440 }, { "epoch": 0.1527327308875615, "grad_norm": 3.4570446014404297, "learning_rate": 0.0004957858978507342, "logits/chosen": -5.628535270690918, "logits/rejected": -5.628504276275635, "logps/chosen": -0.6590794324874878, "logps/rejected": -3.3306422233581543, "loss": 0.6931, "odds_ratio_loss": 2.6064822673797607, "rewards/accuracies": 0.8416666388511658, "rewards/chosen": -0.06590793281793594, "rewards/margins": 0.2671562433242798, "rewards/rejected": -0.3330641984939575, "sft_loss": 0.43240654468536377, "step": 1450 }, { "epoch": 0.1537860600660964, "grad_norm": 7.473481178283691, "learning_rate": 0.0004956161474100544, "logits/chosen": -5.7138261795043945, "logits/rejected": -5.713827133178711, "logps/chosen": -0.6599766612052917, "logps/rejected": -3.381110668182373, "loss": 0.6939, "odds_ratio_loss": 2.4460840225219727, "rewards/accuracies": 0.84375, "rewards/chosen": -0.06599767506122589, "rewards/margins": 0.2721133828163147, "rewards/rejected": -0.3381110727787018, "sft_loss": 0.44932422041893005, "step": 1460 }, { "epoch": 0.1548393892446313, "grad_norm": 6.867354869842529, "learning_rate": 0.0004954430754609506, "logits/chosen": -5.79508638381958, "logits/rejected": -5.795089244842529, "logps/chosen": -0.6903258562088013, "logps/rejected": -3.085076332092285, "loss": 0.7338, "odds_ratio_loss": 2.651606798171997, "rewards/accuracies": 0.8041666746139526, "rewards/chosen": -0.06903257966041565, "rewards/margins": 0.2394750565290451, "rewards/rejected": -0.30850762128829956, "sft_loss": 0.4686751961708069, "step": 1470 }, { "epoch": 0.1558927184231662, "grad_norm": 3.0605275630950928, "learning_rate": 0.0004952666843439038, "logits/chosen": -5.6379008293151855, "logits/rejected": -5.6378703117370605, "logps/chosen": -0.6344018578529358, "logps/rejected": -3.5333213806152344, "loss": 0.6667, "odds_ratio_loss": 2.371415853500366, "rewards/accuracies": 0.8354166746139526, "rewards/chosen": -0.06344018131494522, "rewards/margins": 0.2898919880390167, "rewards/rejected": -0.35333216190338135, "sft_loss": 0.42956602573394775, "step": 1480 }, { "epoch": 0.15694604760170114, "grad_norm": 3.5998635292053223, "learning_rate": 0.0004950869764442807, "logits/chosen": -5.513609886169434, "logits/rejected": -5.513594627380371, "logps/chosen": -0.6546897888183594, "logps/rejected": -3.4388887882232666, "loss": 0.6841, "odds_ratio_loss": 2.4476258754730225, "rewards/accuracies": 0.8854166865348816, "rewards/chosen": -0.06546898186206818, "rewards/margins": 0.2784199118614197, "rewards/rejected": -0.34388887882232666, "sft_loss": 0.43934836983680725, "step": 1490 }, { "epoch": 0.15799937678023604, "grad_norm": 3.7529213428497314, "learning_rate": 0.0004949039541923015, "logits/chosen": -5.581011772155762, "logits/rejected": -5.580976486206055, "logps/chosen": -0.6702675223350525, "logps/rejected": -3.7115352153778076, "loss": 0.7055, "odds_ratio_loss": 2.6884007453918457, "rewards/accuracies": 0.8416666388511658, "rewards/chosen": -0.06702675670385361, "rewards/margins": 0.3041267395019531, "rewards/rejected": -0.37115350365638733, "sft_loss": 0.43668031692504883, "step": 1500 }, { "epoch": 0.15905270595877094, "grad_norm": 6.72556734085083, "learning_rate": 0.0004947176200630068, "logits/chosen": -5.502162456512451, "logits/rejected": -5.502138137817383, "logps/chosen": -0.6218239068984985, "logps/rejected": -3.428339719772339, "loss": 0.6499, "odds_ratio_loss": 2.3972864151000977, "rewards/accuracies": 0.8708333373069763, "rewards/chosen": -0.06218238174915314, "rewards/margins": 0.28065159916877747, "rewards/rejected": -0.3428339660167694, "sft_loss": 0.4101923108100891, "step": 1510 }, { "epoch": 0.16010603513730584, "grad_norm": 3.806243658065796, "learning_rate": 0.0004945279765762243, "logits/chosen": -5.590113639831543, "logits/rejected": -5.590085029602051, "logps/chosen": -0.6871938109397888, "logps/rejected": -3.6291747093200684, "loss": 0.7198, "odds_ratio_loss": 2.472882032394409, "rewards/accuracies": 0.8458333611488342, "rewards/chosen": -0.06871937960386276, "rewards/margins": 0.2941981554031372, "rewards/rejected": -0.3629175126552582, "sft_loss": 0.4725038409233093, "step": 1520 }, { "epoch": 0.16115936431584074, "grad_norm": 5.523288249969482, "learning_rate": 0.0004943350262965349, "logits/chosen": -5.691066265106201, "logits/rejected": -5.6911163330078125, "logps/chosen": -0.6510820984840393, "logps/rejected": -3.059138774871826, "loss": 0.685, "odds_ratio_loss": 2.56689715385437, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.06510820984840393, "rewards/margins": 0.24080567061901093, "rewards/rejected": -0.30591386556625366, "sft_loss": 0.42830324172973633, "step": 1530 }, { "epoch": 0.16221269349437567, "grad_norm": 4.537405967712402, "learning_rate": 0.0004941387718332374, "logits/chosen": -5.746434688568115, "logits/rejected": -5.746466159820557, "logps/chosen": -0.6964403390884399, "logps/rejected": -3.4062578678131104, "loss": 0.7309, "odds_ratio_loss": 2.425229787826538, "rewards/accuracies": 0.8416666388511658, "rewards/chosen": -0.0696440264582634, "rewards/margins": 0.2709817886352539, "rewards/rejected": -0.3406257629394531, "sft_loss": 0.4884008467197418, "step": 1540 }, { "epoch": 0.16326602267291057, "grad_norm": 1.987900972366333, "learning_rate": 0.000493939215840314, "logits/chosen": -5.694365978240967, "logits/rejected": -5.694273471832275, "logps/chosen": -0.6464301347732544, "logps/rejected": -3.67587947845459, "loss": 0.6811, "odds_ratio_loss": 2.441976308822632, "rewards/accuracies": 0.84375, "rewards/chosen": -0.0646430179476738, "rewards/margins": 0.3029448688030243, "rewards/rejected": -0.36758795380592346, "sft_loss": 0.43692201375961304, "step": 1550 }, { "epoch": 0.16431935185144547, "grad_norm": 4.507101058959961, "learning_rate": 0.000493736361016394, "logits/chosen": -5.841778755187988, "logits/rejected": -5.841654300689697, "logps/chosen": -0.6818705797195435, "logps/rejected": -3.587505578994751, "loss": 0.7136, "odds_ratio_loss": 2.5504865646362305, "rewards/accuracies": 0.8291666507720947, "rewards/chosen": -0.06818706542253494, "rewards/margins": 0.29056352376937866, "rewards/rejected": -0.3587505519390106, "sft_loss": 0.45857658982276917, "step": 1560 }, { "epoch": 0.16537268102998037, "grad_norm": 3.6050593852996826, "learning_rate": 0.0004935302101047171, "logits/chosen": -5.996950149536133, "logits/rejected": -5.996947288513184, "logps/chosen": -0.6442388296127319, "logps/rejected": -3.464613437652588, "loss": 0.673, "odds_ratio_loss": 2.08451247215271, "rewards/accuracies": 0.8520833253860474, "rewards/chosen": -0.06442389637231827, "rewards/margins": 0.2820374071598053, "rewards/rejected": -0.34646129608154297, "sft_loss": 0.4645636975765228, "step": 1570 }, { "epoch": 0.1664260102085153, "grad_norm": 6.22938871383667, "learning_rate": 0.0004933207658930968, "logits/chosen": -6.110846996307373, "logits/rejected": -6.1108527183532715, "logps/chosen": -0.5905119776725769, "logps/rejected": -3.7033638954162598, "loss": 0.6196, "odds_ratio_loss": 2.2230594158172607, "rewards/accuracies": 0.8708333373069763, "rewards/chosen": -0.05905119329690933, "rewards/margins": 0.3112851679325104, "rewards/rejected": -0.3703364133834839, "sft_loss": 0.3972512185573578, "step": 1580 }, { "epoch": 0.1674793393870502, "grad_norm": 5.692537307739258, "learning_rate": 0.0004931080312138824, "logits/chosen": -5.9748077392578125, "logits/rejected": -5.974870681762695, "logps/chosen": -0.6414510607719421, "logps/rejected": -3.1401894092559814, "loss": 0.6755, "odds_ratio_loss": 2.408728837966919, "rewards/accuracies": 0.8416666388511658, "rewards/chosen": -0.06414511054754257, "rewards/margins": 0.24987384676933289, "rewards/rejected": -0.31401893496513367, "sft_loss": 0.4346589744091034, "step": 1590 }, { "epoch": 0.1685326685655851, "grad_norm": 4.23068380355835, "learning_rate": 0.0004928920089439206, "logits/chosen": -5.843720436096191, "logits/rejected": -5.84379768371582, "logps/chosen": -0.7081334590911865, "logps/rejected": -3.097430467605591, "loss": 0.7425, "odds_ratio_loss": 2.416752338409424, "rewards/accuracies": 0.8354166746139526, "rewards/chosen": -0.07081333547830582, "rewards/margins": 0.23892968893051147, "rewards/rejected": -0.3097430169582367, "sft_loss": 0.5008493661880493, "step": 1600 }, { "epoch": 0.16958599774412, "grad_norm": 7.679098606109619, "learning_rate": 0.000492672702004517, "logits/chosen": -5.8228349685668945, "logits/rejected": -5.822881698608398, "logps/chosen": -0.6390455365180969, "logps/rejected": -2.928208112716675, "loss": 0.6714, "odds_ratio_loss": 2.5421833992004395, "rewards/accuracies": 0.8479166626930237, "rewards/chosen": -0.0639045462012291, "rewards/margins": 0.2289162576198578, "rewards/rejected": -0.29282084107398987, "sft_loss": 0.4171499013900757, "step": 1610 }, { "epoch": 0.1706393269226549, "grad_norm": 2.961296558380127, "learning_rate": 0.000492450113361396, "logits/chosen": -5.76025915145874, "logits/rejected": -5.760366916656494, "logps/chosen": -0.707727313041687, "logps/rejected": -2.787848472595215, "loss": 0.7488, "odds_ratio_loss": 2.6859564781188965, "rewards/accuracies": 0.8083333373069763, "rewards/chosen": -0.07077272981405258, "rewards/margins": 0.20801211893558502, "rewards/rejected": -0.278784841299057, "sft_loss": 0.48024895787239075, "step": 1620 }, { "epoch": 0.17169265610118983, "grad_norm": 5.1038432121276855, "learning_rate": 0.0004922242460246613, "logits/chosen": -5.724485397338867, "logits/rejected": -5.724647521972656, "logps/chosen": -0.6975029110908508, "logps/rejected": -2.6335670948028564, "loss": 0.7302, "odds_ratio_loss": 2.552466869354248, "rewards/accuracies": 0.8729166388511658, "rewards/chosen": -0.06975029408931732, "rewards/margins": 0.1936063915491104, "rewards/rejected": -0.26335668563842773, "sft_loss": 0.4749198257923126, "step": 1630 }, { "epoch": 0.17274598527972473, "grad_norm": 4.588533401489258, "learning_rate": 0.0004919951030487549, "logits/chosen": -5.752465724945068, "logits/rejected": -5.75269079208374, "logps/chosen": -0.7377220392227173, "logps/rejected": -2.5051889419555664, "loss": 0.7742, "odds_ratio_loss": 2.6720080375671387, "rewards/accuracies": 0.8354166746139526, "rewards/chosen": -0.07377220690250397, "rewards/margins": 0.1767466962337494, "rewards/rejected": -0.25051891803741455, "sft_loss": 0.5070357918739319, "step": 1640 }, { "epoch": 0.17379931445825963, "grad_norm": 4.702722072601318, "learning_rate": 0.0004917626875324156, "logits/chosen": -5.999307155609131, "logits/rejected": -5.999597072601318, "logps/chosen": -0.6588828563690186, "logps/rejected": -3.105346202850342, "loss": 0.6907, "odds_ratio_loss": 2.3203392028808594, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.06588829308748245, "rewards/margins": 0.244646355509758, "rewards/rejected": -0.31053462624549866, "sft_loss": 0.458629310131073, "step": 1650 }, { "epoch": 0.17485264363679454, "grad_norm": 4.34697961807251, "learning_rate": 0.0004915270026186377, "logits/chosen": -6.0448760986328125, "logits/rejected": -6.045097827911377, "logps/chosen": -0.6159750819206238, "logps/rejected": -3.314389705657959, "loss": 0.6451, "odds_ratio_loss": 2.2701919078826904, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.06159750744700432, "rewards/margins": 0.2698414921760559, "rewards/rejected": -0.33143898844718933, "sft_loss": 0.4180779755115509, "step": 1660 }, { "epoch": 0.17590597281532946, "grad_norm": 6.3104119300842285, "learning_rate": 0.0004912880514946277, "logits/chosen": -6.198366165161133, "logits/rejected": -6.198540210723877, "logps/chosen": -0.6512912511825562, "logps/rejected": -3.2115368843078613, "loss": 0.6825, "odds_ratio_loss": 2.350752353668213, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0651291236281395, "rewards/margins": 0.256024569272995, "rewards/rejected": -0.3211536705493927, "sft_loss": 0.4474564790725708, "step": 1670 }, { "epoch": 0.17695930199386437, "grad_norm": 4.557435989379883, "learning_rate": 0.0004910458373917618, "logits/chosen": -5.757941722869873, "logits/rejected": -5.75801944732666, "logps/chosen": -0.7286573648452759, "logps/rejected": -3.038480043411255, "loss": 0.7631, "odds_ratio_loss": 2.563901662826538, "rewards/accuracies": 0.8229166865348816, "rewards/chosen": -0.07286573201417923, "rewards/margins": 0.23098230361938477, "rewards/rejected": -0.303847998380661, "sft_loss": 0.5067596435546875, "step": 1680 }, { "epoch": 0.17801263117239927, "grad_norm": 4.517858028411865, "learning_rate": 0.0004908003635855421, "logits/chosen": -5.7685866355896, "logits/rejected": -5.768723011016846, "logps/chosen": -0.6709701418876648, "logps/rejected": -3.045161724090576, "loss": 0.7057, "odds_ratio_loss": 2.353038787841797, "rewards/accuracies": 0.8270833492279053, "rewards/chosen": -0.0670970231294632, "rewards/margins": 0.23741915822029114, "rewards/rejected": -0.30451616644859314, "sft_loss": 0.4703839421272278, "step": 1690 }, { "epoch": 0.17906596035093417, "grad_norm": 3.8505797386169434, "learning_rate": 0.0004905516333955521, "logits/chosen": -5.820653915405273, "logits/rejected": -5.820913791656494, "logps/chosen": -0.6014044880867004, "logps/rejected": -2.9712061882019043, "loss": 0.6348, "odds_ratio_loss": 2.333472728729248, "rewards/accuracies": 0.84375, "rewards/chosen": -0.060140449553728104, "rewards/margins": 0.23698018491268158, "rewards/rejected": -0.2971206307411194, "sft_loss": 0.40144431591033936, "step": 1700 }, { "epoch": 0.18011928952946907, "grad_norm": 5.725916385650635, "learning_rate": 0.0004902996501854119, "logits/chosen": -6.354620933532715, "logits/rejected": -6.355036735534668, "logps/chosen": -1.4475494623184204, "logps/rejected": -3.9082717895507812, "loss": 1.4845, "odds_ratio_loss": 4.034452438354492, "rewards/accuracies": 0.8291666507720947, "rewards/chosen": -0.14475493133068085, "rewards/margins": 0.24607227742671967, "rewards/rejected": -0.3908271789550781, "sft_loss": 1.0810879468917847, "step": 1710 }, { "epoch": 0.181172618708004, "grad_norm": 8.74376392364502, "learning_rate": 0.0004900444173627328, "logits/chosen": -6.625903129577637, "logits/rejected": -6.62622594833374, "logps/chosen": -0.7164724469184875, "logps/rejected": -3.1040403842926025, "loss": 0.7538, "odds_ratio_loss": 2.710744619369507, "rewards/accuracies": 0.8208333253860474, "rewards/chosen": -0.07164724916219711, "rewards/margins": 0.23875676095485687, "rewards/rejected": -0.3104040026664734, "sft_loss": 0.4827170670032501, "step": 1720 }, { "epoch": 0.1822259478865389, "grad_norm": 2.9171664714813232, "learning_rate": 0.0004897859383790711, "logits/chosen": -6.8558220863342285, "logits/rejected": -6.856238842010498, "logps/chosen": -0.6772664785385132, "logps/rejected": -3.0685346126556396, "loss": 0.7145, "odds_ratio_loss": 2.5943028926849365, "rewards/accuracies": 0.8145833611488342, "rewards/chosen": -0.06772664934396744, "rewards/margins": 0.23912683129310608, "rewards/rejected": -0.3068534731864929, "sft_loss": 0.4550252854824066, "step": 1730 }, { "epoch": 0.1832792770650738, "grad_norm": 4.223478317260742, "learning_rate": 0.0004895242167298816, "logits/chosen": -6.91244649887085, "logits/rejected": -6.912972927093506, "logps/chosen": -0.6928088665008545, "logps/rejected": -3.333386182785034, "loss": 0.724, "odds_ratio_loss": 2.613675117492676, "rewards/accuracies": 0.8291666507720947, "rewards/chosen": -0.06928088515996933, "rewards/margins": 0.2640577256679535, "rewards/rejected": -0.3333386480808258, "sft_loss": 0.4625937044620514, "step": 1740 }, { "epoch": 0.1843326062436087, "grad_norm": 6.387345790863037, "learning_rate": 0.0004892592559544702, "logits/chosen": -6.475886821746826, "logits/rejected": -6.476265907287598, "logps/chosen": -0.689757764339447, "logps/rejected": -2.9596078395843506, "loss": 0.7284, "odds_ratio_loss": 2.5536398887634277, "rewards/accuracies": 0.8083333373069763, "rewards/chosen": -0.0689757764339447, "rewards/margins": 0.22698503732681274, "rewards/rejected": -0.29596078395843506, "sft_loss": 0.47301238775253296, "step": 1750 }, { "epoch": 0.1853859354221436, "grad_norm": 4.752090930938721, "learning_rate": 0.0004889910596359457, "logits/chosen": -6.3866801261901855, "logits/rejected": -6.3870625495910645, "logps/chosen": -0.6425169110298157, "logps/rejected": -3.2402262687683105, "loss": 0.6768, "odds_ratio_loss": 2.416498899459839, "rewards/accuracies": 0.8583333492279053, "rewards/chosen": -0.06425168365240097, "rewards/margins": 0.259770929813385, "rewards/rejected": -0.3240226209163666, "sft_loss": 0.43517401814460754, "step": 1760 }, { "epoch": 0.18643926460067853, "grad_norm": 4.994302272796631, "learning_rate": 0.0004887196314011722, "logits/chosen": -6.208808422088623, "logits/rejected": -6.209136009216309, "logps/chosen": -0.6876904964447021, "logps/rejected": -3.412658929824829, "loss": 0.7195, "odds_ratio_loss": 2.441316843032837, "rewards/accuracies": 0.8479166626930237, "rewards/chosen": -0.06876904517412186, "rewards/margins": 0.2724968492984772, "rewards/rejected": -0.34126585721969604, "sft_loss": 0.4754055142402649, "step": 1770 }, { "epoch": 0.18749259377921343, "grad_norm": 6.439792156219482, "learning_rate": 0.0004884449749207192, "logits/chosen": -6.457438945770264, "logits/rejected": -6.457731246948242, "logps/chosen": -0.6336179375648499, "logps/rejected": -2.840827703475952, "loss": 0.6657, "odds_ratio_loss": 2.448340892791748, "rewards/accuracies": 0.8708333373069763, "rewards/chosen": -0.06336179375648499, "rewards/margins": 0.22072099149227142, "rewards/rejected": -0.2840828001499176, "sft_loss": 0.420904278755188, "step": 1780 }, { "epoch": 0.18854592295774833, "grad_norm": 4.342998027801514, "learning_rate": 0.00048816709390881266, "logits/chosen": -6.21989631652832, "logits/rejected": -6.220187664031982, "logps/chosen": -0.6857010722160339, "logps/rejected": -2.874756336212158, "loss": 0.7182, "odds_ratio_loss": 2.453400135040283, "rewards/accuracies": 0.8520833253860474, "rewards/chosen": -0.06857011467218399, "rewards/margins": 0.218905508518219, "rewards/rejected": -0.2874756455421448, "sft_loss": 0.47288984060287476, "step": 1790 }, { "epoch": 0.18959925213628323, "grad_norm": 33.773277282714844, "learning_rate": 0.0004878859921232839, "logits/chosen": -5.917886257171631, "logits/rejected": -5.9181623458862305, "logps/chosen": -0.7129290103912354, "logps/rejected": -2.9589409828186035, "loss": 0.7486, "odds_ratio_loss": 2.4278337955474854, "rewards/accuracies": 0.8166666626930237, "rewards/chosen": -0.07129290699958801, "rewards/margins": 0.22460119426250458, "rewards/rejected": -0.2958941161632538, "sft_loss": 0.5058320760726929, "step": 1800 }, { "epoch": 0.19065258131481816, "grad_norm": 4.040477275848389, "learning_rate": 0.00048760167336551964, "logits/chosen": -5.841413974761963, "logits/rejected": -5.8417158126831055, "logps/chosen": -0.6335561275482178, "logps/rejected": -3.0441653728485107, "loss": 0.6684, "odds_ratio_loss": 2.3195104598999023, "rewards/accuracies": 0.8354166746139526, "rewards/chosen": -0.06335561722517014, "rewards/margins": 0.24106094241142273, "rewards/rejected": -0.30441656708717346, "sft_loss": 0.4364630877971649, "step": 1810 }, { "epoch": 0.19170591049335306, "grad_norm": 6.749710559844971, "learning_rate": 0.0004873141414804103, "logits/chosen": -5.7162394523620605, "logits/rejected": -5.716516017913818, "logps/chosen": -0.6518925428390503, "logps/rejected": -3.0878660678863525, "loss": 0.6886, "odds_ratio_loss": 2.5920615196228027, "rewards/accuracies": 0.8458333611488342, "rewards/chosen": -0.06518926471471786, "rewards/margins": 0.2435973733663559, "rewards/rejected": -0.30878666043281555, "sft_loss": 0.4293573498725891, "step": 1820 }, { "epoch": 0.19275923967188796, "grad_norm": 2.045081615447998, "learning_rate": 0.00048702340035629787, "logits/chosen": -5.856993198394775, "logits/rejected": -5.857146263122559, "logps/chosen": -0.6080865263938904, "logps/rejected": -2.7589311599731445, "loss": 0.6369, "odds_ratio_loss": 2.007796287536621, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -0.06080865487456322, "rewards/margins": 0.21508444845676422, "rewards/rejected": -0.27589309215545654, "sft_loss": 0.43609535694122314, "step": 1830 }, { "epoch": 0.19381256885042286, "grad_norm": 4.158146381378174, "learning_rate": 0.0004867294539249234, "logits/chosen": -6.230529308319092, "logits/rejected": -6.230895519256592, "logps/chosen": -0.6979438066482544, "logps/rejected": -3.5587799549102783, "loss": 0.7295, "odds_ratio_loss": 2.502671718597412, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.06979438662528992, "rewards/margins": 0.2860836088657379, "rewards/rejected": -0.35587799549102783, "sft_loss": 0.4792328178882599, "step": 1840 }, { "epoch": 0.19486589802895776, "grad_norm": 2.7599072456359863, "learning_rate": 0.0004864323061613738, "logits/chosen": -6.244935512542725, "logits/rejected": -6.245189189910889, "logps/chosen": -0.6155544519424438, "logps/rejected": -3.0617711544036865, "loss": 0.6473, "odds_ratio_loss": 2.4084250926971436, "rewards/accuracies": 0.84375, "rewards/chosen": -0.06155544891953468, "rewards/margins": 0.2446216493844986, "rewards/rejected": -0.3061771094799042, "sft_loss": 0.4064619243144989, "step": 1850 }, { "epoch": 0.1959192272074927, "grad_norm": 4.056497573852539, "learning_rate": 0.0004861319610840282, "logits/chosen": -5.854410648345947, "logits/rejected": -5.8545074462890625, "logps/chosen": -0.7075474262237549, "logps/rejected": -3.4564712047576904, "loss": 0.7458, "odds_ratio_loss": 2.5600531101226807, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.07075474411249161, "rewards/margins": 0.2748924195766449, "rewards/rejected": -0.3456471860408783, "sft_loss": 0.4897785782814026, "step": 1860 }, { "epoch": 0.1969725563860276, "grad_norm": 7.0494489669799805, "learning_rate": 0.00048582842275450366, "logits/chosen": -5.870307922363281, "logits/rejected": -5.870253086090088, "logps/chosen": -0.6499666571617126, "logps/rejected": -3.4302499294281006, "loss": 0.6847, "odds_ratio_loss": 2.5706870555877686, "rewards/accuracies": 0.8520833253860474, "rewards/chosen": -0.06499668210744858, "rewards/margins": 0.27802836894989014, "rewards/rejected": -0.3430250287055969, "sft_loss": 0.4276408553123474, "step": 1870 }, { "epoch": 0.1980258855645625, "grad_norm": 9.770491600036621, "learning_rate": 0.0004855216952775999, "logits/chosen": -6.05530309677124, "logits/rejected": -6.05518102645874, "logps/chosen": -0.6710807681083679, "logps/rejected": -3.7501718997955322, "loss": 0.7031, "odds_ratio_loss": 2.5512893199920654, "rewards/accuracies": 0.8583333492279053, "rewards/chosen": -0.06710807234048843, "rewards/margins": 0.3079090714454651, "rewards/rejected": -0.3750171661376953, "sft_loss": 0.4479447305202484, "step": 1880 }, { "epoch": 0.1990792147430974, "grad_norm": 46.68679428100586, "learning_rate": 0.0004852117828012441, "logits/chosen": -6.125611782073975, "logits/rejected": -6.125678539276123, "logps/chosen": -0.8755971789360046, "logps/rejected": -4.0310163497924805, "loss": 0.9086, "odds_ratio_loss": 3.206876516342163, "rewards/accuracies": 0.8416666388511658, "rewards/chosen": -0.08755972236394882, "rewards/margins": 0.31554192304611206, "rewards/rejected": -0.4031016528606415, "sft_loss": 0.5878926515579224, "step": 1890 }, { "epoch": 0.20013254392163232, "grad_norm": 7.289094924926758, "learning_rate": 0.00048489868951643477, "logits/chosen": -6.526234149932861, "logits/rejected": -6.526541233062744, "logps/chosen": -1.0017836093902588, "logps/rejected": -3.509293556213379, "loss": 1.0477, "odds_ratio_loss": 3.5900070667266846, "rewards/accuracies": 0.8270833492279053, "rewards/chosen": -0.10017836093902588, "rewards/margins": 0.2507510483264923, "rewards/rejected": -0.3509294092655182, "sft_loss": 0.6887442469596863, "step": 1900 } ], "logging_steps": 10, "max_steps": 9493, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4024826693511741e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }