{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 11608, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00017229496898690558, "grad_norm": 2.1823763847351074, "learning_rate": 8.613264427217915e-11, "logits/chosen": -2.967046022415161, "logits/rejected": -2.9243061542510986, "logps/chosen": -43.99115753173828, "logps/rejected": -41.627906799316406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0017229496898690559, "grad_norm": 2.387622594833374, "learning_rate": 8.613264427217916e-10, "logits/chosen": -3.055140972137451, "logits/rejected": -3.0257670879364014, "logps/chosen": -50.45387649536133, "logps/rejected": -49.622737884521484, "loss": 0.6929, "rewards/accuracies": 0.5763888955116272, "rewards/chosen": 0.00012421452265698463, "rewards/margins": 0.0005009726155549288, "rewards/rejected": -0.00037675804924219847, "step": 10 }, { "epoch": 0.0034458993797381117, "grad_norm": 2.2426376342773438, "learning_rate": 1.7226528854435832e-09, "logits/chosen": -3.119055986404419, "logits/rejected": -3.1108334064483643, "logps/chosen": -52.67353057861328, "logps/rejected": -53.0074462890625, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.0001044743912643753, "rewards/margins": -8.299542969325557e-05, "rewards/rejected": -2.1478936105268076e-05, "step": 20 }, { "epoch": 0.005168849069607168, "grad_norm": 2.5754239559173584, "learning_rate": 2.5839793281653743e-09, "logits/chosen": -3.091740369796753, "logits/rejected": -3.0679197311401367, "logps/chosen": -56.782386779785156, "logps/rejected": -58.43836212158203, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.00013735578977502882, "rewards/margins": 0.00014890992315486073, "rewards/rejected": -1.1554128832358401e-05, "step": 30 }, { "epoch": 0.006891798759476223, "grad_norm": 2.0134646892547607, "learning_rate": 3.4453057708871665e-09, "logits/chosen": -3.1050755977630615, "logits/rejected": -3.073472499847412, "logps/chosen": -55.25732421875, "logps/rejected": -50.6669921875, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00014844746328890324, "rewards/margins": -3.4907094232039526e-05, "rewards/rejected": 0.0001833545247791335, "step": 40 }, { "epoch": 0.00861474844934528, "grad_norm": 2.385958194732666, "learning_rate": 4.306632213608958e-09, "logits/chosen": -3.100965738296509, "logits/rejected": -3.0844969749450684, "logps/chosen": -53.12641143798828, "logps/rejected": -51.509857177734375, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -9.805540321394801e-05, "rewards/margins": 2.18080822378397e-06, "rewards/rejected": -0.00010023623326560482, "step": 50 }, { "epoch": 0.010337698139214336, "grad_norm": 2.795139789581299, "learning_rate": 5.167958656330749e-09, "logits/chosen": -3.1540331840515137, "logits/rejected": -3.1243770122528076, "logps/chosen": -57.58463668823242, "logps/rejected": -54.14760208129883, "loss": 0.6933, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 1.8001441276283003e-05, "rewards/margins": -0.00032479705987498164, "rewards/rejected": 0.00034279850660823286, "step": 60 }, { "epoch": 0.012060647829083391, "grad_norm": 2.2026424407958984, "learning_rate": 6.02928509905254e-09, "logits/chosen": -3.0509531497955322, "logits/rejected": -3.0309481620788574, "logps/chosen": -53.7503547668457, "logps/rejected": -53.22412872314453, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00018892706430051476, "rewards/margins": 0.00018463641754351556, "rewards/rejected": 4.290644028515089e-06, "step": 70 }, { "epoch": 0.013783597518952447, "grad_norm": 2.4384751319885254, "learning_rate": 6.890611541774333e-09, "logits/chosen": -3.15974497795105, "logits/rejected": -3.1263458728790283, "logps/chosen": -59.09418869018555, "logps/rejected": -54.10878372192383, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 2.246947587991599e-05, "rewards/margins": 0.00015436351532116532, "rewards/rejected": -0.00013189406308811158, "step": 80 }, { "epoch": 0.015506547208821502, "grad_norm": 2.472266674041748, "learning_rate": 7.751937984496123e-09, "logits/chosen": -2.9935109615325928, "logits/rejected": -2.9786577224731445, "logps/chosen": -53.474945068359375, "logps/rejected": -52.831932067871094, "loss": 0.6932, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.8896745536476374e-05, "rewards/margins": -0.00014048551383893937, "rewards/rejected": 0.00010158878285437822, "step": 90 }, { "epoch": 0.01722949689869056, "grad_norm": 2.484550714492798, "learning_rate": 8.613264427217916e-09, "logits/chosen": -3.1697795391082764, "logits/rejected": -3.1076667308807373, "logps/chosen": -55.963539123535156, "logps/rejected": -49.63465118408203, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.00018626314704306424, "rewards/margins": -0.00013597900397144258, "rewards/rejected": -5.028415398555808e-05, "step": 100 }, { "epoch": 0.01722949689869056, "eval_logits/chosen": -3.162914991378784, "eval_logits/rejected": -3.157243490219116, "eval_logps/chosen": -58.70985794067383, "eval_logps/rejected": -63.17200469970703, "eval_loss": 0.6931781768798828, "eval_rewards/accuracies": 0.48187732696533203, "eval_rewards/chosen": 2.0352143110358156e-05, "eval_rewards/margins": -6.087439396651462e-05, "eval_rewards/rejected": 8.122652798192576e-05, "eval_runtime": 358.6096, "eval_samples_per_second": 12.002, "eval_steps_per_second": 1.5, "step": 100 }, { "epoch": 0.018952446588559616, "grad_norm": 2.542006015777588, "learning_rate": 9.474590869939706e-09, "logits/chosen": -3.121802806854248, "logits/rejected": -3.097970485687256, "logps/chosen": -55.59550094604492, "logps/rejected": -52.33124542236328, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.00018450118659529835, "rewards/margins": -0.00018190534319728613, "rewards/rejected": -2.59584044215444e-06, "step": 110 }, { "epoch": 0.02067539627842867, "grad_norm": 2.568783760070801, "learning_rate": 1.0335917312661497e-08, "logits/chosen": -3.065636157989502, "logits/rejected": -3.050166606903076, "logps/chosen": -53.187355041503906, "logps/rejected": -55.559715270996094, "loss": 0.6932, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -5.230843089520931e-05, "rewards/margins": -0.00017658497381489724, "rewards/rejected": 0.00012427649926394224, "step": 120 }, { "epoch": 0.022398345968297727, "grad_norm": 2.1416878700256348, "learning_rate": 1.119724375538329e-08, "logits/chosen": -3.1008267402648926, "logits/rejected": -3.086881637573242, "logps/chosen": -55.180259704589844, "logps/rejected": -53.775177001953125, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0001337065768893808, "rewards/margins": 0.00028117914916947484, "rewards/rejected": -0.00014747263048775494, "step": 130 }, { "epoch": 0.024121295658166782, "grad_norm": 2.428621768951416, "learning_rate": 1.205857019810508e-08, "logits/chosen": -3.122899293899536, "logits/rejected": -3.1046016216278076, "logps/chosen": -54.185447692871094, "logps/rejected": -53.77512741088867, "loss": 0.6931, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.432188572740415e-06, "rewards/margins": 0.00018397874373476952, "rewards/rejected": -0.00017854655743576586, "step": 140 }, { "epoch": 0.025844245348035838, "grad_norm": 2.2146685123443604, "learning_rate": 1.2919896640826872e-08, "logits/chosen": -3.0275516510009766, "logits/rejected": -3.0097765922546387, "logps/chosen": -52.62202072143555, "logps/rejected": -52.41334915161133, "loss": 0.6931, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 4.773867112817243e-05, "rewards/margins": 0.00015990195970516652, "rewards/rejected": -0.0001121632958529517, "step": 150 }, { "epoch": 0.027567195037904894, "grad_norm": 2.1602721214294434, "learning_rate": 1.3781223083548666e-08, "logits/chosen": -3.088822364807129, "logits/rejected": -3.0679469108581543, "logps/chosen": -53.511573791503906, "logps/rejected": -54.70383834838867, "loss": 0.6933, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.0002432465844321996, "rewards/margins": -0.0002327763504581526, "rewards/rejected": -1.0470268534845673e-05, "step": 160 }, { "epoch": 0.02929014472777395, "grad_norm": 2.3529062271118164, "learning_rate": 1.4642549526270457e-08, "logits/chosen": -3.0763819217681885, "logits/rejected": -3.0568976402282715, "logps/chosen": -56.28557586669922, "logps/rejected": -51.32001495361328, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -9.712641258374788e-06, "rewards/margins": -2.3522810806753114e-05, "rewards/rejected": 1.3810163181915414e-05, "step": 170 }, { "epoch": 0.031013094417643005, "grad_norm": 2.607346296310425, "learning_rate": 1.5503875968992246e-08, "logits/chosen": -3.0625109672546387, "logits/rejected": -3.0438477993011475, "logps/chosen": -56.41377639770508, "logps/rejected": -53.783180236816406, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -8.122886356431991e-05, "rewards/margins": -5.78999643039424e-05, "rewards/rejected": -2.3328897441388108e-05, "step": 180 }, { "epoch": 0.03273604410751206, "grad_norm": 2.635439157485962, "learning_rate": 1.636520241171404e-08, "logits/chosen": -3.1240930557250977, "logits/rejected": -3.080695390701294, "logps/chosen": -58.194053649902344, "logps/rejected": -52.56025314331055, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.00010519265197217464, "rewards/margins": 0.00011926700972253457, "rewards/rejected": -0.00022445968352258205, "step": 190 }, { "epoch": 0.03445899379738112, "grad_norm": 2.576287031173706, "learning_rate": 1.722652885443583e-08, "logits/chosen": -3.0595781803131104, "logits/rejected": -3.04390287399292, "logps/chosen": -54.107139587402344, "logps/rejected": -54.70692825317383, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00015565704961773008, "rewards/margins": 2.6284324121661484e-06, "rewards/rejected": 0.00015302860992960632, "step": 200 }, { "epoch": 0.03445899379738112, "eval_logits/chosen": -3.1632211208343506, "eval_logits/rejected": -3.1575613021850586, "eval_logps/chosen": -58.711769104003906, "eval_logps/rejected": -63.17155456542969, "eval_loss": 0.6931898593902588, "eval_rewards/accuracies": 0.4893122613430023, "eval_rewards/chosen": 1.2645278957279515e-06, "eval_rewards/margins": -8.44153473735787e-05, "eval_rewards/rejected": 8.56798724271357e-05, "eval_runtime": 358.4945, "eval_samples_per_second": 12.006, "eval_steps_per_second": 1.501, "step": 200 }, { "epoch": 0.03618194348725017, "grad_norm": 2.2875328063964844, "learning_rate": 1.8087855297157624e-08, "logits/chosen": -3.0142710208892822, "logits/rejected": -3.0056774616241455, "logps/chosen": -53.26692581176758, "logps/rejected": -57.282997131347656, "loss": 0.6932, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -2.878843224607408e-05, "rewards/margins": -0.00010379708692198619, "rewards/rejected": 7.500863284803927e-05, "step": 210 }, { "epoch": 0.03790489317711923, "grad_norm": 2.327131748199463, "learning_rate": 1.8949181739879413e-08, "logits/chosen": -3.0508837699890137, "logits/rejected": -3.019562244415283, "logps/chosen": -52.19524002075195, "logps/rejected": -51.32683181762695, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 4.231429556966759e-05, "rewards/margins": 0.00014887299039401114, "rewards/rejected": -0.00010655868391040713, "step": 220 }, { "epoch": 0.03962784286698828, "grad_norm": 2.3964474201202393, "learning_rate": 1.9810508182601205e-08, "logits/chosen": -3.051140546798706, "logits/rejected": -3.0328004360198975, "logps/chosen": -48.911827087402344, "logps/rejected": -49.93369674682617, "loss": 0.6932, "rewards/accuracies": 0.40625, "rewards/chosen": -2.207815668953117e-05, "rewards/margins": -0.00013280121493153274, "rewards/rejected": 0.00011072307825088501, "step": 230 }, { "epoch": 0.04135079255685734, "grad_norm": 2.246399164199829, "learning_rate": 2.0671834625322995e-08, "logits/chosen": -3.0245563983917236, "logits/rejected": -2.982118844985962, "logps/chosen": -55.94266891479492, "logps/rejected": -52.16364669799805, "loss": 0.693, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.00011497551167849451, "rewards/margins": 0.00020507563021965325, "rewards/rejected": -9.010009671328589e-05, "step": 240 }, { "epoch": 0.043073742246726394, "grad_norm": 2.313981294631958, "learning_rate": 2.153316106804479e-08, "logits/chosen": -3.1180367469787598, "logits/rejected": -3.0976953506469727, "logps/chosen": -52.28910446166992, "logps/rejected": -51.09497833251953, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": -9.025823965203017e-05, "rewards/margins": -0.00011660426389425993, "rewards/rejected": 2.6346038794144988e-05, "step": 250 }, { "epoch": 0.044796691936595454, "grad_norm": 2.310410976409912, "learning_rate": 2.239448751076658e-08, "logits/chosen": -3.0942649841308594, "logits/rejected": -3.0820066928863525, "logps/chosen": -54.8565673828125, "logps/rejected": -56.64534378051758, "loss": 0.6929, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0001261205761693418, "rewards/margins": 0.0004069819115102291, "rewards/rejected": -0.0002808613935485482, "step": 260 }, { "epoch": 0.046519641626464506, "grad_norm": 2.2104978561401367, "learning_rate": 2.3255813953488372e-08, "logits/chosen": -3.0330111980438232, "logits/rejected": -3.014916181564331, "logps/chosen": -53.13231658935547, "logps/rejected": -54.31147384643555, "loss": 0.6932, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0001277371047763154, "rewards/margins": -6.514495180454105e-05, "rewards/rejected": -6.259213841985911e-05, "step": 270 }, { "epoch": 0.048242591316333565, "grad_norm": 2.4354984760284424, "learning_rate": 2.411714039621016e-08, "logits/chosen": -3.1244430541992188, "logits/rejected": -3.0903525352478027, "logps/chosen": -57.60699462890625, "logps/rejected": -53.4240837097168, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 6.179927731864154e-05, "rewards/margins": 0.00015818976680748165, "rewards/rejected": -9.639047493692487e-05, "step": 280 }, { "epoch": 0.04996554100620262, "grad_norm": 2.2454993724823, "learning_rate": 2.4978466838931954e-08, "logits/chosen": -3.0470690727233887, "logits/rejected": -3.032789707183838, "logps/chosen": -55.36420440673828, "logps/rejected": -54.27949142456055, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.00012243811215739697, "rewards/margins": 9.9905242677778e-05, "rewards/rejected": 2.2532873117597774e-05, "step": 290 }, { "epoch": 0.051688490696071676, "grad_norm": 2.363119125366211, "learning_rate": 2.5839793281653743e-08, "logits/chosen": -3.002816677093506, "logits/rejected": -2.9941678047180176, "logps/chosen": -52.84331130981445, "logps/rejected": -53.92693328857422, "loss": 0.6932, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -9.468554708291776e-06, "rewards/margins": -0.00018169668328482658, "rewards/rejected": 0.00017222815949935466, "step": 300 }, { "epoch": 0.051688490696071676, "eval_logits/chosen": -3.1631081104278564, "eval_logits/rejected": -3.1574697494506836, "eval_logps/chosen": -58.7095947265625, "eval_logps/rejected": -63.167720794677734, "eval_loss": 0.6931981444358826, "eval_rewards/accuracies": 0.46956318616867065, "eval_rewards/chosen": 2.3022554159979336e-05, "eval_rewards/margins": -0.0001009509724099189, "eval_rewards/rejected": 0.00012397351383697242, "eval_runtime": 358.8693, "eval_samples_per_second": 11.993, "eval_steps_per_second": 1.499, "step": 300 }, { "epoch": 0.05341144038594073, "grad_norm": 2.470623731613159, "learning_rate": 2.6701119724375536e-08, "logits/chosen": -3.0653061866760254, "logits/rejected": -3.0597829818725586, "logps/chosen": -53.518653869628906, "logps/rejected": -53.315582275390625, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 7.493379143852508e-06, "rewards/margins": 0.0002100001584040001, "rewards/rejected": -0.00020250678062438965, "step": 310 }, { "epoch": 0.05513439007580979, "grad_norm": 2.3561344146728516, "learning_rate": 2.756244616709733e-08, "logits/chosen": -3.0231707096099854, "logits/rejected": -2.996835947036743, "logps/chosen": -54.5215950012207, "logps/rejected": -49.26898956298828, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -0.0001359484886052087, "rewards/margins": 0.00015204887313302606, "rewards/rejected": -0.00028799736173823476, "step": 320 }, { "epoch": 0.05685733976567884, "grad_norm": 2.341036081314087, "learning_rate": 2.8423772609819118e-08, "logits/chosen": -3.0833919048309326, "logits/rejected": -3.059788465499878, "logps/chosen": -55.05128860473633, "logps/rejected": -52.21721267700195, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0001815518771763891, "rewards/margins": -0.00011233085388084874, "rewards/rejected": -6.922103784745559e-05, "step": 330 }, { "epoch": 0.0585802894555479, "grad_norm": 2.1545321941375732, "learning_rate": 2.9285099052540913e-08, "logits/chosen": -3.0052454471588135, "logits/rejected": -2.983654022216797, "logps/chosen": -52.548309326171875, "logps/rejected": -51.952247619628906, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.00021751364693045616, "rewards/margins": -5.0237147661391646e-05, "rewards/rejected": -0.0001672764919931069, "step": 340 }, { "epoch": 0.06030323914541695, "grad_norm": 2.320605993270874, "learning_rate": 3.01464254952627e-08, "logits/chosen": -2.977752923965454, "logits/rejected": -2.9382853507995605, "logps/chosen": -56.247840881347656, "logps/rejected": -53.581092834472656, "loss": 0.6932, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0003836905234493315, "rewards/margins": -0.00013840605970472097, "rewards/rejected": -0.0002452843473292887, "step": 350 }, { "epoch": 0.06202618883528601, "grad_norm": 2.4087536334991455, "learning_rate": 3.100775193798449e-08, "logits/chosen": -3.1283411979675293, "logits/rejected": -3.105438709259033, "logps/chosen": -54.5811767578125, "logps/rejected": -50.53804016113281, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 6.32234150543809e-05, "rewards/margins": 0.000302921689581126, "rewards/rejected": -0.0002396982890786603, "step": 360 }, { "epoch": 0.06374913852515507, "grad_norm": 2.3251330852508545, "learning_rate": 3.186907838070629e-08, "logits/chosen": -3.1035568714141846, "logits/rejected": -3.0743801593780518, "logps/chosen": -52.390281677246094, "logps/rejected": -51.37581253051758, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -5.121355570736341e-05, "rewards/margins": 0.0002968797634821385, "rewards/rejected": -0.00034809333737939596, "step": 370 }, { "epoch": 0.06547208821502412, "grad_norm": 2.0806522369384766, "learning_rate": 3.273040482342808e-08, "logits/chosen": -3.2038414478302, "logits/rejected": -3.179076671600342, "logps/chosen": -53.51982879638672, "logps/rejected": -52.2159309387207, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0001610913168406114, "rewards/margins": 9.2298214440234e-05, "rewards/rejected": -0.0002533895312808454, "step": 380 }, { "epoch": 0.06719503790489317, "grad_norm": 2.4117937088012695, "learning_rate": 3.359173126614987e-08, "logits/chosen": -3.1004347801208496, "logits/rejected": -3.074990749359131, "logps/chosen": -56.068931579589844, "logps/rejected": -55.217803955078125, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -3.399456909392029e-05, "rewards/margins": 6.996531010372564e-05, "rewards/rejected": -0.00010395983554190025, "step": 390 }, { "epoch": 0.06891798759476224, "grad_norm": 2.096646308898926, "learning_rate": 3.445305770887166e-08, "logits/chosen": -3.0706591606140137, "logits/rejected": -3.054755687713623, "logps/chosen": -52.662017822265625, "logps/rejected": -52.76776123046875, "loss": 0.6933, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.00022807842469774187, "rewards/margins": -0.00022528569388668984, "rewards/rejected": -2.7927221708523575e-06, "step": 400 }, { "epoch": 0.06891798759476224, "eval_logits/chosen": -3.1630728244781494, "eval_logits/rejected": -3.157430648803711, "eval_logps/chosen": -58.69290542602539, "eval_logps/rejected": -63.15719985961914, "eval_loss": 0.6931674480438232, "eval_rewards/accuracies": 0.4844330847263336, "eval_rewards/chosen": 0.00018984945199918002, "eval_rewards/margins": -3.937046858482063e-05, "eval_rewards/rejected": 0.00022921990603208542, "eval_runtime": 358.8505, "eval_samples_per_second": 11.994, "eval_steps_per_second": 1.499, "step": 400 }, { "epoch": 0.07064093728463129, "grad_norm": 2.153409242630005, "learning_rate": 3.531438415159345e-08, "logits/chosen": -3.0743587017059326, "logits/rejected": -3.070240020751953, "logps/chosen": -50.77604293823242, "logps/rejected": -55.5684814453125, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0003055145207326859, "rewards/margins": 9.899081487674266e-05, "rewards/rejected": -0.0004045053501613438, "step": 410 }, { "epoch": 0.07236388697450034, "grad_norm": 2.5351555347442627, "learning_rate": 3.617571059431525e-08, "logits/chosen": -3.0597853660583496, "logits/rejected": -3.05192232131958, "logps/chosen": -54.19941329956055, "logps/rejected": -53.91211700439453, "loss": 0.6933, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0003343108983244747, "rewards/margins": -0.00035097176441922784, "rewards/rejected": 1.6660869732731953e-05, "step": 420 }, { "epoch": 0.0740868366643694, "grad_norm": 2.2438621520996094, "learning_rate": 3.7037037037037036e-08, "logits/chosen": -3.0879173278808594, "logits/rejected": -3.074145555496216, "logps/chosen": -53.09697723388672, "logps/rejected": -54.19342803955078, "loss": 0.693, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -5.8635945606511086e-05, "rewards/margins": 0.0002445178688503802, "rewards/rejected": -0.0003031538217328489, "step": 430 }, { "epoch": 0.07580978635423846, "grad_norm": 2.5127062797546387, "learning_rate": 3.7898363479758826e-08, "logits/chosen": -3.133507490158081, "logits/rejected": -3.098098039627075, "logps/chosen": -54.32233428955078, "logps/rejected": -53.277740478515625, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.00017584441229701042, "rewards/margins": 0.0001403773931087926, "rewards/rejected": -0.0003162218490615487, "step": 440 }, { "epoch": 0.07753273604410751, "grad_norm": 2.283708333969116, "learning_rate": 3.875968992248062e-08, "logits/chosen": -3.0555710792541504, "logits/rejected": -3.023096799850464, "logps/chosen": -56.102012634277344, "logps/rejected": -54.62397384643555, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -6.127264350652695e-05, "rewards/margins": 0.00016291522479150444, "rewards/rejected": -0.00022418785374611616, "step": 450 }, { "epoch": 0.07925568573397657, "grad_norm": 2.387023448944092, "learning_rate": 3.962101636520241e-08, "logits/chosen": -3.024385929107666, "logits/rejected": -3.004836082458496, "logps/chosen": -56.2154655456543, "logps/rejected": -53.040618896484375, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.00015148159582167864, "rewards/margins": 0.0003812285140156746, "rewards/rejected": -0.0005327101098373532, "step": 460 }, { "epoch": 0.08097863542384562, "grad_norm": 2.2020015716552734, "learning_rate": 4.04823428079242e-08, "logits/chosen": -3.0474162101745605, "logits/rejected": -3.015669584274292, "logps/chosen": -53.30449676513672, "logps/rejected": -51.4334716796875, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0003891284577548504, "rewards/margins": -4.8433535994263366e-05, "rewards/rejected": -0.0003406949108466506, "step": 470 }, { "epoch": 0.08270158511371468, "grad_norm": 2.44252610206604, "learning_rate": 4.134366925064599e-08, "logits/chosen": -3.0451035499572754, "logits/rejected": -3.0398306846618652, "logps/chosen": -54.25200271606445, "logps/rejected": -59.00077438354492, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.00012817162496503443, "rewards/margins": 0.00028305535670369864, "rewards/rejected": -0.00041122696711681783, "step": 480 }, { "epoch": 0.08442453480358374, "grad_norm": 2.477860927581787, "learning_rate": 4.2204995693367785e-08, "logits/chosen": -2.9549922943115234, "logits/rejected": -2.9052486419677734, "logps/chosen": -60.62226486206055, "logps/rejected": -51.43208694458008, "loss": 0.6927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.762434350093827e-05, "rewards/margins": 0.0008653089171275496, "rewards/rejected": -0.000932933297008276, "step": 490 }, { "epoch": 0.08614748449345279, "grad_norm": 2.260258913040161, "learning_rate": 4.306632213608958e-08, "logits/chosen": -3.0170648097991943, "logits/rejected": -2.9883503913879395, "logps/chosen": -55.014686584472656, "logps/rejected": -51.66577911376953, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0006481458549387753, "rewards/margins": 0.00012295909982640296, "rewards/rejected": -0.0007711048820056021, "step": 500 }, { "epoch": 0.08614748449345279, "eval_logits/chosen": -3.1627964973449707, "eval_logits/rejected": -3.157134771347046, "eval_logps/chosen": -58.68923568725586, "eval_logps/rejected": -63.15824508666992, "eval_loss": 0.6931439638137817, "eval_rewards/accuracies": 0.5016263723373413, "eval_rewards/chosen": 0.00022665159485768527, "eval_rewards/margins": 7.881514648033772e-06, "eval_rewards/rejected": 0.00021877007384318858, "eval_runtime": 358.4675, "eval_samples_per_second": 12.007, "eval_steps_per_second": 1.501, "step": 500 }, { "epoch": 0.08787043418332184, "grad_norm": 2.2401208877563477, "learning_rate": 4.3927648578811363e-08, "logits/chosen": -3.009411573410034, "logits/rejected": -2.988041400909424, "logps/chosen": -58.32307052612305, "logps/rejected": -52.057106018066406, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0004813670529983938, "rewards/margins": 0.00014227270730771124, "rewards/rejected": -0.0006236397894099355, "step": 510 }, { "epoch": 0.08959338387319091, "grad_norm": 2.0701003074645996, "learning_rate": 4.478897502153316e-08, "logits/chosen": -3.0574822425842285, "logits/rejected": -3.031964063644409, "logps/chosen": -56.46551513671875, "logps/rejected": -51.7152214050293, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005857711075805128, "rewards/margins": 0.0002671369875315577, "rewards/rejected": -0.0008529081824235618, "step": 520 }, { "epoch": 0.09131633356305996, "grad_norm": 2.063835382461548, "learning_rate": 4.5650301464254955e-08, "logits/chosen": -3.0539472103118896, "logits/rejected": -3.012070894241333, "logps/chosen": -55.717613220214844, "logps/rejected": -51.2911491394043, "loss": 0.6927, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.00017993396613746881, "rewards/margins": 0.0008722777711227536, "rewards/rejected": -0.0010522117372602224, "step": 530 }, { "epoch": 0.09303928325292901, "grad_norm": 2.2334859371185303, "learning_rate": 4.6511627906976744e-08, "logits/chosen": -3.039977550506592, "logits/rejected": -3.0235543251037598, "logps/chosen": -52.89643096923828, "logps/rejected": -52.98133087158203, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0004417643358465284, "rewards/margins": 0.00019420584430918097, "rewards/rejected": -0.0006359702092595398, "step": 540 }, { "epoch": 0.09476223294279806, "grad_norm": 2.2609357833862305, "learning_rate": 4.7372954349698534e-08, "logits/chosen": -3.1005699634552, "logits/rejected": -3.082533836364746, "logps/chosen": -53.61052322387695, "logps/rejected": -52.064308166503906, "loss": 0.6928, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00036659216857515275, "rewards/margins": 0.0006485350313596427, "rewards/rejected": -0.0010151272872462869, "step": 550 }, { "epoch": 0.09648518263266713, "grad_norm": 2.5743932723999023, "learning_rate": 4.823428079242032e-08, "logits/chosen": -3.07200026512146, "logits/rejected": -3.0641121864318848, "logps/chosen": -52.15167236328125, "logps/rejected": -55.08266067504883, "loss": 0.6928, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0004310603253543377, "rewards/margins": 0.0006935327546671033, "rewards/rejected": -0.001124593080021441, "step": 560 }, { "epoch": 0.09820813232253618, "grad_norm": 2.238826036453247, "learning_rate": 4.909560723514212e-08, "logits/chosen": -3.044468641281128, "logits/rejected": -3.0358006954193115, "logps/chosen": -51.41157913208008, "logps/rejected": -53.927734375, "loss": 0.6929, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0004941682564094663, "rewards/margins": 0.0005627792561426759, "rewards/rejected": -0.0010569475125521421, "step": 570 }, { "epoch": 0.09993108201240523, "grad_norm": 1.7754663228988647, "learning_rate": 4.995693367786391e-08, "logits/chosen": -3.0513761043548584, "logits/rejected": -3.0455574989318848, "logps/chosen": -51.184574127197266, "logps/rejected": -53.333045959472656, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0007650370826013386, "rewards/margins": 0.00014660393935628235, "rewards/rejected": -0.0009116409346461296, "step": 580 }, { "epoch": 0.1016540317022743, "grad_norm": 2.0567564964294434, "learning_rate": 5.08182601205857e-08, "logits/chosen": -3.0504350662231445, "logits/rejected": -3.028501033782959, "logps/chosen": -54.93719482421875, "logps/rejected": -54.896331787109375, "loss": 0.6929, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0005827373242937028, "rewards/margins": 0.0005187354981899261, "rewards/rejected": -0.00110147288069129, "step": 590 }, { "epoch": 0.10337698139214335, "grad_norm": 2.3285975456237793, "learning_rate": 5.1679586563307486e-08, "logits/chosen": -3.0234103202819824, "logits/rejected": -2.9992403984069824, "logps/chosen": -53.90376663208008, "logps/rejected": -56.81464385986328, "loss": 0.6925, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.00020016773487441242, "rewards/margins": 0.0012626869138330221, "rewards/rejected": -0.001462854677811265, "step": 600 }, { "epoch": 0.10337698139214335, "eval_logits/chosen": -3.1622629165649414, "eval_logits/rejected": -3.1565933227539062, "eval_logps/chosen": -58.667110443115234, "eval_logps/rejected": -63.150657653808594, "eval_loss": 0.6930716037750244, "eval_rewards/accuracies": 0.5157992839813232, "eval_rewards/chosen": 0.0004477898473851383, "eval_rewards/margins": 0.0001531161105958745, "eval_rewards/rejected": 0.0002946736931335181, "eval_runtime": 358.8537, "eval_samples_per_second": 11.994, "eval_steps_per_second": 1.499, "step": 600 }, { "epoch": 0.1050999310820124, "grad_norm": 2.227032423019409, "learning_rate": 5.254091300602929e-08, "logits/chosen": -2.9893407821655273, "logits/rejected": -2.9866795539855957, "logps/chosen": -52.614341735839844, "logps/rejected": -53.344757080078125, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0004973303293809295, "rewards/margins": 0.0003403354494366795, "rewards/rejected": -0.0008376656915061176, "step": 610 }, { "epoch": 0.10682288077188146, "grad_norm": 2.3722407817840576, "learning_rate": 5.340223944875107e-08, "logits/chosen": -3.148256540298462, "logits/rejected": -3.1216824054718018, "logps/chosen": -55.066368103027344, "logps/rejected": -53.42070388793945, "loss": 0.6926, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0004736125993076712, "rewards/margins": 0.001064404845237732, "rewards/rejected": -0.0015380174154415727, "step": 620 }, { "epoch": 0.10854583046175052, "grad_norm": 2.4768080711364746, "learning_rate": 5.426356589147286e-08, "logits/chosen": -3.1315252780914307, "logits/rejected": -3.1046836376190186, "logps/chosen": -53.95940017700195, "logps/rejected": -51.006202697753906, "loss": 0.6929, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0009459175053052604, "rewards/margins": 0.0005772692384198308, "rewards/rejected": -0.0015231866855174303, "step": 630 }, { "epoch": 0.11026878015161957, "grad_norm": 2.4684054851531982, "learning_rate": 5.512489233419466e-08, "logits/chosen": -3.099504232406616, "logits/rejected": -3.0885913372039795, "logps/chosen": -52.9940071105957, "logps/rejected": -54.46311569213867, "loss": 0.6929, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0008191806264221668, "rewards/margins": 0.0005473231431096792, "rewards/rejected": -0.001366503769531846, "step": 640 }, { "epoch": 0.11199172984148863, "grad_norm": 2.682082176208496, "learning_rate": 5.598621877691645e-08, "logits/chosen": -3.111945629119873, "logits/rejected": -3.1133782863616943, "logps/chosen": -51.602684020996094, "logps/rejected": -54.963134765625, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0012779730604961514, "rewards/margins": 0.00013899999612476677, "rewards/rejected": -0.0014169730711728334, "step": 650 }, { "epoch": 0.11371467953135768, "grad_norm": 2.239454984664917, "learning_rate": 5.6847545219638235e-08, "logits/chosen": -3.002143621444702, "logits/rejected": -2.995913028717041, "logps/chosen": -54.70659255981445, "logps/rejected": -52.40941619873047, "loss": 0.6928, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.000785406562499702, "rewards/margins": 0.0006732040201313794, "rewards/rejected": -0.0014586106408387423, "step": 660 }, { "epoch": 0.11543762922122675, "grad_norm": 2.2007970809936523, "learning_rate": 5.770887166236004e-08, "logits/chosen": -3.0248777866363525, "logits/rejected": -3.020057439804077, "logps/chosen": -53.08942794799805, "logps/rejected": -57.63835906982422, "loss": 0.693, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0008383308304473758, "rewards/margins": 0.000339227553922683, "rewards/rejected": -0.0011775584425777197, "step": 670 }, { "epoch": 0.1171605789110958, "grad_norm": 2.3188748359680176, "learning_rate": 5.857019810508183e-08, "logits/chosen": -2.976760149002075, "logits/rejected": -2.9516377449035645, "logps/chosen": -54.03368377685547, "logps/rejected": -51.06056213378906, "loss": 0.6927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0010046338429674506, "rewards/margins": 0.0009472542442381382, "rewards/rejected": -0.0019518882036209106, "step": 680 }, { "epoch": 0.11888352860096485, "grad_norm": 2.590332269668579, "learning_rate": 5.9431524547803616e-08, "logits/chosen": -3.1253817081451416, "logits/rejected": -3.0954742431640625, "logps/chosen": -59.173851013183594, "logps/rejected": -50.692222595214844, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0009134371066465974, "rewards/margins": 0.0005907953018322587, "rewards/rejected": -0.0015042326413094997, "step": 690 }, { "epoch": 0.1206064782908339, "grad_norm": 2.1985859870910645, "learning_rate": 6.02928509905254e-08, "logits/chosen": -3.083073139190674, "logits/rejected": -3.0547680854797363, "logps/chosen": -55.823707580566406, "logps/rejected": -53.165374755859375, "loss": 0.6927, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0009546114015392959, "rewards/margins": 0.0008917865343391895, "rewards/rejected": -0.0018463979940861464, "step": 700 }, { "epoch": 0.1206064782908339, "eval_logits/chosen": -3.161243200302124, "eval_logits/rejected": -3.1556365489959717, "eval_logps/chosen": -58.65497970581055, "eval_logps/rejected": -63.14204025268555, "eval_loss": 0.6930544376373291, "eval_rewards/accuracies": 0.5276486873626709, "eval_rewards/chosen": 0.0005691515398211777, "eval_rewards/margins": 0.0001882914948510006, "eval_rewards/rejected": 0.00038086005952209234, "eval_runtime": 358.9432, "eval_samples_per_second": 11.991, "eval_steps_per_second": 1.499, "step": 700 }, { "epoch": 0.12232942798070297, "grad_norm": 2.2699103355407715, "learning_rate": 6.11541774332472e-08, "logits/chosen": -3.0615015029907227, "logits/rejected": -3.032845973968506, "logps/chosen": -54.72351837158203, "logps/rejected": -54.8898811340332, "loss": 0.6926, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0007481739739887416, "rewards/margins": 0.0011700403410941362, "rewards/rejected": -0.0019182141404598951, "step": 710 }, { "epoch": 0.12405237767057202, "grad_norm": 2.5009286403656006, "learning_rate": 6.201550387596898e-08, "logits/chosen": -3.0233469009399414, "logits/rejected": -3.019094944000244, "logps/chosen": -53.42400360107422, "logps/rejected": -54.57286834716797, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0010689168702811003, "rewards/margins": 0.0006829799385741353, "rewards/rejected": -0.0017518966924399137, "step": 720 }, { "epoch": 0.12577532736044109, "grad_norm": 2.380788803100586, "learning_rate": 6.287683031869078e-08, "logits/chosen": -3.1400551795959473, "logits/rejected": -3.115017890930176, "logps/chosen": -56.4382209777832, "logps/rejected": -52.75396728515625, "loss": 0.6925, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.001059521222487092, "rewards/margins": 0.001269210479222238, "rewards/rejected": -0.0023287315852940083, "step": 730 }, { "epoch": 0.12749827705031014, "grad_norm": 2.2064075469970703, "learning_rate": 6.373815676141258e-08, "logits/chosen": -3.0220272541046143, "logits/rejected": -2.996253252029419, "logps/chosen": -54.99372100830078, "logps/rejected": -53.92853927612305, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": -0.0007686042226850986, "rewards/margins": 0.0015255699399858713, "rewards/rejected": -0.0022941739298403263, "step": 740 }, { "epoch": 0.1292212267401792, "grad_norm": 2.4289848804473877, "learning_rate": 6.459948320413436e-08, "logits/chosen": -3.195591449737549, "logits/rejected": -3.1674859523773193, "logps/chosen": -56.058433532714844, "logps/rejected": -54.27191162109375, "loss": 0.692, "rewards/accuracies": 0.65625, "rewards/chosen": -0.00045926342136226594, "rewards/margins": 0.0022056284360587597, "rewards/rejected": -0.0026648917701095343, "step": 750 }, { "epoch": 0.13094417643004824, "grad_norm": 2.580063819885254, "learning_rate": 6.546080964685615e-08, "logits/chosen": -3.0457024574279785, "logits/rejected": -3.007045269012451, "logps/chosen": -54.52983856201172, "logps/rejected": -49.803890228271484, "loss": 0.6922, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.00093385751824826, "rewards/margins": 0.0019278116524219513, "rewards/rejected": -0.002861668821424246, "step": 760 }, { "epoch": 0.1326671261199173, "grad_norm": 2.090689182281494, "learning_rate": 6.632213608957795e-08, "logits/chosen": -3.096262216567993, "logits/rejected": -3.073035478591919, "logps/chosen": -53.0078239440918, "logps/rejected": -52.45222091674805, "loss": 0.6924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0008852818864397705, "rewards/margins": 0.0015501610469073057, "rewards/rejected": -0.00243544252589345, "step": 770 }, { "epoch": 0.13439007580978635, "grad_norm": 2.663207530975342, "learning_rate": 6.718346253229975e-08, "logits/chosen": -3.091189384460449, "logits/rejected": -3.06011700630188, "logps/chosen": -53.27953338623047, "logps/rejected": -51.486976623535156, "loss": 0.6923, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0013955377507954836, "rewards/margins": 0.00176303053740412, "rewards/rejected": -0.003158567938953638, "step": 780 }, { "epoch": 0.1361130254996554, "grad_norm": 2.2545323371887207, "learning_rate": 6.804478897502153e-08, "logits/chosen": -3.0985918045043945, "logits/rejected": -3.065253496170044, "logps/chosen": -54.0023078918457, "logps/rejected": -53.85721969604492, "loss": 0.6922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0009566719527356327, "rewards/margins": 0.001929369755089283, "rewards/rejected": -0.0028860417660325766, "step": 790 }, { "epoch": 0.13783597518952448, "grad_norm": 2.6749494075775146, "learning_rate": 6.890611541774332e-08, "logits/chosen": -2.984854221343994, "logits/rejected": -2.958704710006714, "logps/chosen": -55.17264938354492, "logps/rejected": -54.7309455871582, "loss": 0.6924, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0017509472090750933, "rewards/margins": 0.0016059564659371972, "rewards/rejected": -0.003356903325766325, "step": 800 }, { "epoch": 0.13783597518952448, "eval_logits/chosen": -3.160144567489624, "eval_logits/rejected": -3.154557228088379, "eval_logps/chosen": -58.60886764526367, "eval_logps/rejected": -63.124420166015625, "eval_loss": 0.692913293838501, "eval_rewards/accuracies": 0.550882875919342, "eval_rewards/chosen": 0.0010302558075636625, "eval_rewards/margins": 0.0004732160014100373, "eval_rewards/rejected": 0.0005570398643612862, "eval_runtime": 358.4769, "eval_samples_per_second": 12.006, "eval_steps_per_second": 1.501, "step": 800 }, { "epoch": 0.13955892487939353, "grad_norm": 2.225269317626953, "learning_rate": 6.976744186046512e-08, "logits/chosen": -3.059091567993164, "logits/rejected": -3.0310420989990234, "logps/chosen": -56.76906204223633, "logps/rejected": -55.854148864746094, "loss": 0.6924, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0013166740536689758, "rewards/margins": 0.001593802124261856, "rewards/rejected": -0.002910476177930832, "step": 810 }, { "epoch": 0.14128187456926258, "grad_norm": 2.152216911315918, "learning_rate": 7.06287683031869e-08, "logits/chosen": -3.108264923095703, "logits/rejected": -3.0827205181121826, "logps/chosen": -51.76808547973633, "logps/rejected": -51.052452087402344, "loss": 0.6926, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0022040274925529957, "rewards/margins": 0.0010511528234928846, "rewards/rejected": -0.0032551803160458803, "step": 820 }, { "epoch": 0.14300482425913164, "grad_norm": 2.438965082168579, "learning_rate": 7.149009474590869e-08, "logits/chosen": -3.036221742630005, "logits/rejected": -3.0208523273468018, "logps/chosen": -54.74811935424805, "logps/rejected": -54.435279846191406, "loss": 0.6917, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0016831646207720041, "rewards/margins": 0.002907418180257082, "rewards/rejected": -0.00459058303385973, "step": 830 }, { "epoch": 0.1447277739490007, "grad_norm": 2.556420087814331, "learning_rate": 7.23514211886305e-08, "logits/chosen": -3.139453172683716, "logits/rejected": -3.112957715988159, "logps/chosen": -54.24663162231445, "logps/rejected": -49.9597053527832, "loss": 0.6918, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0015173661522567272, "rewards/margins": 0.002657081466168165, "rewards/rejected": -0.004174447618424892, "step": 840 }, { "epoch": 0.14645072363886974, "grad_norm": 2.29156494140625, "learning_rate": 7.321274763135228e-08, "logits/chosen": -3.015171766281128, "logits/rejected": -3.0045599937438965, "logps/chosen": -51.05498504638672, "logps/rejected": -55.26131057739258, "loss": 0.6927, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.002338874852284789, "rewards/margins": 0.0009175013983622193, "rewards/rejected": -0.0032563761342316866, "step": 850 }, { "epoch": 0.1481736733287388, "grad_norm": 2.323979377746582, "learning_rate": 7.407407407407407e-08, "logits/chosen": -3.0407052040100098, "logits/rejected": -3.0206594467163086, "logps/chosen": -53.085052490234375, "logps/rejected": -52.8076057434082, "loss": 0.6918, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0022581566590815783, "rewards/margins": 0.0027536931447684765, "rewards/rejected": -0.005011850036680698, "step": 860 }, { "epoch": 0.14989662301860784, "grad_norm": 1.9577609300613403, "learning_rate": 7.493540051679587e-08, "logits/chosen": -3.1143956184387207, "logits/rejected": -3.1105878353118896, "logps/chosen": -51.548980712890625, "logps/rejected": -54.14459228515625, "loss": 0.692, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0024740765802562237, "rewards/margins": 0.0024134463164955378, "rewards/rejected": -0.004887523595243692, "step": 870 }, { "epoch": 0.15161957270847692, "grad_norm": 1.9016143083572388, "learning_rate": 7.579672695951765e-08, "logits/chosen": -3.027916669845581, "logits/rejected": -3.005197763442993, "logps/chosen": -51.94502639770508, "logps/rejected": -51.932273864746094, "loss": 0.6915, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0021511423401534557, "rewards/margins": 0.003370120655745268, "rewards/rejected": -0.005521262995898724, "step": 880 }, { "epoch": 0.15334252239834598, "grad_norm": 2.2559702396392822, "learning_rate": 7.665805340223945e-08, "logits/chosen": -3.055936098098755, "logits/rejected": -3.018589496612549, "logps/chosen": -58.56825637817383, "logps/rejected": -54.62053298950195, "loss": 0.6913, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0008119450649246573, "rewards/margins": 0.0037904041819274426, "rewards/rejected": -0.004602349363267422, "step": 890 }, { "epoch": 0.15506547208821503, "grad_norm": 2.1713647842407227, "learning_rate": 7.751937984496124e-08, "logits/chosen": -3.076097011566162, "logits/rejected": -3.0657076835632324, "logps/chosen": -54.41865158081055, "logps/rejected": -52.89152908325195, "loss": 0.692, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.002819065237417817, "rewards/margins": 0.002241636859253049, "rewards/rejected": -0.005060701631009579, "step": 900 }, { "epoch": 0.15506547208821503, "eval_logits/chosen": -3.158005714416504, "eval_logits/rejected": -3.152367115020752, "eval_logps/chosen": -58.56904983520508, "eval_logps/rejected": -63.108516693115234, "eval_loss": 0.6927970051765442, "eval_rewards/accuracies": 0.553438663482666, "eval_rewards/chosen": 0.001428465824574232, "eval_rewards/margins": 0.000712412700522691, "eval_rewards/rejected": 0.0007160529494285583, "eval_runtime": 358.7453, "eval_samples_per_second": 11.997, "eval_steps_per_second": 1.5, "step": 900 }, { "epoch": 0.15678842177808408, "grad_norm": 2.131281852722168, "learning_rate": 7.838070628768303e-08, "logits/chosen": -3.046898603439331, "logits/rejected": -3.038809299468994, "logps/chosen": -51.3541374206543, "logps/rejected": -52.53623580932617, "loss": 0.6924, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0032709583174437284, "rewards/margins": 0.001514771138317883, "rewards/rejected": -0.004785729572176933, "step": 910 }, { "epoch": 0.15851137146795313, "grad_norm": 2.3213186264038086, "learning_rate": 7.924203273040482e-08, "logits/chosen": -3.0878713130950928, "logits/rejected": -3.0445356369018555, "logps/chosen": -54.627174377441406, "logps/rejected": -50.106483459472656, "loss": 0.6914, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.003049984108656645, "rewards/margins": 0.003485618159174919, "rewards/rejected": -0.006535602267831564, "step": 920 }, { "epoch": 0.16023432115782218, "grad_norm": 2.5753068923950195, "learning_rate": 8.010335917312662e-08, "logits/chosen": -3.148953437805176, "logits/rejected": -3.1329092979431152, "logps/chosen": -52.779518127441406, "logps/rejected": -55.11871337890625, "loss": 0.6915, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0029565533623099327, "rewards/margins": 0.0033222727943211794, "rewards/rejected": -0.006278825458139181, "step": 930 }, { "epoch": 0.16195727084769124, "grad_norm": 2.208367347717285, "learning_rate": 8.09646856158484e-08, "logits/chosen": -3.128950834274292, "logits/rejected": -3.091381072998047, "logps/chosen": -60.43787384033203, "logps/rejected": -55.45613479614258, "loss": 0.6915, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0023253881372511387, "rewards/margins": 0.0033840001560747623, "rewards/rejected": -0.005709387827664614, "step": 940 }, { "epoch": 0.16368022053756032, "grad_norm": 2.2127363681793213, "learning_rate": 8.18260120585702e-08, "logits/chosen": -2.9215450286865234, "logits/rejected": -2.904310941696167, "logps/chosen": -55.69914627075195, "logps/rejected": -56.334136962890625, "loss": 0.6918, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.004277623258531094, "rewards/margins": 0.002753492910414934, "rewards/rejected": -0.007031116634607315, "step": 950 }, { "epoch": 0.16540317022742937, "grad_norm": 2.286198139190674, "learning_rate": 8.268733850129198e-08, "logits/chosen": -2.8904712200164795, "logits/rejected": -2.893658399581909, "logps/chosen": -51.27644729614258, "logps/rejected": -56.144874572753906, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.005351345054805279, "rewards/margins": 9.641332144383341e-05, "rewards/rejected": -0.00544775789603591, "step": 960 }, { "epoch": 0.16712611991729842, "grad_norm": 2.3423335552215576, "learning_rate": 8.354866494401377e-08, "logits/chosen": -3.0606789588928223, "logits/rejected": -3.025033473968506, "logps/chosen": -61.13622283935547, "logps/rejected": -53.06995391845703, "loss": 0.6924, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.004727092571556568, "rewards/margins": 0.0015245076501742005, "rewards/rejected": -0.0062515996396541595, "step": 970 }, { "epoch": 0.16884906960716747, "grad_norm": 3.1241321563720703, "learning_rate": 8.440999138673557e-08, "logits/chosen": -3.1625547409057617, "logits/rejected": -3.142909288406372, "logps/chosen": -56.2438850402832, "logps/rejected": -55.52935028076172, "loss": 0.6907, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0019513750448822975, "rewards/margins": 0.004942773841321468, "rewards/rejected": -0.006894148886203766, "step": 980 }, { "epoch": 0.17057201929703653, "grad_norm": 2.4787614345550537, "learning_rate": 8.527131782945735e-08, "logits/chosen": -3.0291647911071777, "logits/rejected": -3.0057761669158936, "logps/chosen": -54.96424102783203, "logps/rejected": -53.39398956298828, "loss": 0.6913, "rewards/accuracies": 0.59375, "rewards/chosen": -0.003949849866330624, "rewards/margins": 0.003804834559559822, "rewards/rejected": -0.007754684425890446, "step": 990 }, { "epoch": 0.17229496898690558, "grad_norm": 2.216383218765259, "learning_rate": 8.613264427217916e-08, "logits/chosen": -2.9808809757232666, "logits/rejected": -2.9531712532043457, "logps/chosen": -57.33502197265625, "logps/rejected": -52.36609649658203, "loss": 0.6924, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.004668045789003372, "rewards/margins": 0.0015643674414604902, "rewards/rejected": -0.006232412997633219, "step": 1000 }, { "epoch": 0.17229496898690558, "eval_logits/chosen": -3.1550450325012207, "eval_logits/rejected": -3.1494359970092773, "eval_logps/chosen": -58.53335952758789, "eval_logps/rejected": -63.10966491699219, "eval_loss": 0.6926171779632568, "eval_rewards/accuracies": 0.5659851431846619, "eval_rewards/chosen": 0.0017853755271062255, "eval_rewards/margins": 0.0010808416409417987, "eval_rewards/rejected": 0.0007045338279567659, "eval_runtime": 358.4783, "eval_samples_per_second": 12.006, "eval_steps_per_second": 1.501, "step": 1000 }, { "epoch": 0.17401791867677463, "grad_norm": 2.3578145503997803, "learning_rate": 8.699397071490094e-08, "logits/chosen": -2.9164555072784424, "logits/rejected": -2.927083730697632, "logps/chosen": -54.320526123046875, "logps/rejected": -58.7243537902832, "loss": 0.6918, "rewards/accuracies": 0.625, "rewards/chosen": -0.005627059377729893, "rewards/margins": 0.002754826098680496, "rewards/rejected": -0.008381885476410389, "step": 1010 }, { "epoch": 0.17574086836664368, "grad_norm": 2.234851837158203, "learning_rate": 8.785529715762273e-08, "logits/chosen": -3.0988657474517822, "logits/rejected": -3.063410997390747, "logps/chosen": -57.33527374267578, "logps/rejected": -55.05609893798828, "loss": 0.6902, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.002326482906937599, "rewards/margins": 0.005923739168792963, "rewards/rejected": -0.008250223472714424, "step": 1020 }, { "epoch": 0.17746381805651276, "grad_norm": 2.3829662799835205, "learning_rate": 8.871662360034454e-08, "logits/chosen": -3.147603988647461, "logits/rejected": -3.120401382446289, "logps/chosen": -54.09687423706055, "logps/rejected": -52.60686492919922, "loss": 0.6916, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0037808609195053577, "rewards/margins": 0.003162707667797804, "rewards/rejected": -0.006943568587303162, "step": 1030 }, { "epoch": 0.17918676774638181, "grad_norm": 2.459951639175415, "learning_rate": 8.957795004306632e-08, "logits/chosen": -3.0629427433013916, "logits/rejected": -3.052708148956299, "logps/chosen": -53.935150146484375, "logps/rejected": -56.3193244934082, "loss": 0.692, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.005367881152778864, "rewards/margins": 0.0024500801227986813, "rewards/rejected": -0.007817961275577545, "step": 1040 }, { "epoch": 0.18090971743625087, "grad_norm": 2.4851348400115967, "learning_rate": 9.04392764857881e-08, "logits/chosen": -2.9838688373565674, "logits/rejected": -2.9484922885894775, "logps/chosen": -57.58784103393555, "logps/rejected": -51.438087463378906, "loss": 0.6911, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.004447963088750839, "rewards/margins": 0.0042016999796032906, "rewards/rejected": -0.00864966306835413, "step": 1050 }, { "epoch": 0.18263266712611992, "grad_norm": 2.4529802799224854, "learning_rate": 9.130060292850991e-08, "logits/chosen": -2.9925010204315186, "logits/rejected": -2.973201036453247, "logps/chosen": -57.02306365966797, "logps/rejected": -55.665924072265625, "loss": 0.6923, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.005218047183007002, "rewards/margins": 0.0016771454829722643, "rewards/rejected": -0.00689519289880991, "step": 1060 }, { "epoch": 0.18435561681598897, "grad_norm": 2.302189588546753, "learning_rate": 9.216192937123169e-08, "logits/chosen": -3.135798215866089, "logits/rejected": -3.101851463317871, "logps/chosen": -56.41740798950195, "logps/rejected": -54.17853546142578, "loss": 0.691, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.004861590452492237, "rewards/margins": 0.004440676420927048, "rewards/rejected": -0.009302266873419285, "step": 1070 }, { "epoch": 0.18607856650585802, "grad_norm": 2.2802231311798096, "learning_rate": 9.302325581395349e-08, "logits/chosen": -3.1133651733398438, "logits/rejected": -3.090050458908081, "logps/chosen": -56.36030960083008, "logps/rejected": -51.7191047668457, "loss": 0.6911, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0057452223263680935, "rewards/margins": 0.004140386823564768, "rewards/rejected": -0.009885609149932861, "step": 1080 }, { "epoch": 0.18780151619572708, "grad_norm": 2.3795809745788574, "learning_rate": 9.388458225667527e-08, "logits/chosen": -2.990147590637207, "logits/rejected": -2.9818215370178223, "logps/chosen": -52.83929443359375, "logps/rejected": -53.5362434387207, "loss": 0.6917, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.006076144985854626, "rewards/margins": 0.0029487418942153454, "rewards/rejected": -0.009024888277053833, "step": 1090 }, { "epoch": 0.18952446588559613, "grad_norm": 2.1332859992980957, "learning_rate": 9.474590869939707e-08, "logits/chosen": -3.0555577278137207, "logits/rejected": -3.0526106357574463, "logps/chosen": -51.41303634643555, "logps/rejected": -55.484619140625, "loss": 0.6913, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005056067835539579, "rewards/margins": 0.0037126517854630947, "rewards/rejected": -0.008768720552325249, "step": 1100 }, { "epoch": 0.18952446588559613, "eval_logits/chosen": -3.1514084339141846, "eval_logits/rejected": -3.145766258239746, "eval_logps/chosen": -58.5028076171875, "eval_logps/rejected": -63.130279541015625, "eval_loss": 0.6923677325248718, "eval_rewards/accuracies": 0.5736523866653442, "eval_rewards/chosen": 0.0020908990409225225, "eval_rewards/margins": 0.0015924354083836079, "eval_rewards/rejected": 0.0004984635161235929, "eval_runtime": 358.7667, "eval_samples_per_second": 11.997, "eval_steps_per_second": 1.5, "step": 1100 }, { "epoch": 0.1912474155754652, "grad_norm": 2.6487016677856445, "learning_rate": 9.560723514211886e-08, "logits/chosen": -3.052523136138916, "logits/rejected": -3.0647952556610107, "logps/chosen": -53.663841247558594, "logps/rejected": -57.5274543762207, "loss": 0.6917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.007680105976760387, "rewards/margins": 0.0030740201473236084, "rewards/rejected": -0.010754126124083996, "step": 1110 }, { "epoch": 0.19297036526533426, "grad_norm": 2.3997607231140137, "learning_rate": 9.646856158484065e-08, "logits/chosen": -3.0824146270751953, "logits/rejected": -3.0605499744415283, "logps/chosen": -56.986732482910156, "logps/rejected": -54.80094528198242, "loss": 0.691, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.006157047115266323, "rewards/margins": 0.004412065260112286, "rewards/rejected": -0.010569113306701183, "step": 1120 }, { "epoch": 0.1946933149552033, "grad_norm": 2.228640556335449, "learning_rate": 9.732988802756244e-08, "logits/chosen": -3.1479599475860596, "logits/rejected": -3.1234474182128906, "logps/chosen": -52.66511917114258, "logps/rejected": -55.532569885253906, "loss": 0.6897, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.006327435374259949, "rewards/margins": 0.007075057830661535, "rewards/rejected": -0.013402493670582771, "step": 1130 }, { "epoch": 0.19641626464507236, "grad_norm": 2.443138837814331, "learning_rate": 9.819121447028424e-08, "logits/chosen": -3.0550379753112793, "logits/rejected": -3.0215907096862793, "logps/chosen": -57.4569091796875, "logps/rejected": -54.08552932739258, "loss": 0.6906, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005755481775850058, "rewards/margins": 0.005251362454146147, "rewards/rejected": -0.011006844229996204, "step": 1140 }, { "epoch": 0.19813921433494142, "grad_norm": 2.4156675338745117, "learning_rate": 9.905254091300602e-08, "logits/chosen": -2.994870662689209, "logits/rejected": -2.9757802486419678, "logps/chosen": -53.95721435546875, "logps/rejected": -55.600433349609375, "loss": 0.6906, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.006008354481309652, "rewards/margins": 0.005140149500221014, "rewards/rejected": -0.011148503981530666, "step": 1150 }, { "epoch": 0.19986216402481047, "grad_norm": 2.241849899291992, "learning_rate": 9.991386735572782e-08, "logits/chosen": -3.090665817260742, "logits/rejected": -3.081921100616455, "logps/chosen": -54.39727020263672, "logps/rejected": -55.57891845703125, "loss": 0.6899, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0037164140958338976, "rewards/margins": 0.00659269979223609, "rewards/rejected": -0.010309114120900631, "step": 1160 }, { "epoch": 0.20158511371467952, "grad_norm": 2.1488406658172607, "learning_rate": 9.999981687766457e-08, "logits/chosen": -2.9630703926086426, "logits/rejected": -2.9503164291381836, "logps/chosen": -52.81767654418945, "logps/rejected": -55.88347244262695, "loss": 0.6909, "rewards/accuracies": 0.59375, "rewards/chosen": -0.007383792661130428, "rewards/margins": 0.004716981668025255, "rewards/rejected": -0.012100773863494396, "step": 1170 }, { "epoch": 0.2033080634045486, "grad_norm": 2.5440945625305176, "learning_rate": 9.999918386390616e-08, "logits/chosen": -2.934640645980835, "logits/rejected": -2.903498411178589, "logps/chosen": -53.553077697753906, "logps/rejected": -52.60760498046875, "loss": 0.6889, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007512775249779224, "rewards/margins": 0.008542842231690884, "rewards/rejected": -0.016055617481470108, "step": 1180 }, { "epoch": 0.20503101309441765, "grad_norm": 2.4966301918029785, "learning_rate": 9.999809870367821e-08, "logits/chosen": -3.1306917667388916, "logits/rejected": -3.0955986976623535, "logps/chosen": -59.563941955566406, "logps/rejected": -51.870330810546875, "loss": 0.6902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008644647896289825, "rewards/margins": 0.006059793755412102, "rewards/rejected": -0.014704440720379353, "step": 1190 }, { "epoch": 0.2067539627842867, "grad_norm": 2.1367452144622803, "learning_rate": 9.999656140679395e-08, "logits/chosen": -2.962451934814453, "logits/rejected": -2.9473876953125, "logps/chosen": -54.0195426940918, "logps/rejected": -52.5385856628418, "loss": 0.6912, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.008550790138542652, "rewards/margins": 0.004117645788937807, "rewards/rejected": -0.012668436393141747, "step": 1200 }, { "epoch": 0.2067539627842867, "eval_logits/chosen": -3.1463611125946045, "eval_logits/rejected": -3.140730619430542, "eval_logps/chosen": -58.488128662109375, "eval_logps/rejected": -63.16765594482422, "eval_loss": 0.692119836807251, "eval_rewards/accuracies": 0.5794609785079956, "eval_rewards/chosen": 0.0022376305423676968, "eval_rewards/margins": 0.002112939953804016, "eval_rewards/rejected": 0.00012469064677134156, "eval_runtime": 358.7316, "eval_samples_per_second": 11.998, "eval_steps_per_second": 1.5, "step": 1200 }, { "epoch": 0.20847691247415576, "grad_norm": 2.3286216259002686, "learning_rate": 9.999457198715525e-08, "logits/chosen": -3.0613112449645996, "logits/rejected": -3.0273356437683105, "logps/chosen": -54.412635803222656, "logps/rejected": -54.4266471862793, "loss": 0.6893, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.006243783514946699, "rewards/margins": 0.00786572601646185, "rewards/rejected": -0.014109509997069836, "step": 1210 }, { "epoch": 0.2101998621640248, "grad_norm": 2.1323070526123047, "learning_rate": 9.999213046275256e-08, "logits/chosen": -3.0718696117401123, "logits/rejected": -3.0449776649475098, "logps/chosen": -54.26097869873047, "logps/rejected": -53.925315856933594, "loss": 0.6894, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008796758949756622, "rewards/margins": 0.007621658034622669, "rewards/rejected": -0.016418416053056717, "step": 1220 }, { "epoch": 0.21192281185389386, "grad_norm": 2.6729142665863037, "learning_rate": 9.99892368556648e-08, "logits/chosen": -3.1416068077087402, "logits/rejected": -3.0996181964874268, "logps/chosen": -56.39683151245117, "logps/rejected": -53.861724853515625, "loss": 0.6883, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.006459666881710291, "rewards/margins": 0.009909730404615402, "rewards/rejected": -0.016369396820664406, "step": 1230 }, { "epoch": 0.2136457615437629, "grad_norm": 2.1508169174194336, "learning_rate": 9.998589119205909e-08, "logits/chosen": -3.002638816833496, "logits/rejected": -2.98604679107666, "logps/chosen": -53.4490852355957, "logps/rejected": -54.5033073425293, "loss": 0.6899, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.01011999323964119, "rewards/margins": 0.006711783818900585, "rewards/rejected": -0.0168317761272192, "step": 1240 }, { "epoch": 0.21536871123363197, "grad_norm": 2.67529034614563, "learning_rate": 9.99820935021905e-08, "logits/chosen": -3.1221249103546143, "logits/rejected": -3.085853099822998, "logps/chosen": -54.63789749145508, "logps/rejected": -52.832984924316406, "loss": 0.6897, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.009338639676570892, "rewards/margins": 0.007117290049791336, "rewards/rejected": -0.01645592972636223, "step": 1250 }, { "epoch": 0.21709166092350105, "grad_norm": 2.1187283992767334, "learning_rate": 9.997784382040184e-08, "logits/chosen": -2.9702210426330566, "logits/rejected": -2.9389166831970215, "logps/chosen": -53.91698455810547, "logps/rejected": -52.1651611328125, "loss": 0.6886, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.009416448883712292, "rewards/margins": 0.0093917828053236, "rewards/rejected": -0.018808234483003616, "step": 1260 }, { "epoch": 0.2188146106133701, "grad_norm": 2.4756853580474854, "learning_rate": 9.997314218512333e-08, "logits/chosen": -3.030531644821167, "logits/rejected": -3.024228572845459, "logps/chosen": -52.87784957885742, "logps/rejected": -56.451568603515625, "loss": 0.6895, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.010020340792834759, "rewards/margins": 0.007446722127497196, "rewards/rejected": -0.017467062920331955, "step": 1270 }, { "epoch": 0.22053756030323915, "grad_norm": 2.364240884780884, "learning_rate": 9.996798863887219e-08, "logits/chosen": -3.1014938354492188, "logits/rejected": -3.1085102558135986, "logps/chosen": -52.22028732299805, "logps/rejected": -61.497581481933594, "loss": 0.6906, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.010487397201359272, "rewards/margins": 0.005354976747184992, "rewards/rejected": -0.01584237441420555, "step": 1280 }, { "epoch": 0.2222605099931082, "grad_norm": 2.251617193222046, "learning_rate": 9.996238322825236e-08, "logits/chosen": -3.001249074935913, "logits/rejected": -2.9696197509765625, "logps/chosen": -55.589752197265625, "logps/rejected": -52.25542449951172, "loss": 0.6891, "rewards/accuracies": 0.625, "rewards/chosen": -0.009888270869851112, "rewards/margins": 0.008468730375170708, "rewards/rejected": -0.01835699938237667, "step": 1290 }, { "epoch": 0.22398345968297725, "grad_norm": 2.166390895843506, "learning_rate": 9.995632600395398e-08, "logits/chosen": -3.0807461738586426, "logits/rejected": -3.0679678916931152, "logps/chosen": -51.707557678222656, "logps/rejected": -55.653839111328125, "loss": 0.6911, "rewards/accuracies": 0.5625, "rewards/chosen": -0.014926217496395111, "rewards/margins": 0.004388140980154276, "rewards/rejected": -0.0193143580108881, "step": 1300 }, { "epoch": 0.22398345968297725, "eval_logits/chosen": -3.141386032104492, "eval_logits/rejected": -3.1357979774475098, "eval_logps/chosen": -58.53720474243164, "eval_logps/rejected": -63.28915023803711, "eval_loss": 0.6917756199836731, "eval_rewards/accuracies": 0.5901486873626709, "eval_rewards/chosen": 0.0017469110898673534, "eval_rewards/margins": 0.0028372127562761307, "eval_rewards/rejected": -0.0010903014335781336, "eval_runtime": 358.9888, "eval_samples_per_second": 11.989, "eval_steps_per_second": 1.499, "step": 1300 }, { "epoch": 0.2257064093728463, "grad_norm": 1.9226795434951782, "learning_rate": 9.9949817020753e-08, "logits/chosen": -3.080859661102295, "logits/rejected": -3.045379877090454, "logps/chosen": -54.435699462890625, "logps/rejected": -51.69322967529297, "loss": 0.6874, "rewards/accuracies": 0.625, "rewards/chosen": -0.010951696895062923, "rewards/margins": 0.011780351400375366, "rewards/rejected": -0.022732049226760864, "step": 1310 }, { "epoch": 0.22742935906271536, "grad_norm": 2.345458745956421, "learning_rate": 9.994285633751067e-08, "logits/chosen": -3.053536891937256, "logits/rejected": -3.0280728340148926, "logps/chosen": -56.30860137939453, "logps/rejected": -52.7589225769043, "loss": 0.6876, "rewards/accuracies": 0.625, "rewards/chosen": -0.008888078853487968, "rewards/margins": 0.011339199729263783, "rewards/rejected": -0.020227279514074326, "step": 1320 }, { "epoch": 0.22915230875258444, "grad_norm": 2.4452571868896484, "learning_rate": 9.993544401717297e-08, "logits/chosen": -3.1321685314178467, "logits/rejected": -3.1015255451202393, "logps/chosen": -56.034820556640625, "logps/rejected": -52.01787185668945, "loss": 0.688, "rewards/accuracies": 0.65625, "rewards/chosen": -0.008172191679477692, "rewards/margins": 0.010616080835461617, "rewards/rejected": -0.018788272514939308, "step": 1330 }, { "epoch": 0.2308752584424535, "grad_norm": 2.2883365154266357, "learning_rate": 9.992758012677008e-08, "logits/chosen": -2.973372459411621, "logits/rejected": -2.951862335205078, "logps/chosen": -56.06450271606445, "logps/rejected": -54.80854415893555, "loss": 0.6882, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.009250979870557785, "rewards/margins": 0.010158119723200798, "rewards/rejected": -0.019409101456403732, "step": 1340 }, { "epoch": 0.23259820813232254, "grad_norm": 2.2209107875823975, "learning_rate": 9.991926473741578e-08, "logits/chosen": -3.038304090499878, "logits/rejected": -3.013697862625122, "logps/chosen": -55.443382263183594, "logps/rejected": -55.3911018371582, "loss": 0.6893, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.013421006500720978, "rewards/margins": 0.008150272071361542, "rewards/rejected": -0.02157127857208252, "step": 1350 }, { "epoch": 0.2343211578221916, "grad_norm": 2.3511853218078613, "learning_rate": 9.991049792430679e-08, "logits/chosen": -2.9954142570495605, "logits/rejected": -2.9832215309143066, "logps/chosen": -56.58509063720703, "logps/rejected": -58.09856033325195, "loss": 0.6899, "rewards/accuracies": 0.59375, "rewards/chosen": -0.013825833797454834, "rewards/margins": 0.006910757161676884, "rewards/rejected": -0.020736588165163994, "step": 1360 }, { "epoch": 0.23604410751206065, "grad_norm": 2.4769160747528076, "learning_rate": 9.990127976672203e-08, "logits/chosen": -3.1084232330322266, "logits/rejected": -3.0888638496398926, "logps/chosen": -53.93939208984375, "logps/rejected": -55.24439239501953, "loss": 0.6873, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.012645436450839043, "rewards/margins": 0.011925434693694115, "rewards/rejected": -0.024570871144533157, "step": 1370 }, { "epoch": 0.2377670572019297, "grad_norm": 2.2031283378601074, "learning_rate": 9.989161034802205e-08, "logits/chosen": -3.0005006790161133, "logits/rejected": -2.9822466373443604, "logps/chosen": -55.36827850341797, "logps/rejected": -54.40727996826172, "loss": 0.6881, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.010355006903409958, "rewards/margins": 0.010543933138251305, "rewards/rejected": -0.020898941904306412, "step": 1380 }, { "epoch": 0.23949000689179875, "grad_norm": 2.3673853874206543, "learning_rate": 9.988148975564812e-08, "logits/chosen": -3.1491761207580566, "logits/rejected": -3.1218996047973633, "logps/chosen": -58.048797607421875, "logps/rejected": -55.777122497558594, "loss": 0.6875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.011549661867320538, "rewards/margins": 0.011656830087304115, "rewards/rejected": -0.023206491023302078, "step": 1390 }, { "epoch": 0.2412129565816678, "grad_norm": 2.19177508354187, "learning_rate": 9.987091808112155e-08, "logits/chosen": -2.969104528427124, "logits/rejected": -2.9434850215911865, "logps/chosen": -57.14324951171875, "logps/rejected": -55.39662551879883, "loss": 0.6871, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.013344550505280495, "rewards/margins": 0.012570117600262165, "rewards/rejected": -0.025914669036865234, "step": 1400 }, { "epoch": 0.2412129565816678, "eval_logits/chosen": -3.135641574859619, "eval_logits/rejected": -3.1300060749053955, "eval_logps/chosen": -58.64906692504883, "eval_logps/rejected": -63.489479064941406, "eval_loss": 0.6913572549819946, "eval_rewards/accuracies": 0.5785316228866577, "eval_rewards/chosen": 0.000628301000688225, "eval_rewards/margins": 0.0037218371871858835, "eval_rewards/rejected": -0.0030935362447053194, "eval_runtime": 358.6517, "eval_samples_per_second": 12.001, "eval_steps_per_second": 1.5, "step": 1400 }, { "epoch": 0.24293590627153688, "grad_norm": 2.0600688457489014, "learning_rate": 9.985989542004283e-08, "logits/chosen": -3.032985210418701, "logits/rejected": -3.02095365524292, "logps/chosen": -55.83747100830078, "logps/rejected": -55.733192443847656, "loss": 0.688, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.014638135209679604, "rewards/margins": 0.010690612718462944, "rewards/rejected": -0.0253287460654974, "step": 1410 }, { "epoch": 0.24465885596140594, "grad_norm": 2.159818649291992, "learning_rate": 9.984842187209068e-08, "logits/chosen": -2.950575590133667, "logits/rejected": -2.956359624862671, "logps/chosen": -52.466026306152344, "logps/rejected": -56.8805046081543, "loss": 0.6909, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.017118731513619423, "rewards/margins": 0.004853106569498777, "rewards/rejected": -0.021971840411424637, "step": 1420 }, { "epoch": 0.246381805651275, "grad_norm": 2.4923534393310547, "learning_rate": 9.983649754102133e-08, "logits/chosen": -3.06890869140625, "logits/rejected": -3.0572915077209473, "logps/chosen": -54.95482635498047, "logps/rejected": -59.163429260253906, "loss": 0.6869, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.01370183564722538, "rewards/margins": 0.013033677823841572, "rewards/rejected": -0.026735514402389526, "step": 1430 }, { "epoch": 0.24810475534114404, "grad_norm": 2.1912496089935303, "learning_rate": 9.982412253466739e-08, "logits/chosen": -2.995776653289795, "logits/rejected": -2.964540958404541, "logps/chosen": -52.209800720214844, "logps/rejected": -51.3564338684082, "loss": 0.6872, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.017473148182034492, "rewards/margins": 0.012446993961930275, "rewards/rejected": -0.029920142143964767, "step": 1440 }, { "epoch": 0.2498277050310131, "grad_norm": 2.883305788040161, "learning_rate": 9.9811296964937e-08, "logits/chosen": -3.008527994155884, "logits/rejected": -2.971277952194214, "logps/chosen": -56.74669647216797, "logps/rejected": -54.7548713684082, "loss": 0.6856, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.012715873308479786, "rewards/margins": 0.015632428228855133, "rewards/rejected": -0.028348300606012344, "step": 1450 }, { "epoch": 0.25155065472088217, "grad_norm": 2.354778289794922, "learning_rate": 9.97980209478128e-08, "logits/chosen": -3.0111136436462402, "logits/rejected": -2.9785988330841064, "logps/chosen": -53.67363357543945, "logps/rejected": -53.75532913208008, "loss": 0.6869, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.019740106537938118, "rewards/margins": 0.01316138543188572, "rewards/rejected": -0.03290148824453354, "step": 1460 }, { "epoch": 0.2532736044107512, "grad_norm": 2.4939029216766357, "learning_rate": 9.97842946033508e-08, "logits/chosen": -3.1756339073181152, "logits/rejected": -3.1396703720092773, "logps/chosen": -58.40922164916992, "logps/rejected": -57.95037841796875, "loss": 0.684, "rewards/accuracies": 0.65625, "rewards/chosen": -0.01385563611984253, "rewards/margins": 0.01888427510857582, "rewards/rejected": -0.03273991495370865, "step": 1470 }, { "epoch": 0.2549965541006203, "grad_norm": 2.315673351287842, "learning_rate": 9.977011805567941e-08, "logits/chosen": -3.1426024436950684, "logits/rejected": -3.1065621376037598, "logps/chosen": -53.217979431152344, "logps/rejected": -54.60027313232422, "loss": 0.6886, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.01907247118651867, "rewards/margins": 0.00952645018696785, "rewards/rejected": -0.028598923236131668, "step": 1480 }, { "epoch": 0.2567195037904893, "grad_norm": 2.4700214862823486, "learning_rate": 9.975549143299824e-08, "logits/chosen": -3.0736770629882812, "logits/rejected": -3.0491397380828857, "logps/chosen": -59.33967971801758, "logps/rejected": -56.70935821533203, "loss": 0.688, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.01955995336174965, "rewards/margins": 0.010835406370460987, "rewards/rejected": -0.030395355075597763, "step": 1490 }, { "epoch": 0.2584424534803584, "grad_norm": 2.4240033626556396, "learning_rate": 9.974041486757696e-08, "logits/chosen": -2.9827020168304443, "logits/rejected": -2.9801254272460938, "logps/chosen": -52.06401824951172, "logps/rejected": -56.11759567260742, "loss": 0.6866, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.020753109827637672, "rewards/margins": 0.013757812790572643, "rewards/rejected": -0.03451092168688774, "step": 1500 }, { "epoch": 0.2584424534803584, "eval_logits/chosen": -3.130272150039673, "eval_logits/rejected": -3.124645471572876, "eval_logps/chosen": -58.866065979003906, "eval_logps/rejected": -63.78525924682617, "eval_loss": 0.6909964680671692, "eval_rewards/accuracies": 0.5750464797019958, "eval_rewards/chosen": -0.001541697303764522, "eval_rewards/margins": 0.0045096841640770435, "eval_rewards/rejected": -0.006051382049918175, "eval_runtime": 358.407, "eval_samples_per_second": 12.009, "eval_steps_per_second": 1.501, "step": 1500 }, { "epoch": 0.2601654031702274, "grad_norm": 2.4855523109436035, "learning_rate": 9.972488849575411e-08, "logits/chosen": -2.933039903640747, "logits/rejected": -2.8975093364715576, "logps/chosen": -60.27305221557617, "logps/rejected": -56.31962203979492, "loss": 0.6844, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.01667121797800064, "rewards/margins": 0.018259279429912567, "rewards/rejected": -0.034930501133203506, "step": 1510 }, { "epoch": 0.2618883528600965, "grad_norm": 2.682542085647583, "learning_rate": 9.970891245793588e-08, "logits/chosen": -3.045544385910034, "logits/rejected": -3.0375313758850098, "logps/chosen": -54.4551887512207, "logps/rejected": -54.225868225097656, "loss": 0.6908, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.022045383229851723, "rewards/margins": 0.0052342889830470085, "rewards/rejected": -0.027279671281576157, "step": 1520 }, { "epoch": 0.26361130254996556, "grad_norm": 2.655073881149292, "learning_rate": 9.96924868985948e-08, "logits/chosen": -2.9312405586242676, "logits/rejected": -2.9148247241973877, "logps/chosen": -53.90313720703125, "logps/rejected": -54.13452911376953, "loss": 0.6877, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02214507758617401, "rewards/margins": 0.011658025905489922, "rewards/rejected": -0.03380310535430908, "step": 1530 }, { "epoch": 0.2653342522398346, "grad_norm": 2.3471813201904297, "learning_rate": 9.967561196626846e-08, "logits/chosen": -3.059717893600464, "logits/rejected": -3.0278360843658447, "logps/chosen": -58.14380645751953, "logps/rejected": -54.910728454589844, "loss": 0.6852, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.022383395582437515, "rewards/margins": 0.016689913347363472, "rewards/rejected": -0.03907330706715584, "step": 1540 }, { "epoch": 0.26705720192970367, "grad_norm": 2.238987684249878, "learning_rate": 9.965828781355818e-08, "logits/chosen": -2.978020668029785, "logits/rejected": -2.9583582878112793, "logps/chosen": -55.6759033203125, "logps/rejected": -54.38631057739258, "loss": 0.6858, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02306569740176201, "rewards/margins": 0.015391230583190918, "rewards/rejected": -0.03845692425966263, "step": 1550 }, { "epoch": 0.2687801516195727, "grad_norm": 2.3482720851898193, "learning_rate": 9.964051459712762e-08, "logits/chosen": -2.991518974304199, "logits/rejected": -2.974125385284424, "logps/chosen": -53.505340576171875, "logps/rejected": -54.268577575683594, "loss": 0.6887, "rewards/accuracies": 0.59375, "rewards/chosen": -0.026575718075037003, "rewards/margins": 0.00950780138373375, "rewards/rejected": -0.03608352318406105, "step": 1560 }, { "epoch": 0.2705031013094418, "grad_norm": 2.3481009006500244, "learning_rate": 9.962229247770133e-08, "logits/chosen": -3.042026996612549, "logits/rejected": -3.04404878616333, "logps/chosen": -54.13409423828125, "logps/rejected": -59.780250549316406, "loss": 0.6868, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02218012697994709, "rewards/margins": 0.0133208017796278, "rewards/rejected": -0.03550093248486519, "step": 1570 }, { "epoch": 0.2722260509993108, "grad_norm": 2.7631266117095947, "learning_rate": 9.960362162006333e-08, "logits/chosen": -2.977109909057617, "logits/rejected": -2.96714448928833, "logps/chosen": -55.50933837890625, "logps/rejected": -57.580101013183594, "loss": 0.688, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02594080939888954, "rewards/margins": 0.010953729972243309, "rewards/rejected": -0.0368945375084877, "step": 1580 }, { "epoch": 0.2739490006891799, "grad_norm": 2.5365638732910156, "learning_rate": 9.958450219305565e-08, "logits/chosen": -3.022242546081543, "logits/rejected": -3.007664918899536, "logps/chosen": -56.24147415161133, "logps/rejected": -57.927711486816406, "loss": 0.6876, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.02111046575009823, "rewards/margins": 0.011745044030249119, "rewards/rejected": -0.03285551071166992, "step": 1590 }, { "epoch": 0.27567195037904896, "grad_norm": 2.4298782348632812, "learning_rate": 9.956493436957672e-08, "logits/chosen": -3.0135552883148193, "logits/rejected": -2.9755609035491943, "logps/chosen": -54.7454833984375, "logps/rejected": -54.69611358642578, "loss": 0.6876, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.025662336498498917, "rewards/margins": 0.011798778548836708, "rewards/rejected": -0.037461116909980774, "step": 1600 }, { "epoch": 0.27567195037904896, "eval_logits/chosen": -3.124072790145874, "eval_logits/rejected": -3.118478775024414, "eval_logps/chosen": -59.09284591674805, "eval_logps/rejected": -64.08631134033203, "eval_loss": 0.6906586289405823, "eval_rewards/accuracies": 0.5873606204986572, "eval_rewards/chosen": -0.0038095172494649887, "eval_rewards/margins": 0.005252342205494642, "eval_rewards/rejected": -0.009061858989298344, "eval_runtime": 358.6329, "eval_samples_per_second": 12.001, "eval_steps_per_second": 1.5, "step": 1600 }, { "epoch": 0.277394900068918, "grad_norm": 2.2438275814056396, "learning_rate": 9.954491832657987e-08, "logits/chosen": -3.0523838996887207, "logits/rejected": -3.0273146629333496, "logps/chosen": -58.795616149902344, "logps/rejected": -61.6534309387207, "loss": 0.6883, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.029015138745307922, "rewards/margins": 0.010533750057220459, "rewards/rejected": -0.03954889252781868, "step": 1610 }, { "epoch": 0.27911784975878706, "grad_norm": 2.277097225189209, "learning_rate": 9.952445424507174e-08, "logits/chosen": -2.974520206451416, "logits/rejected": -2.9524998664855957, "logps/chosen": -54.341697692871094, "logps/rejected": -58.2484016418457, "loss": 0.6872, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.029016416519880295, "rewards/margins": 0.012814966030418873, "rewards/rejected": -0.04183139279484749, "step": 1620 }, { "epoch": 0.2808407994486561, "grad_norm": 2.4578676223754883, "learning_rate": 9.950354231011059e-08, "logits/chosen": -3.0443010330200195, "logits/rejected": -3.0180282592773438, "logps/chosen": -57.86555099487305, "logps/rejected": -55.727874755859375, "loss": 0.6854, "rewards/accuracies": 0.625, "rewards/chosen": -0.023557905107736588, "rewards/margins": 0.016457753255963326, "rewards/rejected": -0.04001566022634506, "step": 1630 }, { "epoch": 0.28256374913852517, "grad_norm": 2.5449278354644775, "learning_rate": 9.948218271080464e-08, "logits/chosen": -3.085779905319214, "logits/rejected": -3.0488622188568115, "logps/chosen": -59.664390563964844, "logps/rejected": -55.85193634033203, "loss": 0.6847, "rewards/accuracies": 0.59375, "rewards/chosen": -0.027100270614027977, "rewards/margins": 0.017988886684179306, "rewards/rejected": -0.045089155435562134, "step": 1640 }, { "epoch": 0.2842866988283942, "grad_norm": 2.6558942794799805, "learning_rate": 9.94603756403104e-08, "logits/chosen": -3.0556998252868652, "logits/rejected": -3.0283634662628174, "logps/chosen": -59.260841369628906, "logps/rejected": -58.779266357421875, "loss": 0.6847, "rewards/accuracies": 0.65625, "rewards/chosen": -0.020244024693965912, "rewards/margins": 0.017693202942609787, "rewards/rejected": -0.0379372276365757, "step": 1650 }, { "epoch": 0.28600964851826327, "grad_norm": 2.612154722213745, "learning_rate": 9.943812129583088e-08, "logits/chosen": -3.0901899337768555, "logits/rejected": -3.0435256958007812, "logps/chosen": -57.216285705566406, "logps/rejected": -54.11063766479492, "loss": 0.6829, "rewards/accuracies": 0.65625, "rewards/chosen": -0.022447334602475166, "rewards/margins": 0.02150181494653225, "rewards/rejected": -0.043949149549007416, "step": 1660 }, { "epoch": 0.2877325982081323, "grad_norm": 2.6352086067199707, "learning_rate": 9.941541987861386e-08, "logits/chosen": -3.0773003101348877, "logits/rejected": -3.055241584777832, "logps/chosen": -56.383026123046875, "logps/rejected": -59.22760772705078, "loss": 0.6827, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.01868412271142006, "rewards/margins": 0.021537818014621735, "rewards/rejected": -0.040221940726041794, "step": 1670 }, { "epoch": 0.2894555478980014, "grad_norm": 2.385436534881592, "learning_rate": 9.939227159394998e-08, "logits/chosen": -3.0334177017211914, "logits/rejected": -3.007333278656006, "logps/chosen": -58.554542541503906, "logps/rejected": -59.123863220214844, "loss": 0.6848, "rewards/accuracies": 0.625, "rewards/chosen": -0.0294620580971241, "rewards/margins": 0.01769358292222023, "rewards/rejected": -0.04715564101934433, "step": 1680 }, { "epoch": 0.29117849758787046, "grad_norm": 2.487203359603882, "learning_rate": 9.936867665117098e-08, "logits/chosen": -3.0129518508911133, "logits/rejected": -2.9973418712615967, "logps/chosen": -58.51457595825195, "logps/rejected": -55.72222137451172, "loss": 0.6911, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.029692823067307472, "rewards/margins": 0.004795686341822147, "rewards/rejected": -0.03448851406574249, "step": 1690 }, { "epoch": 0.2929014472777395, "grad_norm": 2.6827774047851562, "learning_rate": 9.93446352636477e-08, "logits/chosen": -2.9218242168426514, "logits/rejected": -2.926464557647705, "logps/chosen": -52.7581901550293, "logps/rejected": -59.58576202392578, "loss": 0.6882, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.031951457262039185, "rewards/margins": 0.010778420604765415, "rewards/rejected": -0.04272987321019173, "step": 1700 }, { "epoch": 0.2929014472777395, "eval_logits/chosen": -3.1172916889190674, "eval_logits/rejected": -3.1116535663604736, "eval_logps/chosen": -59.38001251220703, "eval_logps/rejected": -64.44491577148438, "eval_loss": 0.6903406977653503, "eval_rewards/accuracies": 0.5850371718406677, "eval_rewards/chosen": -0.006681189872324467, "eval_rewards/margins": 0.005966781172901392, "eval_rewards/rejected": -0.012647970579564571, "eval_runtime": 358.5851, "eval_samples_per_second": 12.003, "eval_steps_per_second": 1.5, "step": 1700 }, { "epoch": 0.29462439696760856, "grad_norm": 2.7968385219573975, "learning_rate": 9.932014764878828e-08, "logits/chosen": -3.0532126426696777, "logits/rejected": -3.0127451419830322, "logps/chosen": -61.0193977355957, "logps/rejected": -58.547027587890625, "loss": 0.6863, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.026096154004335403, "rewards/margins": 0.014539210125803947, "rewards/rejected": -0.0406353622674942, "step": 1710 }, { "epoch": 0.2963473466574776, "grad_norm": 2.369809865951538, "learning_rate": 9.929521402803614e-08, "logits/chosen": -3.06132173538208, "logits/rejected": -3.0323057174682617, "logps/chosen": -59.400787353515625, "logps/rejected": -55.1656608581543, "loss": 0.6879, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.029749011620879173, "rewards/margins": 0.011349111795425415, "rewards/rejected": -0.04109812527894974, "step": 1720 }, { "epoch": 0.29807029634734666, "grad_norm": 2.2750089168548584, "learning_rate": 9.92698346268679e-08, "logits/chosen": -3.0554797649383545, "logits/rejected": -3.0384256839752197, "logps/chosen": -57.126609802246094, "logps/rejected": -56.65547561645508, "loss": 0.6888, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03192286565899849, "rewards/margins": 0.009574519470334053, "rewards/rejected": -0.04149738699197769, "step": 1730 }, { "epoch": 0.2997932460372157, "grad_norm": 2.516836404800415, "learning_rate": 9.924400967479145e-08, "logits/chosen": -3.0047802925109863, "logits/rejected": -2.994741678237915, "logps/chosen": -55.8693733215332, "logps/rejected": -58.82105255126953, "loss": 0.6887, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03768100589513779, "rewards/margins": 0.009916347451508045, "rewards/rejected": -0.04759735241532326, "step": 1740 }, { "epoch": 0.30151619572708477, "grad_norm": 2.693798303604126, "learning_rate": 9.921773940534381e-08, "logits/chosen": -3.0474226474761963, "logits/rejected": -3.0362508296966553, "logps/chosen": -57.77741622924805, "logps/rejected": -59.905982971191406, "loss": 0.6871, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.025913890451192856, "rewards/margins": 0.012879503890872002, "rewards/rejected": -0.03879339620471001, "step": 1750 }, { "epoch": 0.30323914541695385, "grad_norm": 2.5608084201812744, "learning_rate": 9.919102405608905e-08, "logits/chosen": -3.0343425273895264, "logits/rejected": -2.998051404953003, "logps/chosen": -58.855934143066406, "logps/rejected": -58.174530029296875, "loss": 0.6817, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.028327789157629013, "rewards/margins": 0.023931993171572685, "rewards/rejected": -0.05225978419184685, "step": 1760 }, { "epoch": 0.3049620951068229, "grad_norm": 2.1804721355438232, "learning_rate": 9.916386386861613e-08, "logits/chosen": -3.060925245285034, "logits/rejected": -3.0244739055633545, "logps/chosen": -58.5760498046875, "logps/rejected": -54.62489700317383, "loss": 0.6833, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.035943519324064255, "rewards/margins": 0.020883310586214066, "rewards/rejected": -0.05682682991027832, "step": 1770 }, { "epoch": 0.30668504479669195, "grad_norm": 2.838467836380005, "learning_rate": 9.913625908853674e-08, "logits/chosen": -2.976637363433838, "logits/rejected": -2.978466033935547, "logps/chosen": -57.95159149169922, "logps/rejected": -63.324363708496094, "loss": 0.6865, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.034330397844314575, "rewards/margins": 0.01428927667438984, "rewards/rejected": -0.048619672656059265, "step": 1780 }, { "epoch": 0.308407994486561, "grad_norm": 2.678936719894409, "learning_rate": 9.910820996548301e-08, "logits/chosen": -3.1204793453216553, "logits/rejected": -3.0981993675231934, "logps/chosen": -55.150596618652344, "logps/rejected": -57.8071174621582, "loss": 0.6831, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03155216947197914, "rewards/margins": 0.02121361531317234, "rewards/rejected": -0.05276578664779663, "step": 1790 }, { "epoch": 0.31013094417643006, "grad_norm": 2.4490652084350586, "learning_rate": 9.907971675310532e-08, "logits/chosen": -3.0842478275299072, "logits/rejected": -3.060288190841675, "logps/chosen": -55.61701202392578, "logps/rejected": -57.78422927856445, "loss": 0.6838, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0381484255194664, "rewards/margins": 0.019785432144999504, "rewards/rejected": -0.05793385952711105, "step": 1800 }, { "epoch": 0.31013094417643006, "eval_logits/chosen": -3.109452486038208, "eval_logits/rejected": -3.103776216506958, "eval_logps/chosen": -59.92013931274414, "eval_logps/rejected": -65.07719421386719, "eval_loss": 0.6899515390396118, "eval_rewards/accuracies": 0.5824813842773438, "eval_rewards/chosen": -0.012082410044968128, "eval_rewards/margins": 0.006888336502015591, "eval_rewards/rejected": -0.01897074468433857, "eval_runtime": 358.2772, "eval_samples_per_second": 12.013, "eval_steps_per_second": 1.502, "step": 1800 }, { "epoch": 0.3118538938662991, "grad_norm": 2.658210039138794, "learning_rate": 9.905077970906998e-08, "logits/chosen": -3.0863704681396484, "logits/rejected": -3.0510458946228027, "logps/chosen": -59.87849807739258, "logps/rejected": -58.87512969970703, "loss": 0.6841, "rewards/accuracies": 0.65625, "rewards/chosen": -0.032613541930913925, "rewards/margins": 0.01923174224793911, "rewards/rejected": -0.05184528976678848, "step": 1810 }, { "epoch": 0.31357684355616816, "grad_norm": 2.5303404331207275, "learning_rate": 9.902139909505691e-08, "logits/chosen": -3.056530475616455, "logits/rejected": -3.0221753120422363, "logps/chosen": -58.96949005126953, "logps/rejected": -56.16400909423828, "loss": 0.6854, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03807400166988373, "rewards/margins": 0.01674531027674675, "rewards/rejected": -0.05481930822134018, "step": 1820 }, { "epoch": 0.31529979324603724, "grad_norm": 2.4892539978027344, "learning_rate": 9.899157517675728e-08, "logits/chosen": -2.952310085296631, "logits/rejected": -2.938739061355591, "logps/chosen": -59.15592575073242, "logps/rejected": -59.31223678588867, "loss": 0.6889, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.036126311868429184, "rewards/margins": 0.009667845442891121, "rewards/rejected": -0.045794155448675156, "step": 1830 }, { "epoch": 0.31702274293590627, "grad_norm": 2.669368028640747, "learning_rate": 9.896130822387107e-08, "logits/chosen": -3.159759044647217, "logits/rejected": -3.155785322189331, "logps/chosen": -59.499298095703125, "logps/rejected": -60.6772575378418, "loss": 0.6889, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.04679185897111893, "rewards/margins": 0.009518241509795189, "rewards/rejected": -0.05631009861826897, "step": 1840 }, { "epoch": 0.31874569262577535, "grad_norm": 2.5876333713531494, "learning_rate": 9.893059851010465e-08, "logits/chosen": -3.025189161300659, "logits/rejected": -3.0251142978668213, "logps/chosen": -57.66413497924805, "logps/rejected": -60.19395065307617, "loss": 0.6886, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.04454426094889641, "rewards/margins": 0.010319806635379791, "rewards/rejected": -0.0548640713095665, "step": 1850 }, { "epoch": 0.32046864231564437, "grad_norm": 2.732103109359741, "learning_rate": 9.889944631316835e-08, "logits/chosen": -2.9574267864227295, "logits/rejected": -2.921621322631836, "logps/chosen": -60.49852752685547, "logps/rejected": -59.736732482910156, "loss": 0.6798, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.032065801322460175, "rewards/margins": 0.027947356924414635, "rewards/rejected": -0.06001315638422966, "step": 1860 }, { "epoch": 0.32219159200551345, "grad_norm": 3.014956474304199, "learning_rate": 9.886785191477388e-08, "logits/chosen": -3.003999710083008, "logits/rejected": -2.9785971641540527, "logps/chosen": -59.9724006652832, "logps/rejected": -60.295265197753906, "loss": 0.6814, "rewards/accuracies": 0.625, "rewards/chosen": -0.04022688791155815, "rewards/margins": 0.024841442704200745, "rewards/rejected": -0.06506834179162979, "step": 1870 }, { "epoch": 0.3239145416953825, "grad_norm": 2.690962314605713, "learning_rate": 9.883581560063181e-08, "logits/chosen": -3.0487072467803955, "logits/rejected": -3.011261224746704, "logps/chosen": -56.3128662109375, "logps/rejected": -57.95994186401367, "loss": 0.6795, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03597037121653557, "rewards/margins": 0.028922447934746742, "rewards/rejected": -0.06489281356334686, "step": 1880 }, { "epoch": 0.32563749138525155, "grad_norm": 2.828683376312256, "learning_rate": 9.8803337660449e-08, "logits/chosen": -3.071624517440796, "logits/rejected": -3.065598964691162, "logps/chosen": -56.401588439941406, "logps/rejected": -60.657203674316406, "loss": 0.6796, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0334382988512516, "rewards/margins": 0.02865518070757389, "rewards/rejected": -0.062093477696180344, "step": 1890 }, { "epoch": 0.32736044107512063, "grad_norm": 2.8319833278656006, "learning_rate": 9.877041838792595e-08, "logits/chosen": -3.0038390159606934, "logits/rejected": -2.9711012840270996, "logps/chosen": -60.720428466796875, "logps/rejected": -57.573692321777344, "loss": 0.6836, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.032806314527988434, "rewards/margins": 0.020312577486038208, "rewards/rejected": -0.053118884563446045, "step": 1900 }, { "epoch": 0.32736044107512063, "eval_logits/chosen": -3.103686571121216, "eval_logits/rejected": -3.0980401039123535, "eval_logps/chosen": -60.28012466430664, "eval_logps/rejected": -65.52766418457031, "eval_loss": 0.6895469427108765, "eval_rewards/accuracies": 0.5882899761199951, "eval_rewards/chosen": -0.01568230614066124, "eval_rewards/margins": 0.007793075405061245, "eval_rewards/rejected": -0.02347538247704506, "eval_runtime": 359.1258, "eval_samples_per_second": 11.985, "eval_steps_per_second": 1.498, "step": 1900 }, { "epoch": 0.32908339076498966, "grad_norm": 2.763634204864502, "learning_rate": 9.87370580807542e-08, "logits/chosen": -2.9044349193573, "logits/rejected": -2.8699584007263184, "logps/chosen": -58.88459014892578, "logps/rejected": -59.89939498901367, "loss": 0.6829, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.044168949127197266, "rewards/margins": 0.02184966765344143, "rewards/rejected": -0.06601861119270325, "step": 1910 }, { "epoch": 0.33080634045485874, "grad_norm": 2.6262426376342773, "learning_rate": 9.870325704061355e-08, "logits/chosen": -3.0172269344329834, "logits/rejected": -2.98966121673584, "logps/chosen": -59.44633102416992, "logps/rejected": -60.100379943847656, "loss": 0.6823, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0407765693962574, "rewards/margins": 0.02320735715329647, "rewards/rejected": -0.06398393213748932, "step": 1920 }, { "epoch": 0.33252929014472776, "grad_norm": 2.3864188194274902, "learning_rate": 9.866901557316944e-08, "logits/chosen": -2.964810609817505, "logits/rejected": -2.9282431602478027, "logps/chosen": -58.27106857299805, "logps/rejected": -60.40107345581055, "loss": 0.6829, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.043806519359350204, "rewards/margins": 0.021700460463762283, "rewards/rejected": -0.06550697982311249, "step": 1930 }, { "epoch": 0.33425223983459684, "grad_norm": 2.4197654724121094, "learning_rate": 9.863433398807007e-08, "logits/chosen": -2.9881813526153564, "logits/rejected": -2.969853162765503, "logps/chosen": -55.707054138183594, "logps/rejected": -56.50288009643555, "loss": 0.6852, "rewards/accuracies": 0.625, "rewards/chosen": -0.047670621424913406, "rewards/margins": 0.01711922325193882, "rewards/rejected": -0.06478984653949738, "step": 1940 }, { "epoch": 0.33597518952446587, "grad_norm": 2.494558095932007, "learning_rate": 9.85992125989437e-08, "logits/chosen": -2.9932541847229004, "logits/rejected": -2.991720676422119, "logps/chosen": -57.3494987487793, "logps/rejected": -61.66455078125, "loss": 0.6837, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.048418622463941574, "rewards/margins": 0.020452255383133888, "rewards/rejected": -0.06887087225914001, "step": 1950 }, { "epoch": 0.33769813921433495, "grad_norm": 2.642951250076294, "learning_rate": 9.856365172339574e-08, "logits/chosen": -2.999870777130127, "logits/rejected": -2.980663776397705, "logps/chosen": -58.778953552246094, "logps/rejected": -61.15327835083008, "loss": 0.6827, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.04419439285993576, "rewards/margins": 0.022369753569364548, "rewards/rejected": -0.0665641501545906, "step": 1960 }, { "epoch": 0.33942108890420397, "grad_norm": 2.4793989658355713, "learning_rate": 9.852765168300596e-08, "logits/chosen": -3.043196201324463, "logits/rejected": -3.0140252113342285, "logps/chosen": -60.28436279296875, "logps/rejected": -58.49019241333008, "loss": 0.6814, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.04137754067778587, "rewards/margins": 0.02506864070892334, "rewards/rejected": -0.06644618511199951, "step": 1970 }, { "epoch": 0.34114403859407305, "grad_norm": 2.7259953022003174, "learning_rate": 9.849121280332546e-08, "logits/chosen": -2.9497714042663574, "logits/rejected": -2.944230079650879, "logps/chosen": -60.91106033325195, "logps/rejected": -60.264892578125, "loss": 0.6894, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.053232572972774506, "rewards/margins": 0.009022532030940056, "rewards/rejected": -0.06225510314106941, "step": 1980 }, { "epoch": 0.34286698828394213, "grad_norm": 2.7751598358154297, "learning_rate": 9.845433541387384e-08, "logits/chosen": -3.076275587081909, "logits/rejected": -3.041668653488159, "logps/chosen": -55.9117431640625, "logps/rejected": -56.07196044921875, "loss": 0.6799, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.05077340081334114, "rewards/margins": 0.02825034037232399, "rewards/rejected": -0.07902374118566513, "step": 1990 }, { "epoch": 0.34458993797381116, "grad_norm": 2.9599082469940186, "learning_rate": 9.841701984813618e-08, "logits/chosen": -2.9976062774658203, "logits/rejected": -3.0039710998535156, "logps/chosen": -55.9847412109375, "logps/rejected": -65.41357421875, "loss": 0.685, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.06023833900690079, "rewards/margins": 0.01777346059679985, "rewards/rejected": -0.07801179587841034, "step": 2000 }, { "epoch": 0.34458993797381116, "eval_logits/chosen": -3.0961995124816895, "eval_logits/rejected": -3.090507745742798, "eval_logps/chosen": -60.984703063964844, "eval_logps/rejected": -66.37020874023438, "eval_loss": 0.6889471411705017, "eval_rewards/accuracies": 0.589684009552002, "eval_rewards/chosen": -0.022728124633431435, "eval_rewards/margins": 0.009172691032290459, "eval_rewards/rejected": -0.03190081566572189, "eval_runtime": 358.5713, "eval_samples_per_second": 12.003, "eval_steps_per_second": 1.5, "step": 2000 }, { "epoch": 0.34631288766368024, "grad_norm": 2.715341567993164, "learning_rate": 9.837926644356002e-08, "logits/chosen": -3.0026180744171143, "logits/rejected": -2.9743618965148926, "logps/chosen": -60.34886932373047, "logps/rejected": -58.93947219848633, "loss": 0.682, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.04747002199292183, "rewards/margins": 0.02390061318874359, "rewards/rejected": -0.07137063890695572, "step": 2010 }, { "epoch": 0.34803583735354926, "grad_norm": 2.4323620796203613, "learning_rate": 9.834107554155232e-08, "logits/chosen": -2.990692615509033, "logits/rejected": -2.968893051147461, "logps/chosen": -55.50908279418945, "logps/rejected": -63.061180114746094, "loss": 0.6805, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.05679772049188614, "rewards/margins": 0.02702813409268856, "rewards/rejected": -0.08382586389780045, "step": 2020 }, { "epoch": 0.34975878704341834, "grad_norm": 2.9917709827423096, "learning_rate": 9.83024474874763e-08, "logits/chosen": -3.07673716545105, "logits/rejected": -3.0554261207580566, "logps/chosen": -62.36286163330078, "logps/rejected": -62.7202033996582, "loss": 0.6828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.053105879575014114, "rewards/margins": 0.022281022742390633, "rewards/rejected": -0.0753868967294693, "step": 2030 }, { "epoch": 0.35148173673328736, "grad_norm": 2.6420340538024902, "learning_rate": 9.826338263064845e-08, "logits/chosen": -2.9410736560821533, "logits/rejected": -2.928380012512207, "logps/chosen": -58.5155143737793, "logps/rejected": -61.75719451904297, "loss": 0.684, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.057995665818452835, "rewards/margins": 0.020041372627019882, "rewards/rejected": -0.07803703844547272, "step": 2040 }, { "epoch": 0.35320468642315644, "grad_norm": 2.810659170150757, "learning_rate": 9.82238813243353e-08, "logits/chosen": -3.056933641433716, "logits/rejected": -3.0320918560028076, "logps/chosen": -56.75238800048828, "logps/rejected": -61.59584426879883, "loss": 0.6856, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.05530368164181709, "rewards/margins": 0.0165498498827219, "rewards/rejected": -0.07185353338718414, "step": 2050 }, { "epoch": 0.3549276361130255, "grad_norm": 2.5844037532806396, "learning_rate": 9.818394392575017e-08, "logits/chosen": -3.044243574142456, "logits/rejected": -3.0059947967529297, "logps/chosen": -60.13054275512695, "logps/rejected": -59.328285217285156, "loss": 0.6793, "rewards/accuracies": 0.65625, "rewards/chosen": -0.04801074415445328, "rewards/margins": 0.02944994531571865, "rewards/rejected": -0.07746069133281708, "step": 2060 }, { "epoch": 0.35665058580289455, "grad_norm": 2.8128087520599365, "learning_rate": 9.814357079605006e-08, "logits/chosen": -3.0609805583953857, "logits/rejected": -3.0380260944366455, "logps/chosen": -60.5723876953125, "logps/rejected": -62.3077392578125, "loss": 0.6829, "rewards/accuracies": 0.59375, "rewards/chosen": -0.05279190093278885, "rewards/margins": 0.022460918873548508, "rewards/rejected": -0.07525281608104706, "step": 2070 }, { "epoch": 0.35837353549276363, "grad_norm": 2.812436580657959, "learning_rate": 9.810276230033227e-08, "logits/chosen": -3.0014538764953613, "logits/rejected": -2.9721763134002686, "logps/chosen": -58.136962890625, "logps/rejected": -59.7253303527832, "loss": 0.6799, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.06269152462482452, "rewards/margins": 0.028253117576241493, "rewards/rejected": -0.09094464033842087, "step": 2080 }, { "epoch": 0.36009648518263265, "grad_norm": 2.542465925216675, "learning_rate": 9.806151880763118e-08, "logits/chosen": -3.0250954627990723, "logits/rejected": -3.021667957305908, "logps/chosen": -60.14142990112305, "logps/rejected": -61.97650909423828, "loss": 0.6861, "rewards/accuracies": 0.59375, "rewards/chosen": -0.05746666342020035, "rewards/margins": 0.015712924301624298, "rewards/rejected": -0.07317958772182465, "step": 2090 }, { "epoch": 0.36181943487250173, "grad_norm": 2.988396167755127, "learning_rate": 9.801984069091486e-08, "logits/chosen": -2.9737119674682617, "logits/rejected": -2.9518253803253174, "logps/chosen": -61.59429931640625, "logps/rejected": -59.7376708984375, "loss": 0.6828, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06247835233807564, "rewards/margins": 0.02240810915827751, "rewards/rejected": -0.08488644659519196, "step": 2100 }, { "epoch": 0.36181943487250173, "eval_logits/chosen": -3.089683771133423, "eval_logits/rejected": -3.0839576721191406, "eval_logps/chosen": -61.820899963378906, "eval_logps/rejected": -67.35945892333984, "eval_loss": 0.6882798671722412, "eval_rewards/accuracies": 0.580622673034668, "eval_rewards/chosen": -0.03109004907310009, "eval_rewards/margins": 0.010703377425670624, "eval_rewards/rejected": -0.041793424636125565, "eval_runtime": 358.4509, "eval_samples_per_second": 12.007, "eval_steps_per_second": 1.501, "step": 2100 }, { "epoch": 0.36354238456237076, "grad_norm": 2.661639451980591, "learning_rate": 9.797772832708176e-08, "logits/chosen": -3.0293941497802734, "logits/rejected": -3.032090663909912, "logps/chosen": -56.2503547668457, "logps/rejected": -63.839683532714844, "loss": 0.6877, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0735074132680893, "rewards/margins": 0.012995610944926739, "rewards/rejected": -0.08650301396846771, "step": 2110 }, { "epoch": 0.36526533425223984, "grad_norm": 2.7685952186584473, "learning_rate": 9.793518209695718e-08, "logits/chosen": -2.8939576148986816, "logits/rejected": -2.8669164180755615, "logps/chosen": -59.037567138671875, "logps/rejected": -58.662879943847656, "loss": 0.6827, "rewards/accuracies": 0.625, "rewards/chosen": -0.05461913347244263, "rewards/margins": 0.022714272141456604, "rewards/rejected": -0.07733340561389923, "step": 2120 }, { "epoch": 0.3669882839421089, "grad_norm": 2.3837106227874756, "learning_rate": 9.789220238528999e-08, "logits/chosen": -2.930190324783325, "logits/rejected": -2.9117355346679688, "logps/chosen": -60.96294403076172, "logps/rejected": -61.99995803833008, "loss": 0.6793, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06492619961500168, "rewards/margins": 0.029704291373491287, "rewards/rejected": -0.09463049471378326, "step": 2130 }, { "epoch": 0.36871123363197794, "grad_norm": 2.6930501461029053, "learning_rate": 9.784878958074901e-08, "logits/chosen": -2.90777325630188, "logits/rejected": -2.8999273777008057, "logps/chosen": -57.04816818237305, "logps/rejected": -63.14056396484375, "loss": 0.688, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08392993360757828, "rewards/margins": 0.011952447704970837, "rewards/rejected": -0.09588237851858139, "step": 2140 }, { "epoch": 0.370434183321847, "grad_norm": 2.75247859954834, "learning_rate": 9.780494407591959e-08, "logits/chosen": -2.957963228225708, "logits/rejected": -2.911945343017578, "logps/chosen": -60.3175048828125, "logps/rejected": -57.6784553527832, "loss": 0.6741, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.05628920719027519, "rewards/margins": 0.040407292544841766, "rewards/rejected": -0.09669648855924606, "step": 2150 }, { "epoch": 0.37215713301171605, "grad_norm": 2.7918150424957275, "learning_rate": 9.776066626730002e-08, "logits/chosen": -2.9251790046691895, "logits/rejected": -2.9098739624023438, "logps/chosen": -61.5231819152832, "logps/rejected": -58.77111053466797, "loss": 0.6817, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.06766150891780853, "rewards/margins": 0.024511078372597694, "rewards/rejected": -0.09217258542776108, "step": 2160 }, { "epoch": 0.3738800827015851, "grad_norm": 2.841006278991699, "learning_rate": 9.77159565552979e-08, "logits/chosen": -3.045158863067627, "logits/rejected": -3.0296671390533447, "logps/chosen": -60.80147171020508, "logps/rejected": -60.424476623535156, "loss": 0.6815, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06470237672328949, "rewards/margins": 0.02472129836678505, "rewards/rejected": -0.08942367881536484, "step": 2170 }, { "epoch": 0.37560303239145415, "grad_norm": 2.9336609840393066, "learning_rate": 9.76708153442266e-08, "logits/chosen": -3.0933234691619873, "logits/rejected": -3.05588436126709, "logps/chosen": -64.88115692138672, "logps/rejected": -63.36223220825195, "loss": 0.6755, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06857044249773026, "rewards/margins": 0.03770887851715088, "rewards/rejected": -0.10627932846546173, "step": 2180 }, { "epoch": 0.37732598208132323, "grad_norm": 2.9289681911468506, "learning_rate": 9.76252430423016e-08, "logits/chosen": -3.1562750339508057, "logits/rejected": -3.1501102447509766, "logps/chosen": -60.25629806518555, "logps/rejected": -62.5523567199707, "loss": 0.6877, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0741446241736412, "rewards/margins": 0.012826305814087391, "rewards/rejected": -0.08697094023227692, "step": 2190 }, { "epoch": 0.37904893177119225, "grad_norm": 2.7656748294830322, "learning_rate": 9.75792400616367e-08, "logits/chosen": -2.985726833343506, "logits/rejected": -2.9629483222961426, "logps/chosen": -58.70253372192383, "logps/rejected": -65.15387725830078, "loss": 0.6745, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.06393808126449585, "rewards/margins": 0.03966350480914116, "rewards/rejected": -0.1036015972495079, "step": 2200 }, { "epoch": 0.37904893177119225, "eval_logits/chosen": -3.0810647010803223, "eval_logits/rejected": -3.0753400325775146, "eval_logps/chosen": -62.52730178833008, "eval_logps/rejected": -68.22266387939453, "eval_loss": 0.6875881552696228, "eval_rewards/accuracies": 0.5882899761199951, "eval_rewards/chosen": -0.03815402835607529, "eval_rewards/margins": 0.01227136142551899, "eval_rewards/rejected": -0.05042538791894913, "eval_runtime": 358.5312, "eval_samples_per_second": 12.005, "eval_steps_per_second": 1.501, "step": 2200 }, { "epoch": 0.38077188146106133, "grad_norm": 2.809079885482788, "learning_rate": 9.75328068182404e-08, "logits/chosen": -3.0429909229278564, "logits/rejected": -3.0269718170166016, "logps/chosen": -60.863067626953125, "logps/rejected": -62.8907585144043, "loss": 0.679, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.06136716529726982, "rewards/margins": 0.03044435940682888, "rewards/rejected": -0.09181152284145355, "step": 2210 }, { "epoch": 0.3824948311509304, "grad_norm": 2.8651580810546875, "learning_rate": 9.748594373201213e-08, "logits/chosen": -2.8712568283081055, "logits/rejected": -2.8639302253723145, "logps/chosen": -60.365760803222656, "logps/rejected": -62.60291290283203, "loss": 0.6829, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.06487758457660675, "rewards/margins": 0.022680306807160378, "rewards/rejected": -0.08755789697170258, "step": 2220 }, { "epoch": 0.38421778084079944, "grad_norm": 2.711296319961548, "learning_rate": 9.743865122673835e-08, "logits/chosen": -3.0264925956726074, "logits/rejected": -2.9949655532836914, "logps/chosen": -61.64265823364258, "logps/rejected": -60.85839080810547, "loss": 0.6834, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07965074479579926, "rewards/margins": 0.021513305604457855, "rewards/rejected": -0.10116405785083771, "step": 2230 }, { "epoch": 0.3859407305306685, "grad_norm": 2.980560779571533, "learning_rate": 9.739092973008886e-08, "logits/chosen": -2.979952096939087, "logits/rejected": -2.944349527359009, "logps/chosen": -63.78528594970703, "logps/rejected": -62.89183807373047, "loss": 0.6826, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0697564110159874, "rewards/margins": 0.02322663739323616, "rewards/rejected": -0.09298305213451385, "step": 2240 }, { "epoch": 0.38766368022053754, "grad_norm": 3.1012465953826904, "learning_rate": 9.734277967361279e-08, "logits/chosen": -2.9386608600616455, "logits/rejected": -2.904419422149658, "logps/chosen": -62.32648849487305, "logps/rejected": -62.88580322265625, "loss": 0.6814, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.06976626813411713, "rewards/margins": 0.025638435035943985, "rewards/rejected": -0.09540469944477081, "step": 2250 }, { "epoch": 0.3893866299104066, "grad_norm": 2.673414945602417, "learning_rate": 9.729420149273484e-08, "logits/chosen": -2.896034002304077, "logits/rejected": -2.870567798614502, "logps/chosen": -68.32345581054688, "logps/rejected": -67.14576721191406, "loss": 0.6832, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07515819370746613, "rewards/margins": 0.022647675126791, "rewards/rejected": -0.09780587255954742, "step": 2260 }, { "epoch": 0.39110957960027565, "grad_norm": 2.9458136558532715, "learning_rate": 9.724519562675122e-08, "logits/chosen": -2.880819797515869, "logits/rejected": -2.8479409217834473, "logps/chosen": -62.41205978393555, "logps/rejected": -62.49220657348633, "loss": 0.6823, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08350671082735062, "rewards/margins": 0.024250809103250504, "rewards/rejected": -0.10775750875473022, "step": 2270 }, { "epoch": 0.3928325292901447, "grad_norm": 2.828383445739746, "learning_rate": 9.719576251882575e-08, "logits/chosen": -2.9576196670532227, "logits/rejected": -2.947373867034912, "logps/chosen": -60.50732421875, "logps/rejected": -64.59284210205078, "loss": 0.684, "rewards/accuracies": 0.625, "rewards/chosen": -0.07063782960176468, "rewards/margins": 0.019989144057035446, "rewards/rejected": -0.09062696993350983, "step": 2280 }, { "epoch": 0.3945554789800138, "grad_norm": 2.8307809829711914, "learning_rate": 9.714590261598585e-08, "logits/chosen": -2.852996349334717, "logits/rejected": -2.824765682220459, "logps/chosen": -60.27251434326172, "logps/rejected": -64.0623779296875, "loss": 0.6795, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.07389631122350693, "rewards/margins": 0.029124462977051735, "rewards/rejected": -0.10302077233791351, "step": 2290 }, { "epoch": 0.39627842866988283, "grad_norm": 2.787592887878418, "learning_rate": 9.709561636911845e-08, "logits/chosen": -3.035175323486328, "logits/rejected": -3.001577854156494, "logps/chosen": -62.35862350463867, "logps/rejected": -63.316009521484375, "loss": 0.6781, "rewards/accuracies": 0.65625, "rewards/chosen": -0.06413549929857254, "rewards/margins": 0.03246583789587021, "rewards/rejected": -0.09660132229328156, "step": 2300 }, { "epoch": 0.39627842866988283, "eval_logits/chosen": -3.074549913406372, "eval_logits/rejected": -3.0688531398773193, "eval_logps/chosen": -62.76375198364258, "eval_logps/rejected": -68.54676818847656, "eval_loss": 0.6871928572654724, "eval_rewards/accuracies": 0.5908457040786743, "eval_rewards/chosen": -0.040518537163734436, "eval_rewards/margins": 0.01314793061465025, "eval_rewards/rejected": -0.05366646498441696, "eval_runtime": 358.9768, "eval_samples_per_second": 11.99, "eval_steps_per_second": 1.499, "step": 2300 }, { "epoch": 0.3980013783597519, "grad_norm": 2.8675854206085205, "learning_rate": 9.704490423296595e-08, "logits/chosen": -2.937295436859131, "logits/rejected": -2.921793222427368, "logps/chosen": -58.962135314941406, "logps/rejected": -62.21418380737305, "loss": 0.6798, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07884009182453156, "rewards/margins": 0.028957273811101913, "rewards/rejected": -0.10779736191034317, "step": 2310 }, { "epoch": 0.39972432804962094, "grad_norm": 2.8028993606567383, "learning_rate": 9.699376666612209e-08, "logits/chosen": -3.032089948654175, "logits/rejected": -2.9912171363830566, "logps/chosen": -64.53179931640625, "logps/rejected": -62.15001678466797, "loss": 0.6747, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.07090407609939575, "rewards/margins": 0.039122868329286575, "rewards/rejected": -0.11002693325281143, "step": 2320 }, { "epoch": 0.40144727773949, "grad_norm": 2.875821590423584, "learning_rate": 9.694220413102785e-08, "logits/chosen": -2.9437172412872314, "logits/rejected": -2.9218506813049316, "logps/chosen": -63.208892822265625, "logps/rejected": -63.77135467529297, "loss": 0.6739, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07399503886699677, "rewards/margins": 0.041165683418512344, "rewards/rejected": -0.1151607409119606, "step": 2330 }, { "epoch": 0.40317022742935904, "grad_norm": 2.7427754402160645, "learning_rate": 9.689021709396718e-08, "logits/chosen": -2.9846208095550537, "logits/rejected": -2.972612142562866, "logps/chosen": -60.94713592529297, "logps/rejected": -65.76235961914062, "loss": 0.6857, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0864558219909668, "rewards/margins": 0.01739116758108139, "rewards/rejected": -0.10384698957204819, "step": 2340 }, { "epoch": 0.4048931771192281, "grad_norm": 2.897033214569092, "learning_rate": 9.683780602506288e-08, "logits/chosen": -2.9678235054016113, "logits/rejected": -2.9454286098480225, "logps/chosen": -65.06855773925781, "logps/rejected": -65.86259460449219, "loss": 0.6756, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.06807255744934082, "rewards/margins": 0.03739145025610924, "rewards/rejected": -0.10546400398015976, "step": 2350 }, { "epoch": 0.4066161268090972, "grad_norm": 2.6393425464630127, "learning_rate": 9.678497139827229e-08, "logits/chosen": -2.9860851764678955, "logits/rejected": -2.973085641860962, "logps/chosen": -58.43578338623047, "logps/rejected": -61.41686248779297, "loss": 0.683, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08073704689741135, "rewards/margins": 0.022134827449917793, "rewards/rejected": -0.10287187993526459, "step": 2360 }, { "epoch": 0.4083390764989662, "grad_norm": 3.105860948562622, "learning_rate": 9.673171369138295e-08, "logits/chosen": -2.9996142387390137, "logits/rejected": -2.993256092071533, "logps/chosen": -62.017311096191406, "logps/rejected": -65.43057250976562, "loss": 0.6775, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.08255334943532944, "rewards/margins": 0.03310525417327881, "rewards/rejected": -0.11565861850976944, "step": 2370 }, { "epoch": 0.4100620261888353, "grad_norm": 2.6338090896606445, "learning_rate": 9.667803338600848e-08, "logits/chosen": -2.9264793395996094, "logits/rejected": -2.9018797874450684, "logps/chosen": -61.777565002441406, "logps/rejected": -63.37926483154297, "loss": 0.6788, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07435405254364014, "rewards/margins": 0.030792638659477234, "rewards/rejected": -0.10514669120311737, "step": 2380 }, { "epoch": 0.41178497587870433, "grad_norm": 2.8210086822509766, "learning_rate": 9.662393096758396e-08, "logits/chosen": -3.010481357574463, "logits/rejected": -2.983585834503174, "logps/chosen": -64.10334014892578, "logps/rejected": -64.34742736816406, "loss": 0.6774, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0786934643983841, "rewards/margins": 0.03383960574865341, "rewards/rejected": -0.11253305524587631, "step": 2390 }, { "epoch": 0.4135079255685734, "grad_norm": 2.57030987739563, "learning_rate": 9.656940692536178e-08, "logits/chosen": -3.057115077972412, "logits/rejected": -3.0364222526550293, "logps/chosen": -60.174354553222656, "logps/rejected": -63.84586715698242, "loss": 0.6809, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0787380188703537, "rewards/margins": 0.02708571031689644, "rewards/rejected": -0.10582373291254044, "step": 2400 }, { "epoch": 0.4135079255685734, "eval_logits/chosen": -3.064911365509033, "eval_logits/rejected": -3.059225082397461, "eval_logps/chosen": -63.42079162597656, "eval_logps/rejected": -69.33052062988281, "eval_loss": 0.6866453886032104, "eval_rewards/accuracies": 0.5906133651733398, "eval_rewards/chosen": -0.04708903282880783, "eval_rewards/margins": 0.014414963312447071, "eval_rewards/rejected": -0.06150398775935173, "eval_runtime": 358.4887, "eval_samples_per_second": 12.006, "eval_steps_per_second": 1.501, "step": 2400 }, { "epoch": 0.41523087525844243, "grad_norm": 2.556201934814453, "learning_rate": 9.651446175240698e-08, "logits/chosen": -3.02958345413208, "logits/rejected": -2.990233898162842, "logps/chosen": -61.12690353393555, "logps/rejected": -60.686256408691406, "loss": 0.6737, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.07276560366153717, "rewards/margins": 0.0414142906665802, "rewards/rejected": -0.11417989432811737, "step": 2410 }, { "epoch": 0.4169538249483115, "grad_norm": 2.8453683853149414, "learning_rate": 9.645909594559304e-08, "logits/chosen": -3.0249388217926025, "logits/rejected": -3.002185344696045, "logps/chosen": -64.44881439208984, "logps/rejected": -66.13243865966797, "loss": 0.6796, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0826670378446579, "rewards/margins": 0.02963237091898918, "rewards/rejected": -0.11229941993951797, "step": 2420 }, { "epoch": 0.41867677463818054, "grad_norm": 2.8599154949188232, "learning_rate": 9.64033100055972e-08, "logits/chosen": -2.986757755279541, "logits/rejected": -2.96449613571167, "logps/chosen": -62.99231719970703, "logps/rejected": -62.6896858215332, "loss": 0.6805, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0814933180809021, "rewards/margins": 0.028070122003555298, "rewards/rejected": -0.1095634326338768, "step": 2430 }, { "epoch": 0.4203997243280496, "grad_norm": 3.06473445892334, "learning_rate": 9.634710443689602e-08, "logits/chosen": -2.9826653003692627, "logits/rejected": -2.9693188667297363, "logps/chosen": -62.84228515625, "logps/rejected": -66.02586364746094, "loss": 0.6759, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07785408198833466, "rewards/margins": 0.037403546273708344, "rewards/rejected": -0.1152576357126236, "step": 2440 }, { "epoch": 0.4221226740179187, "grad_norm": 2.7919325828552246, "learning_rate": 9.629047974776077e-08, "logits/chosen": -2.9399731159210205, "logits/rejected": -2.9167487621307373, "logps/chosen": -62.27665328979492, "logps/rejected": -64.26480865478516, "loss": 0.6802, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.09218315780162811, "rewards/margins": 0.027885476127266884, "rewards/rejected": -0.12006862461566925, "step": 2450 }, { "epoch": 0.4238456237077877, "grad_norm": 2.8518171310424805, "learning_rate": 9.623343645025288e-08, "logits/chosen": -2.921736717224121, "logits/rejected": -2.8892157077789307, "logps/chosen": -64.50049591064453, "logps/rejected": -64.17418670654297, "loss": 0.6779, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0835905522108078, "rewards/margins": 0.032821472734212875, "rewards/rejected": -0.11641202867031097, "step": 2460 }, { "epoch": 0.4255685733976568, "grad_norm": 3.0394136905670166, "learning_rate": 9.61759750602193e-08, "logits/chosen": -3.079592227935791, "logits/rejected": -3.058415412902832, "logps/chosen": -62.834922790527344, "logps/rejected": -64.33911895751953, "loss": 0.6753, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.07790704071521759, "rewards/margins": 0.0384015329182148, "rewards/rejected": -0.11630856990814209, "step": 2470 }, { "epoch": 0.4272915230875258, "grad_norm": 2.9794809818267822, "learning_rate": 9.611809609728777e-08, "logits/chosen": -2.9777140617370605, "logits/rejected": -2.946880340576172, "logps/chosen": -64.09749603271484, "logps/rejected": -62.37017822265625, "loss": 0.683, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09719328582286835, "rewards/margins": 0.022251714020967484, "rewards/rejected": -0.11944498866796494, "step": 2480 }, { "epoch": 0.4290144727773949, "grad_norm": 2.740145683288574, "learning_rate": 9.605980008486224e-08, "logits/chosen": -2.961674213409424, "logits/rejected": -2.9528236389160156, "logps/chosen": -59.51900100708008, "logps/rejected": -63.45074462890625, "loss": 0.6826, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.09480162709951401, "rewards/margins": 0.0237564779818058, "rewards/rejected": -0.11855810880661011, "step": 2490 }, { "epoch": 0.43073742246726393, "grad_norm": 3.2707362174987793, "learning_rate": 9.600108755011803e-08, "logits/chosen": -3.0370676517486572, "logits/rejected": -3.013251304626465, "logps/chosen": -66.4466552734375, "logps/rejected": -68.46018981933594, "loss": 0.6828, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0955594927072525, "rewards/margins": 0.023346083238720894, "rewards/rejected": -0.11890558153390884, "step": 2500 }, { "epoch": 0.43073742246726393, "eval_logits/chosen": -3.0558035373687744, "eval_logits/rejected": -3.05007004737854, "eval_logps/chosen": -64.28133392333984, "eval_logps/rejected": -70.30865478515625, "eval_loss": 0.6861559748649597, "eval_rewards/accuracies": 0.5913103818893433, "eval_rewards/chosen": -0.05569446086883545, "eval_rewards/margins": 0.015590852126479149, "eval_rewards/rejected": -0.07128531485795975, "eval_runtime": 358.8021, "eval_samples_per_second": 11.995, "eval_steps_per_second": 1.499, "step": 2500 }, { "epoch": 0.432460372157133, "grad_norm": 2.9338889122009277, "learning_rate": 9.594195902399708e-08, "logits/chosen": -2.8873167037963867, "logits/rejected": -2.877664566040039, "logps/chosen": -61.8262939453125, "logps/rejected": -65.24763488769531, "loss": 0.6752, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.09440617263317108, "rewards/margins": 0.03825463727116585, "rewards/rejected": -0.13266082108020782, "step": 2510 }, { "epoch": 0.4341833218470021, "grad_norm": 2.6996138095855713, "learning_rate": 9.588241504120325e-08, "logits/chosen": -2.875840663909912, "logits/rejected": -2.8484764099121094, "logps/chosen": -61.40210723876953, "logps/rejected": -61.77978515625, "loss": 0.6753, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08933891355991364, "rewards/margins": 0.0384402871131897, "rewards/rejected": -0.12777918577194214, "step": 2520 }, { "epoch": 0.4359062715368711, "grad_norm": 2.8578429222106934, "learning_rate": 9.582245614019734e-08, "logits/chosen": -2.962174892425537, "logits/rejected": -2.9574642181396484, "logps/chosen": -61.437477111816406, "logps/rejected": -66.27393341064453, "loss": 0.6853, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.10267911851406097, "rewards/margins": 0.018535245209932327, "rewards/rejected": -0.1212143674492836, "step": 2530 }, { "epoch": 0.4376292212267402, "grad_norm": 2.7969298362731934, "learning_rate": 9.576208286319231e-08, "logits/chosen": -3.001952886581421, "logits/rejected": -2.991283416748047, "logps/chosen": -64.27826690673828, "logps/rejected": -67.79606628417969, "loss": 0.6785, "rewards/accuracies": 0.625, "rewards/chosen": -0.10737824440002441, "rewards/margins": 0.032104894518852234, "rewards/rejected": -0.13948313891887665, "step": 2540 }, { "epoch": 0.4393521709166092, "grad_norm": 3.0053772926330566, "learning_rate": 9.570129575614835e-08, "logits/chosen": -2.9459023475646973, "logits/rejected": -2.913362503051758, "logps/chosen": -67.61305236816406, "logps/rejected": -66.90510559082031, "loss": 0.6754, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.08929960429668427, "rewards/margins": 0.04003571346402168, "rewards/rejected": -0.12933531403541565, "step": 2550 }, { "epoch": 0.4410751206064783, "grad_norm": 2.983738899230957, "learning_rate": 9.564009536876798e-08, "logits/chosen": -3.0335261821746826, "logits/rejected": -3.009237766265869, "logps/chosen": -64.40341186523438, "logps/rejected": -66.43016052246094, "loss": 0.6782, "rewards/accuracies": 0.625, "rewards/chosen": -0.09143063426017761, "rewards/margins": 0.03266965225338936, "rewards/rejected": -0.12410029023885727, "step": 2560 }, { "epoch": 0.4427980702963473, "grad_norm": 2.8760268688201904, "learning_rate": 9.557848225449097e-08, "logits/chosen": -2.9677720069885254, "logits/rejected": -2.949582576751709, "logps/chosen": -67.64404296875, "logps/rejected": -69.02073669433594, "loss": 0.6772, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09042315185070038, "rewards/margins": 0.034747906029224396, "rewards/rejected": -0.12517106533050537, "step": 2570 }, { "epoch": 0.4445210199862164, "grad_norm": 3.3024773597717285, "learning_rate": 9.551645697048946e-08, "logits/chosen": -2.9901578426361084, "logits/rejected": -2.971513032913208, "logps/chosen": -63.95698165893555, "logps/rejected": -65.8638916015625, "loss": 0.6838, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.10571177303791046, "rewards/margins": 0.021099606528878212, "rewards/rejected": -0.12681138515472412, "step": 2580 }, { "epoch": 0.4462439696760855, "grad_norm": 3.4273412227630615, "learning_rate": 9.545402007766291e-08, "logits/chosen": -2.968909740447998, "logits/rejected": -2.9322829246520996, "logps/chosen": -65.9303207397461, "logps/rejected": -64.69056701660156, "loss": 0.6738, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09235879778862, "rewards/margins": 0.04207988083362579, "rewards/rejected": -0.1344386637210846, "step": 2590 }, { "epoch": 0.4479669193659545, "grad_norm": 3.036872148513794, "learning_rate": 9.539117214063292e-08, "logits/chosen": -2.9415981769561768, "logits/rejected": -2.8942933082580566, "logps/chosen": -67.31944274902344, "logps/rejected": -62.59626007080078, "loss": 0.6754, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.10223277658224106, "rewards/margins": 0.039090316742658615, "rewards/rejected": -0.14132311940193176, "step": 2600 }, { "epoch": 0.4479669193659545, "eval_logits/chosen": -3.049037218093872, "eval_logits/rejected": -3.0432827472686768, "eval_logps/chosen": -64.85841369628906, "eval_logps/rejected": -71.00834655761719, "eval_loss": 0.685627281665802, "eval_rewards/accuracies": 0.591775119304657, "eval_rewards/chosen": -0.0614653080701828, "eval_rewards/margins": 0.01681698113679886, "eval_rewards/rejected": -0.07828228920698166, "eval_runtime": 358.2326, "eval_samples_per_second": 12.015, "eval_steps_per_second": 1.502, "step": 2600 }, { "epoch": 0.4496898690558236, "grad_norm": 2.9528679847717285, "learning_rate": 9.532791372773822e-08, "logits/chosen": -2.9208762645721436, "logits/rejected": -2.88173246383667, "logps/chosen": -68.32647705078125, "logps/rejected": -65.62649536132812, "loss": 0.6751, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.099474236369133, "rewards/margins": 0.040066082030534744, "rewards/rejected": -0.13954029977321625, "step": 2610 }, { "epoch": 0.4514128187456926, "grad_norm": 2.840683698654175, "learning_rate": 9.526424541102953e-08, "logits/chosen": -3.0072741508483887, "logits/rejected": -2.977484703063965, "logps/chosen": -61.29169845581055, "logps/rejected": -64.94569396972656, "loss": 0.6723, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10025743395090103, "rewards/margins": 0.044723428785800934, "rewards/rejected": -0.14498087763786316, "step": 2620 }, { "epoch": 0.4531357684355617, "grad_norm": 3.2683465480804443, "learning_rate": 9.520016776626432e-08, "logits/chosen": -2.8932104110717773, "logits/rejected": -2.8613693714141846, "logps/chosen": -64.25384521484375, "logps/rejected": -67.47929382324219, "loss": 0.6704, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09440313279628754, "rewards/margins": 0.048835329711437225, "rewards/rejected": -0.14323846995830536, "step": 2630 }, { "epoch": 0.4548587181254307, "grad_norm": 2.818164348602295, "learning_rate": 9.513568137290167e-08, "logits/chosen": -2.91321063041687, "logits/rejected": -2.8920276165008545, "logps/chosen": -60.74837112426758, "logps/rejected": -62.896446228027344, "loss": 0.6784, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10652945935726166, "rewards/margins": 0.03255968540906906, "rewards/rejected": -0.13908913731575012, "step": 2640 }, { "epoch": 0.4565816678152998, "grad_norm": 2.770519495010376, "learning_rate": 9.507078681409701e-08, "logits/chosen": -2.8849105834960938, "logits/rejected": -2.8674635887145996, "logps/chosen": -61.676353454589844, "logps/rejected": -67.19678497314453, "loss": 0.6765, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.09245527535676956, "rewards/margins": 0.03666946291923523, "rewards/rejected": -0.1291247308254242, "step": 2650 }, { "epoch": 0.4583046175051689, "grad_norm": 3.1621921062469482, "learning_rate": 9.500548467669681e-08, "logits/chosen": -2.9127323627471924, "logits/rejected": -2.8717422485351562, "logps/chosen": -68.6246566772461, "logps/rejected": -67.5105972290039, "loss": 0.6716, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.10334835201501846, "rewards/margins": 0.04701242595911026, "rewards/rejected": -0.15036077797412872, "step": 2660 }, { "epoch": 0.4600275671950379, "grad_norm": 2.8156025409698486, "learning_rate": 9.493977555123336e-08, "logits/chosen": -2.8853299617767334, "logits/rejected": -2.8571767807006836, "logps/chosen": -64.55490112304688, "logps/rejected": -66.96713256835938, "loss": 0.6733, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10608921200037003, "rewards/margins": 0.04296635091304779, "rewards/rejected": -0.14905555546283722, "step": 2670 }, { "epoch": 0.461750516884907, "grad_norm": 3.240689277648926, "learning_rate": 9.48736600319193e-08, "logits/chosen": -2.9307405948638916, "logits/rejected": -2.903907299041748, "logps/chosen": -69.11388397216797, "logps/rejected": -67.31000518798828, "loss": 0.6805, "rewards/accuracies": 0.59375, "rewards/chosen": -0.10866351425647736, "rewards/margins": 0.027808865532279015, "rewards/rejected": -0.13647237420082092, "step": 2680 }, { "epoch": 0.463473466574776, "grad_norm": 3.0627663135528564, "learning_rate": 9.480713871664241e-08, "logits/chosen": -2.9041428565979004, "logits/rejected": -2.8821544647216797, "logps/chosen": -62.15593338012695, "logps/rejected": -65.29117584228516, "loss": 0.6753, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11514836549758911, "rewards/margins": 0.038897983729839325, "rewards/rejected": -0.15404634177684784, "step": 2690 }, { "epoch": 0.4651964162646451, "grad_norm": 2.908506393432617, "learning_rate": 9.474021220696002e-08, "logits/chosen": -2.9324769973754883, "logits/rejected": -2.9060561656951904, "logps/chosen": -63.67100143432617, "logps/rejected": -67.40882873535156, "loss": 0.6768, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1044023260474205, "rewards/margins": 0.03492189943790436, "rewards/rejected": -0.13932421803474426, "step": 2700 }, { "epoch": 0.4651964162646451, "eval_logits/chosen": -3.0427258014678955, "eval_logits/rejected": -3.037022352218628, "eval_logps/chosen": -65.44750213623047, "eval_logps/rejected": -71.71359252929688, "eval_loss": 0.6851440072059631, "eval_rewards/accuracies": 0.595724880695343, "eval_rewards/chosen": -0.06735601276159286, "eval_rewards/margins": 0.017978651449084282, "eval_rewards/rejected": -0.0853346586227417, "eval_runtime": 359.3357, "eval_samples_per_second": 11.978, "eval_steps_per_second": 1.497, "step": 2700 }, { "epoch": 0.4669193659545141, "grad_norm": 3.442694664001465, "learning_rate": 9.467288110809373e-08, "logits/chosen": -2.9548001289367676, "logits/rejected": -2.9361233711242676, "logps/chosen": -69.58091735839844, "logps/rejected": -67.78138732910156, "loss": 0.6832, "rewards/accuracies": 0.625, "rewards/chosen": -0.10134278237819672, "rewards/margins": 0.022963717579841614, "rewards/rejected": -0.12430648505687714, "step": 2710 }, { "epoch": 0.4686423156443832, "grad_norm": 3.124640464782715, "learning_rate": 9.460514602892386e-08, "logits/chosen": -2.9450113773345947, "logits/rejected": -2.948533773422241, "logps/chosen": -63.363075256347656, "logps/rejected": -67.94403839111328, "loss": 0.6807, "rewards/accuracies": 0.625, "rewards/chosen": -0.11315824836492538, "rewards/margins": 0.02865438722074032, "rewards/rejected": -0.14181265234947205, "step": 2720 }, { "epoch": 0.4703652653342522, "grad_norm": 3.505093812942505, "learning_rate": 9.453700758198396e-08, "logits/chosen": -2.944913387298584, "logits/rejected": -2.925502300262451, "logps/chosen": -69.31742858886719, "logps/rejected": -69.01850128173828, "loss": 0.6794, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10750722885131836, "rewards/margins": 0.030425354838371277, "rewards/rejected": -0.13793259859085083, "step": 2730 }, { "epoch": 0.4720882150241213, "grad_norm": 3.1059038639068604, "learning_rate": 9.446846638345521e-08, "logits/chosen": -2.9309005737304688, "logits/rejected": -2.90991473197937, "logps/chosen": -64.31190490722656, "logps/rejected": -63.59071731567383, "loss": 0.6811, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1145264282822609, "rewards/margins": 0.027553927153348923, "rewards/rejected": -0.14208035171031952, "step": 2740 }, { "epoch": 0.4738111647139904, "grad_norm": 3.1940810680389404, "learning_rate": 9.439952305316097e-08, "logits/chosen": -2.8625340461730957, "logits/rejected": -2.8576200008392334, "logps/chosen": -61.2120361328125, "logps/rejected": -68.39119720458984, "loss": 0.6797, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.11948289722204208, "rewards/margins": 0.030159134417772293, "rewards/rejected": -0.14964203536510468, "step": 2750 }, { "epoch": 0.4755341144038594, "grad_norm": 3.294135332107544, "learning_rate": 9.433017821456108e-08, "logits/chosen": -2.9133965969085693, "logits/rejected": -2.8962509632110596, "logps/chosen": -65.8562240600586, "logps/rejected": -70.20357513427734, "loss": 0.6757, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11581240594387054, "rewards/margins": 0.0381658673286438, "rewards/rejected": -0.15397830307483673, "step": 2760 }, { "epoch": 0.4772570640937285, "grad_norm": 3.4492290019989014, "learning_rate": 9.426043249474624e-08, "logits/chosen": -3.0412955284118652, "logits/rejected": -3.011286973953247, "logps/chosen": -66.13185119628906, "logps/rejected": -67.03375244140625, "loss": 0.6794, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10917095839977264, "rewards/margins": 0.030780524015426636, "rewards/rejected": -0.13995149731636047, "step": 2770 }, { "epoch": 0.4789800137835975, "grad_norm": 3.045886754989624, "learning_rate": 9.41902865244324e-08, "logits/chosen": -2.8621273040771484, "logits/rejected": -2.8325142860412598, "logps/chosen": -64.79789733886719, "logps/rejected": -65.37285614013672, "loss": 0.6807, "rewards/accuracies": 0.59375, "rewards/chosen": -0.11969073861837387, "rewards/margins": 0.027886349707841873, "rewards/rejected": -0.14757707715034485, "step": 2780 }, { "epoch": 0.4807029634734666, "grad_norm": 2.9073054790496826, "learning_rate": 9.411974093795497e-08, "logits/chosen": -2.977447748184204, "logits/rejected": -2.9443159103393555, "logps/chosen": -63.461631774902344, "logps/rejected": -65.23011779785156, "loss": 0.6774, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11701373755931854, "rewards/margins": 0.034158896654844284, "rewards/rejected": -0.15117263793945312, "step": 2790 }, { "epoch": 0.4824259131633356, "grad_norm": 3.480588912963867, "learning_rate": 9.404879637326307e-08, "logits/chosen": -2.9597458839416504, "logits/rejected": -2.934946060180664, "logps/chosen": -71.83543395996094, "logps/rejected": -72.26683807373047, "loss": 0.6766, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.11799754202365875, "rewards/margins": 0.03629498928785324, "rewards/rejected": -0.15429255366325378, "step": 2800 }, { "epoch": 0.4824259131633356, "eval_logits/chosen": -3.036466121673584, "eval_logits/rejected": -3.0307648181915283, "eval_logps/chosen": -65.97711181640625, "eval_logps/rejected": -72.36688995361328, "eval_loss": 0.6845990419387817, "eval_rewards/accuracies": 0.5966542959213257, "eval_rewards/chosen": -0.07265209406614304, "eval_rewards/margins": 0.019215548411011696, "eval_rewards/rejected": -0.09186764806509018, "eval_runtime": 360.3298, "eval_samples_per_second": 11.945, "eval_steps_per_second": 1.493, "step": 2800 }, { "epoch": 0.4841488628532047, "grad_norm": 3.248321056365967, "learning_rate": 9.397745347191391e-08, "logits/chosen": -2.8508946895599365, "logits/rejected": -2.826793909072876, "logps/chosen": -66.67686462402344, "logps/rejected": -71.41950988769531, "loss": 0.6734, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11341060698032379, "rewards/margins": 0.04314876347780228, "rewards/rejected": -0.15655937790870667, "step": 2810 }, { "epoch": 0.48587181254307377, "grad_norm": 3.388293504714966, "learning_rate": 9.39057128790668e-08, "logits/chosen": -2.998136520385742, "logits/rejected": -2.9863076210021973, "logps/chosen": -65.70792388916016, "logps/rejected": -67.45301818847656, "loss": 0.6795, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.11380715668201447, "rewards/margins": 0.03052491322159767, "rewards/rejected": -0.14433208107948303, "step": 2820 }, { "epoch": 0.4875947622329428, "grad_norm": 3.0719292163848877, "learning_rate": 9.383357524347748e-08, "logits/chosen": -2.8031680583953857, "logits/rejected": -2.788007974624634, "logps/chosen": -66.85478210449219, "logps/rejected": -70.41021728515625, "loss": 0.6748, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.11354263871908188, "rewards/margins": 0.040914587676525116, "rewards/rejected": -0.154457226395607, "step": 2830 }, { "epoch": 0.48931771192281187, "grad_norm": 3.0653076171875, "learning_rate": 9.376104121749213e-08, "logits/chosen": -2.9727180004119873, "logits/rejected": -2.961493968963623, "logps/chosen": -64.31333923339844, "logps/rejected": -66.9847412109375, "loss": 0.6751, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1167706847190857, "rewards/margins": 0.03927644342184067, "rewards/rejected": -0.15604713559150696, "step": 2840 }, { "epoch": 0.4910406616126809, "grad_norm": 3.3679287433624268, "learning_rate": 9.368811145704154e-08, "logits/chosen": -2.9329311847686768, "logits/rejected": -2.9137070178985596, "logps/chosen": -64.20672607421875, "logps/rejected": -71.63908386230469, "loss": 0.6709, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11960289627313614, "rewards/margins": 0.04830095171928406, "rewards/rejected": -0.1679038554430008, "step": 2850 }, { "epoch": 0.49276361130255, "grad_norm": 2.8686556816101074, "learning_rate": 9.361478662163515e-08, "logits/chosen": -2.9130587577819824, "logits/rejected": -2.8891494274139404, "logps/chosen": -66.36784362792969, "logps/rejected": -67.8857421875, "loss": 0.6724, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11403318494558334, "rewards/margins": 0.04504844546318054, "rewards/rejected": -0.1590816229581833, "step": 2860 }, { "epoch": 0.494486560992419, "grad_norm": 3.327705144882202, "learning_rate": 9.354106737435507e-08, "logits/chosen": -2.953275203704834, "logits/rejected": -2.9302539825439453, "logps/chosen": -67.18695831298828, "logps/rejected": -67.57176208496094, "loss": 0.6779, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.11071185022592545, "rewards/margins": 0.03463593125343323, "rewards/rejected": -0.14534778892993927, "step": 2870 }, { "epoch": 0.4962095106822881, "grad_norm": 3.8654720783233643, "learning_rate": 9.346695438185015e-08, "logits/chosen": -3.0281896591186523, "logits/rejected": -3.002101421356201, "logps/chosen": -67.46855926513672, "logps/rejected": -66.77601623535156, "loss": 0.6764, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12120544910430908, "rewards/margins": 0.03729164972901344, "rewards/rejected": -0.15849709510803223, "step": 2880 }, { "epoch": 0.49793246037215716, "grad_norm": 2.923661470413208, "learning_rate": 9.339244831432988e-08, "logits/chosen": -3.037105083465576, "logits/rejected": -3.0316238403320312, "logps/chosen": -64.63816833496094, "logps/rejected": -68.85662078857422, "loss": 0.6715, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11680474132299423, "rewards/margins": 0.04656790941953659, "rewards/rejected": -0.16337266564369202, "step": 2890 }, { "epoch": 0.4996554100620262, "grad_norm": 3.261712074279785, "learning_rate": 9.331754984555838e-08, "logits/chosen": -2.9544241428375244, "logits/rejected": -2.944662570953369, "logps/chosen": -63.31207275390625, "logps/rejected": -65.96617889404297, "loss": 0.6769, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.10824018716812134, "rewards/margins": 0.036075785756111145, "rewards/rejected": -0.1443159580230713, "step": 2900 }, { "epoch": 0.4996554100620262, "eval_logits/chosen": -3.028932571411133, "eval_logits/rejected": -3.0232160091400146, "eval_logps/chosen": -66.26190948486328, "eval_logps/rejected": -72.7197265625, "eval_loss": 0.6843085289001465, "eval_rewards/accuracies": 0.6003717184066772, "eval_rewards/chosen": -0.07550010085105896, "eval_rewards/margins": 0.0198960117995739, "eval_rewards/rejected": -0.09539611637592316, "eval_runtime": 361.4703, "eval_samples_per_second": 11.907, "eval_steps_per_second": 1.488, "step": 2900 }, { "epoch": 0.5013783597518953, "grad_norm": 3.804288148880005, "learning_rate": 9.324225965284823e-08, "logits/chosen": -2.994413137435913, "logits/rejected": -2.982327938079834, "logps/chosen": -67.48818969726562, "logps/rejected": -72.01011657714844, "loss": 0.6711, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11381006240844727, "rewards/margins": 0.0472860261797905, "rewards/rejected": -0.16109611093997955, "step": 2910 }, { "epoch": 0.5031013094417643, "grad_norm": 3.164123773574829, "learning_rate": 9.316657841705449e-08, "logits/chosen": -2.9393081665039062, "logits/rejected": -2.935638904571533, "logps/chosen": -65.94876098632812, "logps/rejected": -71.37811279296875, "loss": 0.6775, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1237863302230835, "rewards/margins": 0.034934334456920624, "rewards/rejected": -0.15872065722942352, "step": 2920 }, { "epoch": 0.5048242591316333, "grad_norm": 3.0420029163360596, "learning_rate": 9.309050682256836e-08, "logits/chosen": -2.9117321968078613, "logits/rejected": -2.884338140487671, "logps/chosen": -65.09063720703125, "logps/rejected": -68.64939880371094, "loss": 0.6684, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11639396101236343, "rewards/margins": 0.05342023819684982, "rewards/rejected": -0.16981419920921326, "step": 2930 }, { "epoch": 0.5065472088215024, "grad_norm": 3.0785973072052, "learning_rate": 9.301404555731116e-08, "logits/chosen": -2.9413981437683105, "logits/rejected": -2.9049649238586426, "logps/chosen": -63.005767822265625, "logps/rejected": -66.04293823242188, "loss": 0.6764, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1154542937874794, "rewards/margins": 0.03712349012494087, "rewards/rejected": -0.15257780253887177, "step": 2940 }, { "epoch": 0.5082701585113715, "grad_norm": 2.9522688388824463, "learning_rate": 9.293719531272799e-08, "logits/chosen": -2.9734482765197754, "logits/rejected": -2.9552502632141113, "logps/chosen": -65.55525207519531, "logps/rejected": -70.6024398803711, "loss": 0.6712, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.11234132945537567, "rewards/margins": 0.048349231481552124, "rewards/rejected": -0.1606905460357666, "step": 2950 }, { "epoch": 0.5099931082012406, "grad_norm": 3.1853420734405518, "learning_rate": 9.285995678378151e-08, "logits/chosen": -2.9326086044311523, "logits/rejected": -2.905928134918213, "logps/chosen": -65.61602783203125, "logps/rejected": -68.58934020996094, "loss": 0.6703, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.11668062210083008, "rewards/margins": 0.05025554820895195, "rewards/rejected": -0.16693618893623352, "step": 2960 }, { "epoch": 0.5117160578911096, "grad_norm": 3.8613901138305664, "learning_rate": 9.278233066894572e-08, "logits/chosen": -2.8656792640686035, "logits/rejected": -2.838275909423828, "logps/chosen": -67.8607406616211, "logps/rejected": -68.91703796386719, "loss": 0.6755, "rewards/accuracies": 0.625, "rewards/chosen": -0.1262323409318924, "rewards/margins": 0.038799136877059937, "rewards/rejected": -0.16503146290779114, "step": 2970 }, { "epoch": 0.5134390075809786, "grad_norm": 3.3221471309661865, "learning_rate": 9.270431767019951e-08, "logits/chosen": -2.8628203868865967, "logits/rejected": -2.8370893001556396, "logps/chosen": -64.91873931884766, "logps/rejected": -68.04789733886719, "loss": 0.6711, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10979530960321426, "rewards/margins": 0.04787523299455643, "rewards/rejected": -0.1576705276966095, "step": 2980 }, { "epoch": 0.5151619572708477, "grad_norm": 3.131523370742798, "learning_rate": 9.262591849302047e-08, "logits/chosen": -2.9078562259674072, "logits/rejected": -2.8803505897521973, "logps/chosen": -69.05020141601562, "logps/rejected": -71.10308837890625, "loss": 0.6748, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12440363317728043, "rewards/margins": 0.04048031568527222, "rewards/rejected": -0.16488394141197205, "step": 2990 }, { "epoch": 0.5168849069607168, "grad_norm": 2.872863292694092, "learning_rate": 9.254713384637838e-08, "logits/chosen": -2.918975830078125, "logits/rejected": -2.913259744644165, "logps/chosen": -64.02835845947266, "logps/rejected": -69.40301513671875, "loss": 0.6781, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12016010284423828, "rewards/margins": 0.03492623567581177, "rewards/rejected": -0.15508633852005005, "step": 3000 }, { "epoch": 0.5168849069607168, "eval_logits/chosen": -3.020082950592041, "eval_logits/rejected": -3.014353036880493, "eval_logps/chosen": -66.83290100097656, "eval_logps/rejected": -73.39954376220703, "eval_loss": 0.6838503479957581, "eval_rewards/accuracies": 0.6026951670646667, "eval_rewards/chosen": -0.08121006190776825, "eval_rewards/margins": 0.020984075963497162, "eval_rewards/rejected": -0.10219414532184601, "eval_runtime": 360.8285, "eval_samples_per_second": 11.928, "eval_steps_per_second": 1.491, "step": 3000 }, { "epoch": 0.5186078566505858, "grad_norm": 3.5088436603546143, "learning_rate": 9.246796444272887e-08, "logits/chosen": -2.9433512687683105, "logits/rejected": -2.920409679412842, "logps/chosen": -68.26364135742188, "logps/rejected": -68.73377227783203, "loss": 0.6819, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.13075979053974152, "rewards/margins": 0.025632452219724655, "rewards/rejected": -0.15639221668243408, "step": 3010 }, { "epoch": 0.5203308063404548, "grad_norm": 3.4233145713806152, "learning_rate": 9.238841099800693e-08, "logits/chosen": -3.025982141494751, "logits/rejected": -2.9952666759490967, "logps/chosen": -68.6304931640625, "logps/rejected": -68.3849868774414, "loss": 0.6776, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.14391957223415375, "rewards/margins": 0.036774180829524994, "rewards/rejected": -0.18069376051425934, "step": 3020 }, { "epoch": 0.5220537560303239, "grad_norm": 3.3919730186462402, "learning_rate": 9.230847423162053e-08, "logits/chosen": -2.9319372177124023, "logits/rejected": -2.910442352294922, "logps/chosen": -69.41728973388672, "logps/rejected": -71.12834167480469, "loss": 0.6823, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1374237835407257, "rewards/margins": 0.024965396150946617, "rewards/rejected": -0.16238918900489807, "step": 3030 }, { "epoch": 0.523776705720193, "grad_norm": 3.2684762477874756, "learning_rate": 9.222815486644399e-08, "logits/chosen": -3.0326926708221436, "logits/rejected": -3.025906801223755, "logps/chosen": -65.07283020019531, "logps/rejected": -71.50475311279297, "loss": 0.6783, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.13885042071342468, "rewards/margins": 0.0330406129360199, "rewards/rejected": -0.17189103364944458, "step": 3040 }, { "epoch": 0.525499655410062, "grad_norm": 2.9352407455444336, "learning_rate": 9.214745362881149e-08, "logits/chosen": -2.8935718536376953, "logits/rejected": -2.8804633617401123, "logps/chosen": -66.72821044921875, "logps/rejected": -71.2332534790039, "loss": 0.6813, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.13170121610164642, "rewards/margins": 0.02705610729753971, "rewards/rejected": -0.1587573140859604, "step": 3050 }, { "epoch": 0.5272226050999311, "grad_norm": 3.934908628463745, "learning_rate": 9.206637124851055e-08, "logits/chosen": -2.92893648147583, "logits/rejected": -2.9105372428894043, "logps/chosen": -68.1148910522461, "logps/rejected": -70.24058532714844, "loss": 0.678, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1243911162018776, "rewards/margins": 0.03433989733457565, "rewards/rejected": -0.15873101353645325, "step": 3060 }, { "epoch": 0.5289455547898001, "grad_norm": 3.168501615524292, "learning_rate": 9.19849084587754e-08, "logits/chosen": -2.889446973800659, "logits/rejected": -2.863997220993042, "logps/chosen": -71.57978820800781, "logps/rejected": -71.05496978759766, "loss": 0.6775, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1363719403743744, "rewards/margins": 0.03479612618684769, "rewards/rejected": -0.17116807401180267, "step": 3070 }, { "epoch": 0.5306685044796692, "grad_norm": 3.003641128540039, "learning_rate": 9.190306599628027e-08, "logits/chosen": -2.8035268783569336, "logits/rejected": -2.786806583404541, "logps/chosen": -65.42008972167969, "logps/rejected": -72.64752960205078, "loss": 0.6797, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.14561638236045837, "rewards/margins": 0.03284695744514465, "rewards/rejected": -0.17846335470676422, "step": 3080 }, { "epoch": 0.5323914541695383, "grad_norm": 3.1618595123291016, "learning_rate": 9.182084460113288e-08, "logits/chosen": -2.887605667114258, "logits/rejected": -2.868880033493042, "logps/chosen": -66.78776550292969, "logps/rejected": -71.1986312866211, "loss": 0.6735, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13175687193870544, "rewards/margins": 0.043186891824007034, "rewards/rejected": -0.17494376003742218, "step": 3090 }, { "epoch": 0.5341144038594073, "grad_norm": 4.041504383087158, "learning_rate": 9.173824501686767e-08, "logits/chosen": -2.9190025329589844, "logits/rejected": -2.889906167984009, "logps/chosen": -66.44412231445312, "logps/rejected": -69.22412109375, "loss": 0.67, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1252371370792389, "rewards/margins": 0.05129619315266609, "rewards/rejected": -0.17653335630893707, "step": 3100 }, { "epoch": 0.5341144038594073, "eval_logits/chosen": -3.0152857303619385, "eval_logits/rejected": -3.0095441341400146, "eval_logps/chosen": -66.92872619628906, "eval_logps/rejected": -73.57532501220703, "eval_loss": 0.6834792494773865, "eval_rewards/accuracies": 0.6003717184066772, "eval_rewards/chosen": -0.0821683257818222, "eval_rewards/margins": 0.021783730015158653, "eval_rewards/rejected": -0.10395205020904541, "eval_runtime": 359.9412, "eval_samples_per_second": 11.958, "eval_steps_per_second": 1.495, "step": 3100 }, { "epoch": 0.5358373535492763, "grad_norm": 3.552891254425049, "learning_rate": 9.165526799043897e-08, "logits/chosen": -2.8621737957000732, "logits/rejected": -2.8714873790740967, "logps/chosen": -64.84125518798828, "logps/rejected": -72.49705505371094, "loss": 0.6787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.14228765666484833, "rewards/margins": 0.03320407122373581, "rewards/rejected": -0.17549173533916473, "step": 3110 }, { "epoch": 0.5375603032391454, "grad_norm": 3.3261067867279053, "learning_rate": 9.157191427221447e-08, "logits/chosen": -2.908033609390259, "logits/rejected": -2.8847293853759766, "logps/chosen": -68.35820007324219, "logps/rejected": -73.15019226074219, "loss": 0.6705, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.12347419559955597, "rewards/margins": 0.04997570067644119, "rewards/rejected": -0.17344990372657776, "step": 3120 }, { "epoch": 0.5392832529290145, "grad_norm": 3.127723217010498, "learning_rate": 9.148818461596826e-08, "logits/chosen": -2.8254592418670654, "logits/rejected": -2.8169493675231934, "logps/chosen": -64.7706527709961, "logps/rejected": -69.00775146484375, "loss": 0.6801, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1268969476222992, "rewards/margins": 0.029782230034470558, "rewards/rejected": -0.1566791832447052, "step": 3130 }, { "epoch": 0.5410062026188835, "grad_norm": 3.4870545864105225, "learning_rate": 9.140407977887403e-08, "logits/chosen": -2.938673496246338, "logits/rejected": -2.9236607551574707, "logps/chosen": -65.6788101196289, "logps/rejected": -72.48387145996094, "loss": 0.6715, "rewards/accuracies": 0.625, "rewards/chosen": -0.13981683552265167, "rewards/margins": 0.047606609761714935, "rewards/rejected": -0.187423437833786, "step": 3140 }, { "epoch": 0.5427291523087526, "grad_norm": 3.362210273742676, "learning_rate": 9.131960052149834e-08, "logits/chosen": -2.9392380714416504, "logits/rejected": -2.910787582397461, "logps/chosen": -66.6327133178711, "logps/rejected": -71.1677474975586, "loss": 0.6691, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.12263756990432739, "rewards/margins": 0.053017932921648026, "rewards/rejected": -0.17565548419952393, "step": 3150 }, { "epoch": 0.5444521019986216, "grad_norm": 3.542495012283325, "learning_rate": 9.123474760779359e-08, "logits/chosen": -2.8761775493621826, "logits/rejected": -2.8476626873016357, "logps/chosen": -66.16803741455078, "logps/rejected": -70.11380767822266, "loss": 0.6685, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.12021736055612564, "rewards/margins": 0.05458689481019974, "rewards/rejected": -0.17480425536632538, "step": 3160 }, { "epoch": 0.5461750516884907, "grad_norm": 3.6144564151763916, "learning_rate": 9.114952180509124e-08, "logits/chosen": -2.9056293964385986, "logits/rejected": -2.873357057571411, "logps/chosen": -69.16236877441406, "logps/rejected": -69.51313781738281, "loss": 0.6625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1327294409275055, "rewards/margins": 0.06754171848297119, "rewards/rejected": -0.20027117431163788, "step": 3170 }, { "epoch": 0.5478980013783598, "grad_norm": 3.13999605178833, "learning_rate": 9.106392388409477e-08, "logits/chosen": -2.957420587539673, "logits/rejected": -2.920548677444458, "logps/chosen": -66.63191223144531, "logps/rejected": -69.84432983398438, "loss": 0.6698, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1337551772594452, "rewards/margins": 0.0504077672958374, "rewards/rejected": -0.1841629445552826, "step": 3180 }, { "epoch": 0.5496209510682288, "grad_norm": 3.5873255729675293, "learning_rate": 9.097795461887277e-08, "logits/chosen": -2.9530131816864014, "logits/rejected": -2.9471516609191895, "logps/chosen": -66.12956237792969, "logps/rejected": -77.18992614746094, "loss": 0.6636, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.13831885159015656, "rewards/margins": 0.0669272169470787, "rewards/rejected": -0.20524606108665466, "step": 3190 }, { "epoch": 0.5513439007580979, "grad_norm": 3.6692728996276855, "learning_rate": 9.089161478685192e-08, "logits/chosen": -2.8761074542999268, "logits/rejected": -2.861693859100342, "logps/chosen": -67.71090698242188, "logps/rejected": -69.20539855957031, "loss": 0.6718, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.12382230907678604, "rewards/margins": 0.04690946266055107, "rewards/rejected": -0.17073175311088562, "step": 3200 }, { "epoch": 0.5513439007580979, "eval_logits/chosen": -3.003952741622925, "eval_logits/rejected": -2.998213052749634, "eval_logps/chosen": -68.10050201416016, "eval_logps/rejected": -74.91475677490234, "eval_loss": 0.6827961802482605, "eval_rewards/accuracies": 0.6015334725379944, "eval_rewards/chosen": -0.09388609230518341, "eval_rewards/margins": 0.023460378870368004, "eval_rewards/rejected": -0.11734647303819656, "eval_runtime": 360.6231, "eval_samples_per_second": 11.935, "eval_steps_per_second": 1.492, "step": 3200 }, { "epoch": 0.5530668504479669, "grad_norm": 3.8423609733581543, "learning_rate": 9.080490516880998e-08, "logits/chosen": -2.79463791847229, "logits/rejected": -2.7705302238464355, "logps/chosen": -67.96089172363281, "logps/rejected": -71.14826202392578, "loss": 0.6721, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.14009490609169006, "rewards/margins": 0.04641476646065712, "rewards/rejected": -0.18650969862937927, "step": 3210 }, { "epoch": 0.554789800137836, "grad_norm": 3.280461311340332, "learning_rate": 9.07178265488687e-08, "logits/chosen": -2.828249931335449, "logits/rejected": -2.823260545730591, "logps/chosen": -65.09806823730469, "logps/rejected": -71.36117553710938, "loss": 0.6766, "rewards/accuracies": 0.625, "rewards/chosen": -0.1475423276424408, "rewards/margins": 0.03724092245101929, "rewards/rejected": -0.18478327989578247, "step": 3220 }, { "epoch": 0.556512749827705, "grad_norm": 3.3534491062164307, "learning_rate": 9.063037971448675e-08, "logits/chosen": -2.882533550262451, "logits/rejected": -2.8667454719543457, "logps/chosen": -68.42729949951172, "logps/rejected": -73.65990447998047, "loss": 0.6802, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.14269927144050598, "rewards/margins": 0.030112752690911293, "rewards/rejected": -0.17281204462051392, "step": 3230 }, { "epoch": 0.5582356995175741, "grad_norm": 3.23046612739563, "learning_rate": 9.054256545645258e-08, "logits/chosen": -3.001438617706299, "logits/rejected": -2.9791247844696045, "logps/chosen": -72.44905090332031, "logps/rejected": -72.3000717163086, "loss": 0.6777, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16212376952171326, "rewards/margins": 0.035440459847450256, "rewards/rejected": -0.19756419956684113, "step": 3240 }, { "epoch": 0.5599586492074431, "grad_norm": 3.267273426055908, "learning_rate": 9.045438456887727e-08, "logits/chosen": -2.8932247161865234, "logits/rejected": -2.8726253509521484, "logps/chosen": -66.225830078125, "logps/rejected": -72.84576416015625, "loss": 0.666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.13653263449668884, "rewards/margins": 0.05883949249982834, "rewards/rejected": -0.19537213444709778, "step": 3250 }, { "epoch": 0.5616815988973122, "grad_norm": 3.519906759262085, "learning_rate": 9.036583784918741e-08, "logits/chosen": -2.9068236351013184, "logits/rejected": -2.8895974159240723, "logps/chosen": -66.02603149414062, "logps/rejected": -72.74566650390625, "loss": 0.6693, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.14490602910518646, "rewards/margins": 0.05516090244054794, "rewards/rejected": -0.2000669240951538, "step": 3260 }, { "epoch": 0.5634045485871813, "grad_norm": 3.5675628185272217, "learning_rate": 9.027692609811777e-08, "logits/chosen": -2.9633305072784424, "logits/rejected": -2.9593842029571533, "logps/chosen": -70.6051025390625, "logps/rejected": -78.98965454101562, "loss": 0.6675, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.15475505590438843, "rewards/margins": 0.05626174807548523, "rewards/rejected": -0.21101677417755127, "step": 3270 }, { "epoch": 0.5651274982770503, "grad_norm": 3.5583724975585938, "learning_rate": 9.018765011970419e-08, "logits/chosen": -2.9657270908355713, "logits/rejected": -2.9546895027160645, "logps/chosen": -64.42697143554688, "logps/rejected": -71.25099182128906, "loss": 0.6763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1500626802444458, "rewards/margins": 0.038469213992357254, "rewards/rejected": -0.18853187561035156, "step": 3280 }, { "epoch": 0.5668504479669194, "grad_norm": 3.300567388534546, "learning_rate": 9.00980107212762e-08, "logits/chosen": -3.010849952697754, "logits/rejected": -2.980490207672119, "logps/chosen": -67.35631561279297, "logps/rejected": -73.60289001464844, "loss": 0.6652, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.141900897026062, "rewards/margins": 0.060899633914232254, "rewards/rejected": -0.20280051231384277, "step": 3290 }, { "epoch": 0.5685733976567884, "grad_norm": 3.9140853881835938, "learning_rate": 9.000800871344979e-08, "logits/chosen": -2.8878273963928223, "logits/rejected": -2.8666417598724365, "logps/chosen": -70.4729995727539, "logps/rejected": -73.83319854736328, "loss": 0.6724, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.15556514263153076, "rewards/margins": 0.04699654132127762, "rewards/rejected": -0.20256169140338898, "step": 3300 }, { "epoch": 0.5685733976567884, "eval_logits/chosen": -2.990844488143921, "eval_logits/rejected": -2.985057830810547, "eval_logps/chosen": -68.70267486572266, "eval_logps/rejected": -75.66940307617188, "eval_loss": 0.6821591854095459, "eval_rewards/accuracies": 0.6050186157226562, "eval_rewards/chosen": -0.0999077558517456, "eval_rewards/margins": 0.0249850582331419, "eval_rewards/rejected": -0.12489282339811325, "eval_runtime": 360.6919, "eval_samples_per_second": 11.933, "eval_steps_per_second": 1.492, "step": 3300 }, { "epoch": 0.5702963473466575, "grad_norm": 3.486947536468506, "learning_rate": 8.991764491012004e-08, "logits/chosen": -2.8264219760894775, "logits/rejected": -2.7986056804656982, "logps/chosen": -67.07157135009766, "logps/rejected": -70.15736389160156, "loss": 0.6684, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13972768187522888, "rewards/margins": 0.0548524372279644, "rewards/rejected": -0.1945801079273224, "step": 3310 }, { "epoch": 0.5720192970365265, "grad_norm": 4.112346649169922, "learning_rate": 8.982692012845379e-08, "logits/chosen": -2.8834726810455322, "logits/rejected": -2.8484654426574707, "logps/chosen": -69.16401672363281, "logps/rejected": -70.00126647949219, "loss": 0.6676, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.15456373989582062, "rewards/margins": 0.055997978895902634, "rewards/rejected": -0.21056172251701355, "step": 3320 }, { "epoch": 0.5737422467263956, "grad_norm": 3.7715468406677246, "learning_rate": 8.973583518888222e-08, "logits/chosen": -2.983250141143799, "logits/rejected": -2.952721118927002, "logps/chosen": -67.77142333984375, "logps/rejected": -73.4288101196289, "loss": 0.6646, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.13558022677898407, "rewards/margins": 0.0619480200111866, "rewards/rejected": -0.19752824306488037, "step": 3330 }, { "epoch": 0.5754651964162646, "grad_norm": 3.686140537261963, "learning_rate": 8.964439091509344e-08, "logits/chosen": -2.921363115310669, "logits/rejected": -2.9037246704101562, "logps/chosen": -71.49298095703125, "logps/rejected": -75.01543426513672, "loss": 0.6687, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.15579842031002045, "rewards/margins": 0.05456198379397392, "rewards/rejected": -0.21036040782928467, "step": 3340 }, { "epoch": 0.5771881461061337, "grad_norm": 3.439488410949707, "learning_rate": 8.955258813402509e-08, "logits/chosen": -2.8145248889923096, "logits/rejected": -2.8000881671905518, "logps/chosen": -71.99406433105469, "logps/rejected": -76.73139953613281, "loss": 0.6793, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16215400397777557, "rewards/margins": 0.03191717341542244, "rewards/rejected": -0.1940712034702301, "step": 3350 }, { "epoch": 0.5789110957960028, "grad_norm": 3.584379196166992, "learning_rate": 8.946042767585676e-08, "logits/chosen": -2.9319262504577637, "logits/rejected": -2.89823842048645, "logps/chosen": -70.10853576660156, "logps/rejected": -72.36190795898438, "loss": 0.6714, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.15293297171592712, "rewards/margins": 0.048371512442827225, "rewards/rejected": -0.20130451023578644, "step": 3360 }, { "epoch": 0.5806340454858718, "grad_norm": 3.598647117614746, "learning_rate": 8.936791037400258e-08, "logits/chosen": -2.870156764984131, "logits/rejected": -2.860452175140381, "logps/chosen": -66.26335144042969, "logps/rejected": -74.37271881103516, "loss": 0.6707, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15308070182800293, "rewards/margins": 0.05050992965698242, "rewards/rejected": -0.20359063148498535, "step": 3370 }, { "epoch": 0.5823569951757409, "grad_norm": 3.889470100402832, "learning_rate": 8.927503706510364e-08, "logits/chosen": -2.962991237640381, "logits/rejected": -2.932013988494873, "logps/chosen": -69.73117065429688, "logps/rejected": -70.13800048828125, "loss": 0.6674, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.14270296692848206, "rewards/margins": 0.05723966285586357, "rewards/rejected": -0.19994261860847473, "step": 3380 }, { "epoch": 0.5840799448656099, "grad_norm": 3.23209547996521, "learning_rate": 8.91818085890204e-08, "logits/chosen": -2.8833847045898438, "logits/rejected": -2.8576457500457764, "logps/chosen": -69.89290618896484, "logps/rejected": -71.88001251220703, "loss": 0.6744, "rewards/accuracies": 0.625, "rewards/chosen": -0.1616523414850235, "rewards/margins": 0.0428120493888855, "rewards/rejected": -0.204464390873909, "step": 3390 }, { "epoch": 0.585802894555479, "grad_norm": 3.751899480819702, "learning_rate": 8.908822578882518e-08, "logits/chosen": -2.931617021560669, "logits/rejected": -2.8930277824401855, "logps/chosen": -67.65100860595703, "logps/rejected": -72.86280822753906, "loss": 0.6625, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.1479429006576538, "rewards/margins": 0.06683917343616486, "rewards/rejected": -0.21478208899497986, "step": 3400 }, { "epoch": 0.585802894555479, "eval_logits/chosen": -2.981968879699707, "eval_logits/rejected": -2.9761545658111572, "eval_logps/chosen": -68.8060302734375, "eval_logps/rejected": -75.84404754638672, "eval_loss": 0.6818387508392334, "eval_rewards/accuracies": 0.6089683771133423, "eval_rewards/chosen": -0.10094138979911804, "eval_rewards/margins": 0.025697872042655945, "eval_rewards/rejected": -0.12663927674293518, "eval_runtime": 359.5659, "eval_samples_per_second": 11.97, "eval_steps_per_second": 1.496, "step": 3400 }, { "epoch": 0.587525844245348, "grad_norm": 3.432067632675171, "learning_rate": 8.899428951079443e-08, "logits/chosen": -2.8233437538146973, "logits/rejected": -2.7948660850524902, "logps/chosen": -64.45096588134766, "logps/rejected": -67.34065246582031, "loss": 0.6687, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14877715706825256, "rewards/margins": 0.05394697189331055, "rewards/rejected": -0.2027241289615631, "step": 3410 }, { "epoch": 0.5892487939352171, "grad_norm": 4.382596015930176, "learning_rate": 8.890000060440115e-08, "logits/chosen": -2.845134735107422, "logits/rejected": -2.821000576019287, "logps/chosen": -66.77651977539062, "logps/rejected": -72.5694808959961, "loss": 0.6694, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.14009198546409607, "rewards/margins": 0.05261436849832535, "rewards/rejected": -0.19270634651184082, "step": 3420 }, { "epoch": 0.5909717436250862, "grad_norm": 3.418834924697876, "learning_rate": 8.880535992230718e-08, "logits/chosen": -2.9278876781463623, "logits/rejected": -2.9076414108276367, "logps/chosen": -66.03239440917969, "logps/rejected": -73.9791030883789, "loss": 0.6721, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.14691507816314697, "rewards/margins": 0.046559590846300125, "rewards/rejected": -0.1934746652841568, "step": 3430 }, { "epoch": 0.5926946933149552, "grad_norm": 3.650676727294922, "learning_rate": 8.871036832035547e-08, "logits/chosen": -2.8495564460754395, "logits/rejected": -2.8339741230010986, "logps/chosen": -69.0568618774414, "logps/rejected": -73.52063751220703, "loss": 0.6743, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.16483992338180542, "rewards/margins": 0.04235614091157913, "rewards/rejected": -0.20719607174396515, "step": 3440 }, { "epoch": 0.5944176430048242, "grad_norm": 3.656323194503784, "learning_rate": 8.861502665756244e-08, "logits/chosen": -3.0892930030822754, "logits/rejected": -3.049489974975586, "logps/chosen": -74.71489715576172, "logps/rejected": -75.1200180053711, "loss": 0.6713, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.15335428714752197, "rewards/margins": 0.048375897109508514, "rewards/rejected": -0.20173020660877228, "step": 3450 }, { "epoch": 0.5961405926946933, "grad_norm": 3.811377763748169, "learning_rate": 8.851933579611007e-08, "logits/chosen": -2.8736202716827393, "logits/rejected": -2.861013889312744, "logps/chosen": -65.99501037597656, "logps/rejected": -71.20674133300781, "loss": 0.6767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13815616071224213, "rewards/margins": 0.03790999576449394, "rewards/rejected": -0.17606614530086517, "step": 3460 }, { "epoch": 0.5978635423845624, "grad_norm": 3.395078182220459, "learning_rate": 8.842329660133815e-08, "logits/chosen": -2.9512839317321777, "logits/rejected": -2.9359641075134277, "logps/chosen": -70.45713806152344, "logps/rejected": -74.82063293457031, "loss": 0.6686, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.14646930992603302, "rewards/margins": 0.05837889388203621, "rewards/rejected": -0.20484820008277893, "step": 3470 }, { "epoch": 0.5995864920744314, "grad_norm": 3.6889381408691406, "learning_rate": 8.832690994173655e-08, "logits/chosen": -2.8805530071258545, "logits/rejected": -2.8497469425201416, "logps/chosen": -69.79116821289062, "logps/rejected": -73.75949096679688, "loss": 0.663, "rewards/accuracies": 0.625, "rewards/chosen": -0.1467658281326294, "rewards/margins": 0.0719979852437973, "rewards/rejected": -0.2187638282775879, "step": 3480 }, { "epoch": 0.6013094417643005, "grad_norm": 4.063599109649658, "learning_rate": 8.823017668893726e-08, "logits/chosen": -2.88657283782959, "logits/rejected": -2.858245372772217, "logps/chosen": -69.35597229003906, "logps/rejected": -75.87593078613281, "loss": 0.6645, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.150865837931633, "rewards/margins": 0.06266475468873978, "rewards/rejected": -0.21353061497211456, "step": 3490 }, { "epoch": 0.6030323914541695, "grad_norm": 3.57769513130188, "learning_rate": 8.813309771770652e-08, "logits/chosen": -2.834378480911255, "logits/rejected": -2.826758861541748, "logps/chosen": -68.08917999267578, "logps/rejected": -74.96446228027344, "loss": 0.6742, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1608649641275406, "rewards/margins": 0.04216255620121956, "rewards/rejected": -0.20302753150463104, "step": 3500 }, { "epoch": 0.6030323914541695, "eval_logits/chosen": -2.9745237827301025, "eval_logits/rejected": -2.9687376022338867, "eval_logps/chosen": -69.42023468017578, "eval_logps/rejected": -76.5616683959961, "eval_loss": 0.6814299821853638, "eval_rewards/accuracies": 0.6082713603973389, "eval_rewards/chosen": -0.10708339512348175, "eval_rewards/margins": 0.026732003316283226, "eval_rewards/rejected": -0.13381539285182953, "eval_runtime": 360.2439, "eval_samples_per_second": 11.947, "eval_steps_per_second": 1.493, "step": 3500 }, { "epoch": 0.6047553411440386, "grad_norm": 3.326735019683838, "learning_rate": 8.803567390593694e-08, "logits/chosen": -2.775590181350708, "logits/rejected": -2.751251697540283, "logps/chosen": -70.67406463623047, "logps/rejected": -72.92584991455078, "loss": 0.6722, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.17106808722019196, "rewards/margins": 0.04674633592367172, "rewards/rejected": -0.21781444549560547, "step": 3510 }, { "epoch": 0.6064782908339077, "grad_norm": 3.649550437927246, "learning_rate": 8.793790613463953e-08, "logits/chosen": -2.8101563453674316, "logits/rejected": -2.7903876304626465, "logps/chosen": -70.46080017089844, "logps/rejected": -72.71427154541016, "loss": 0.6681, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.15126517415046692, "rewards/margins": 0.05522242188453674, "rewards/rejected": -0.20648758113384247, "step": 3520 }, { "epoch": 0.6082012405237767, "grad_norm": 3.7469167709350586, "learning_rate": 8.783979528793584e-08, "logits/chosen": -2.970606803894043, "logits/rejected": -2.928586006164551, "logps/chosen": -70.70912170410156, "logps/rejected": -72.60647583007812, "loss": 0.6616, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.14751648902893066, "rewards/margins": 0.0694083645939827, "rewards/rejected": -0.21692487597465515, "step": 3530 }, { "epoch": 0.6099241902136457, "grad_norm": 3.8952109813690186, "learning_rate": 8.774134225304974e-08, "logits/chosen": -2.797332286834717, "logits/rejected": -2.7737374305725098, "logps/chosen": -70.36532592773438, "logps/rejected": -74.63444519042969, "loss": 0.6728, "rewards/accuracies": 0.625, "rewards/chosen": -0.1645258665084839, "rewards/margins": 0.04614800959825516, "rewards/rejected": -0.21067388355731964, "step": 3540 }, { "epoch": 0.6116471399035148, "grad_norm": 3.9337563514709473, "learning_rate": 8.764254792029964e-08, "logits/chosen": -2.8850772380828857, "logits/rejected": -2.8755321502685547, "logps/chosen": -72.20113372802734, "logps/rejected": -73.35503387451172, "loss": 0.6799, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17637988924980164, "rewards/margins": 0.03165814280509949, "rewards/rejected": -0.20803804695606232, "step": 3550 }, { "epoch": 0.6133700895933839, "grad_norm": 3.5271224975585938, "learning_rate": 8.754341318309028e-08, "logits/chosen": -2.816992998123169, "logits/rejected": -2.7901835441589355, "logps/chosen": -70.08598327636719, "logps/rejected": -74.62560272216797, "loss": 0.6711, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.17186033725738525, "rewards/margins": 0.04891379550099373, "rewards/rejected": -0.22077412903308868, "step": 3560 }, { "epoch": 0.6150930392832529, "grad_norm": 3.995802164077759, "learning_rate": 8.744393893790476e-08, "logits/chosen": -2.9562668800354004, "logits/rejected": -2.9225308895111084, "logps/chosen": -75.40304565429688, "logps/rejected": -74.3688735961914, "loss": 0.6856, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.18530330061912537, "rewards/margins": 0.026163259521126747, "rewards/rejected": -0.21146658062934875, "step": 3570 }, { "epoch": 0.616815988973122, "grad_norm": 3.804481267929077, "learning_rate": 8.73441260842963e-08, "logits/chosen": -2.8924849033355713, "logits/rejected": -2.870704174041748, "logps/chosen": -74.10020446777344, "logps/rejected": -79.51061248779297, "loss": 0.6656, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.15905411541461945, "rewards/margins": 0.061337023973464966, "rewards/rejected": -0.22039110958576202, "step": 3580 }, { "epoch": 0.618538938662991, "grad_norm": 3.658123016357422, "learning_rate": 8.724397552488023e-08, "logits/chosen": -2.880936861038208, "logits/rejected": -2.865353584289551, "logps/chosen": -72.49365234375, "logps/rejected": -77.39817810058594, "loss": 0.6698, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18068650364875793, "rewards/margins": 0.05343535542488098, "rewards/rejected": -0.23412184417247772, "step": 3590 }, { "epoch": 0.6202618883528601, "grad_norm": 4.0332818031311035, "learning_rate": 8.714348816532577e-08, "logits/chosen": -2.861480236053467, "logits/rejected": -2.8369815349578857, "logps/chosen": -71.11116027832031, "logps/rejected": -72.80549621582031, "loss": 0.6722, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.16464100778102875, "rewards/margins": 0.04780580848455429, "rewards/rejected": -0.21244683861732483, "step": 3600 }, { "epoch": 0.6202618883528601, "eval_logits/chosen": -2.9655280113220215, "eval_logits/rejected": -2.9596989154815674, "eval_logps/chosen": -69.97339630126953, "eval_logps/rejected": -77.21550750732422, "eval_loss": 0.6810084581375122, "eval_rewards/accuracies": 0.609897792339325, "eval_rewards/chosen": -0.11261503398418427, "eval_rewards/margins": 0.027738766744732857, "eval_rewards/rejected": -0.14035379886627197, "eval_runtime": 360.9369, "eval_samples_per_second": 11.925, "eval_steps_per_second": 1.491, "step": 3600 }, { "epoch": 0.6219848380427292, "grad_norm": 3.6863250732421875, "learning_rate": 8.704266491434787e-08, "logits/chosen": -2.827634572982788, "logits/rejected": -2.8120434284210205, "logps/chosen": -70.19799041748047, "logps/rejected": -72.06962585449219, "loss": 0.6765, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.18704785406589508, "rewards/margins": 0.039228882640600204, "rewards/rejected": -0.2262767106294632, "step": 3610 }, { "epoch": 0.6237077877325982, "grad_norm": 3.8912832736968994, "learning_rate": 8.694150668369892e-08, "logits/chosen": -2.786785125732422, "logits/rejected": -2.762024402618408, "logps/chosen": -70.12651062011719, "logps/rejected": -75.05070495605469, "loss": 0.6616, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.14618650078773499, "rewards/margins": 0.06834978610277176, "rewards/rejected": -0.21453626453876495, "step": 3620 }, { "epoch": 0.6254307374224672, "grad_norm": 3.563636064529419, "learning_rate": 8.68400143881606e-08, "logits/chosen": -2.940396785736084, "logits/rejected": -2.9337635040283203, "logps/chosen": -67.58637237548828, "logps/rejected": -79.30843353271484, "loss": 0.6603, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.16445663571357727, "rewards/margins": 0.0717763751745224, "rewards/rejected": -0.23623299598693848, "step": 3630 }, { "epoch": 0.6271536871123363, "grad_norm": 4.187912940979004, "learning_rate": 8.673818894553557e-08, "logits/chosen": -2.8915393352508545, "logits/rejected": -2.8772926330566406, "logps/chosen": -70.48597717285156, "logps/rejected": -77.50422668457031, "loss": 0.6668, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1544286608695984, "rewards/margins": 0.058818262070417404, "rewards/rejected": -0.2132469117641449, "step": 3640 }, { "epoch": 0.6288766368022054, "grad_norm": 3.832031488418579, "learning_rate": 8.663603127663912e-08, "logits/chosen": -2.842319965362549, "logits/rejected": -2.846832513809204, "logps/chosen": -66.86103820800781, "logps/rejected": -75.29658508300781, "loss": 0.6709, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.15779516100883484, "rewards/margins": 0.049841057509183884, "rewards/rejected": -0.20763623714447021, "step": 3650 }, { "epoch": 0.6305995864920745, "grad_norm": 3.538102149963379, "learning_rate": 8.653354230529094e-08, "logits/chosen": -2.894085645675659, "logits/rejected": -2.864441394805908, "logps/chosen": -70.91181945800781, "logps/rejected": -77.59912109375, "loss": 0.6612, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.17053073644638062, "rewards/margins": 0.07006730139255524, "rewards/rejected": -0.24059805274009705, "step": 3660 }, { "epoch": 0.6323225361819435, "grad_norm": 3.7949655055999756, "learning_rate": 8.643072295830669e-08, "logits/chosen": -2.8500585556030273, "logits/rejected": -2.8231747150421143, "logps/chosen": -70.67984771728516, "logps/rejected": -75.11799621582031, "loss": 0.6726, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.17828655242919922, "rewards/margins": 0.04741708189249039, "rewards/rejected": -0.225703626871109, "step": 3670 }, { "epoch": 0.6340454858718125, "grad_norm": 3.9998621940612793, "learning_rate": 8.632757416548961e-08, "logits/chosen": -2.8689355850219727, "logits/rejected": -2.830148696899414, "logps/chosen": -72.94480895996094, "logps/rejected": -73.00611877441406, "loss": 0.6736, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.18268983066082, "rewards/margins": 0.04636243358254433, "rewards/rejected": -0.22905227541923523, "step": 3680 }, { "epoch": 0.6357684355616816, "grad_norm": 4.77199649810791, "learning_rate": 8.62240968596222e-08, "logits/chosen": -2.806609630584717, "logits/rejected": -2.814436912536621, "logps/chosen": -69.28324890136719, "logps/rejected": -82.42266845703125, "loss": 0.6658, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.17189538478851318, "rewards/margins": 0.061388492584228516, "rewards/rejected": -0.2332838773727417, "step": 3690 }, { "epoch": 0.6374913852515507, "grad_norm": 3.955845355987549, "learning_rate": 8.612029197645772e-08, "logits/chosen": -2.8485870361328125, "logits/rejected": -2.822967052459717, "logps/chosen": -74.45802307128906, "logps/rejected": -82.77164459228516, "loss": 0.664, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.16860488057136536, "rewards/margins": 0.06524287164211273, "rewards/rejected": -0.23384776711463928, "step": 3700 }, { "epoch": 0.6374913852515507, "eval_logits/chosen": -2.954319953918457, "eval_logits/rejected": -2.9484646320343018, "eval_logps/chosen": -70.8017807006836, "eval_logps/rejected": -78.20402526855469, "eval_loss": 0.6803364157676697, "eval_rewards/accuracies": 0.6089683771133423, "eval_rewards/chosen": -0.12089894711971283, "eval_rewards/margins": 0.029340064153075218, "eval_rewards/rejected": -0.1502390056848526, "eval_runtime": 359.9254, "eval_samples_per_second": 11.958, "eval_steps_per_second": 1.495, "step": 3700 }, { "epoch": 0.6392143349414197, "grad_norm": 4.2370991706848145, "learning_rate": 8.601616045471168e-08, "logits/chosen": -2.8805429935455322, "logits/rejected": -2.849026918411255, "logps/chosen": -71.5788345336914, "logps/rejected": -73.38832092285156, "loss": 0.6726, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1771015226840973, "rewards/margins": 0.04644893854856491, "rewards/rejected": -0.2235504388809204, "step": 3710 }, { "epoch": 0.6409372846312887, "grad_norm": 3.7011959552764893, "learning_rate": 8.591170323605347e-08, "logits/chosen": -2.893097400665283, "logits/rejected": -2.875398635864258, "logps/chosen": -68.0528564453125, "logps/rejected": -76.62176513671875, "loss": 0.661, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.16156095266342163, "rewards/margins": 0.07080355286598206, "rewards/rejected": -0.2323645055294037, "step": 3720 }, { "epoch": 0.6426602343211578, "grad_norm": 4.4856791496276855, "learning_rate": 8.580692126509778e-08, "logits/chosen": -2.853728771209717, "logits/rejected": -2.830976963043213, "logps/chosen": -71.31932830810547, "logps/rejected": -74.6063461303711, "loss": 0.6615, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.16801698505878448, "rewards/margins": 0.07099028676748276, "rewards/rejected": -0.23900727927684784, "step": 3730 }, { "epoch": 0.6443831840110269, "grad_norm": 4.300098896026611, "learning_rate": 8.570181548939604e-08, "logits/chosen": -2.85255765914917, "logits/rejected": -2.8277950286865234, "logps/chosen": -70.86285400390625, "logps/rejected": -79.48960876464844, "loss": 0.6683, "rewards/accuracies": 0.625, "rewards/chosen": -0.17427803575992584, "rewards/margins": 0.05652335286140442, "rewards/rejected": -0.23080138862133026, "step": 3740 }, { "epoch": 0.646106133700896, "grad_norm": 4.036457538604736, "learning_rate": 8.559638685942782e-08, "logits/chosen": -2.9201862812042236, "logits/rejected": -2.892306089401245, "logps/chosen": -69.86690521240234, "logps/rejected": -76.1387710571289, "loss": 0.6639, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16548243165016174, "rewards/margins": 0.06511317193508148, "rewards/rejected": -0.23059561848640442, "step": 3750 }, { "epoch": 0.647829083390765, "grad_norm": 4.326759338378906, "learning_rate": 8.54906363285924e-08, "logits/chosen": -2.8713104724884033, "logits/rejected": -2.8680050373077393, "logps/chosen": -74.909423828125, "logps/rejected": -80.24607849121094, "loss": 0.671, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1903577744960785, "rewards/margins": 0.05150656774640083, "rewards/rejected": -0.24186435341835022, "step": 3760 }, { "epoch": 0.649552033080634, "grad_norm": 4.169382095336914, "learning_rate": 8.538456485319994e-08, "logits/chosen": -2.843334197998047, "logits/rejected": -2.8231163024902344, "logps/chosen": -71.5437240600586, "logps/rejected": -76.70804595947266, "loss": 0.6752, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.18002033233642578, "rewards/margins": 0.04156037047505379, "rewards/rejected": -0.22158071398735046, "step": 3770 }, { "epoch": 0.6512749827705031, "grad_norm": 3.6488869190216064, "learning_rate": 8.527817339246297e-08, "logits/chosen": -2.8893494606018066, "logits/rejected": -2.8518636226654053, "logps/chosen": -72.23634338378906, "logps/rejected": -73.34587097167969, "loss": 0.6675, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.18157723546028137, "rewards/margins": 0.05787067487835884, "rewards/rejected": -0.23944790661334991, "step": 3780 }, { "epoch": 0.6529979324603722, "grad_norm": 3.9068140983581543, "learning_rate": 8.517146290848767e-08, "logits/chosen": -2.842933177947998, "logits/rejected": -2.814704656600952, "logps/chosen": -71.16899108886719, "logps/rejected": -74.94173431396484, "loss": 0.6627, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.17180873453617096, "rewards/margins": 0.0677022859454155, "rewards/rejected": -0.23951101303100586, "step": 3790 }, { "epoch": 0.6547208821502413, "grad_norm": 5.100893974304199, "learning_rate": 8.506443436626513e-08, "logits/chosen": -2.8600306510925293, "logits/rejected": -2.828758716583252, "logps/chosen": -75.09590148925781, "logps/rejected": -76.66107177734375, "loss": 0.6644, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1740262657403946, "rewards/margins": 0.06390801072120667, "rewards/rejected": -0.23793426156044006, "step": 3800 }, { "epoch": 0.6547208821502413, "eval_logits/chosen": -2.944425344467163, "eval_logits/rejected": -2.9385876655578613, "eval_logps/chosen": -71.985107421875, "eval_logps/rejected": -79.5918197631836, "eval_loss": 0.6795005202293396, "eval_rewards/accuracies": 0.6110594868659973, "eval_rewards/chosen": -0.132732093334198, "eval_rewards/margins": 0.031384874135255814, "eval_rewards/rejected": -0.16411694884300232, "eval_runtime": 360.5198, "eval_samples_per_second": 11.938, "eval_steps_per_second": 1.492, "step": 3800 }, { "epoch": 0.6564438318401102, "grad_norm": 4.450614929199219, "learning_rate": 8.495708873366273e-08, "logits/chosen": -2.882823944091797, "logits/rejected": -2.8404746055603027, "logps/chosen": -76.32991027832031, "logps/rejected": -77.35725402832031, "loss": 0.6645, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.18861815333366394, "rewards/margins": 0.0643153265118599, "rewards/rejected": -0.25293344259262085, "step": 3810 }, { "epoch": 0.6581667815299793, "grad_norm": 4.633328914642334, "learning_rate": 8.48494269814153e-08, "logits/chosen": -2.851984739303589, "logits/rejected": -2.835164785385132, "logps/chosen": -72.11973571777344, "logps/rejected": -76.26155090332031, "loss": 0.6702, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.18021515011787415, "rewards/margins": 0.051523733884096146, "rewards/rejected": -0.23173888027668, "step": 3820 }, { "epoch": 0.6598897312198484, "grad_norm": 4.264226913452148, "learning_rate": 8.474145008311633e-08, "logits/chosen": -2.895481586456299, "logits/rejected": -2.8515303134918213, "logps/chosen": -76.1996078491211, "logps/rejected": -75.78540802001953, "loss": 0.6619, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.17522504925727844, "rewards/margins": 0.06902584433555603, "rewards/rejected": -0.24425086379051208, "step": 3830 }, { "epoch": 0.6616126809097175, "grad_norm": 4.619011878967285, "learning_rate": 8.463315901520923e-08, "logits/chosen": -2.823070526123047, "logits/rejected": -2.8097739219665527, "logps/chosen": -70.88066101074219, "logps/rejected": -83.83921813964844, "loss": 0.6574, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.16794264316558838, "rewards/margins": 0.08013279736042023, "rewards/rejected": -0.2480754405260086, "step": 3840 }, { "epoch": 0.6633356305995864, "grad_norm": 4.062577247619629, "learning_rate": 8.452455475697845e-08, "logits/chosen": -2.9455015659332275, "logits/rejected": -2.930697441101074, "logps/chosen": -75.87129211425781, "logps/rejected": -77.36614227294922, "loss": 0.6708, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1857018619775772, "rewards/margins": 0.05030056834220886, "rewards/rejected": -0.23600240051746368, "step": 3850 }, { "epoch": 0.6650585802894555, "grad_norm": 4.330991268157959, "learning_rate": 8.44156382905407e-08, "logits/chosen": -2.9505677223205566, "logits/rejected": -2.925079107284546, "logps/chosen": -75.33770751953125, "logps/rejected": -77.89942932128906, "loss": 0.6776, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.21157272160053253, "rewards/margins": 0.03799385949969292, "rewards/rejected": -0.24956655502319336, "step": 3860 }, { "epoch": 0.6667815299793246, "grad_norm": 4.8397111892700195, "learning_rate": 8.430641060083593e-08, "logits/chosen": -2.8415279388427734, "logits/rejected": -2.8184056282043457, "logps/chosen": -77.4899673461914, "logps/rejected": -80.97842407226562, "loss": 0.6633, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1863713264465332, "rewards/margins": 0.06677299737930298, "rewards/rejected": -0.2531442940235138, "step": 3870 }, { "epoch": 0.6685044796691937, "grad_norm": 4.591561317443848, "learning_rate": 8.419687267561858e-08, "logits/chosen": -2.875171184539795, "logits/rejected": -2.879774332046509, "logps/chosen": -72.00981140136719, "logps/rejected": -78.77186584472656, "loss": 0.6753, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.2113080769777298, "rewards/margins": 0.041494350880384445, "rewards/rejected": -0.25280243158340454, "step": 3880 }, { "epoch": 0.6702274293590628, "grad_norm": 4.114559173583984, "learning_rate": 8.408702550544853e-08, "logits/chosen": -2.9188356399536133, "logits/rejected": -2.892014503479004, "logps/chosen": -73.74089050292969, "logps/rejected": -78.37785339355469, "loss": 0.6636, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.19554482400417328, "rewards/margins": 0.06676265597343445, "rewards/rejected": -0.26230746507644653, "step": 3890 }, { "epoch": 0.6719503790489317, "grad_norm": 4.3273820877075195, "learning_rate": 8.39768700836822e-08, "logits/chosen": -2.896733283996582, "logits/rejected": -2.842710018157959, "logps/chosen": -76.47528839111328, "logps/rejected": -75.9754409790039, "loss": 0.6664, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.19646716117858887, "rewards/margins": 0.060664378106594086, "rewards/rejected": -0.25713151693344116, "step": 3900 }, { "epoch": 0.6719503790489317, "eval_logits/chosen": -2.9358367919921875, "eval_logits/rejected": -2.9299936294555664, "eval_logps/chosen": -73.20443725585938, "eval_logps/rejected": -81.02217102050781, "eval_loss": 0.6786462664604187, "eval_rewards/accuracies": 0.6080390214920044, "eval_rewards/chosen": -0.14492543041706085, "eval_rewards/margins": 0.03349505737423897, "eval_rewards/rejected": -0.17842045426368713, "eval_runtime": 359.9545, "eval_samples_per_second": 11.957, "eval_steps_per_second": 1.495, "step": 3900 }, { "epoch": 0.6736733287388008, "grad_norm": 3.9224681854248047, "learning_rate": 8.386640740646353e-08, "logits/chosen": -2.869636297225952, "logits/rejected": -2.854032516479492, "logps/chosen": -76.14567565917969, "logps/rejected": -82.44049835205078, "loss": 0.6604, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20504307746887207, "rewards/margins": 0.07330290973186493, "rewards/rejected": -0.2783460021018982, "step": 3910 }, { "epoch": 0.6753962784286699, "grad_norm": 4.396719932556152, "learning_rate": 8.375563847271506e-08, "logits/chosen": -2.9339499473571777, "logits/rejected": -2.8893306255340576, "logps/chosen": -77.58342742919922, "logps/rejected": -77.1610336303711, "loss": 0.6589, "rewards/accuracies": 0.6875, "rewards/chosen": -0.189140185713768, "rewards/margins": 0.07799036800861359, "rewards/rejected": -0.2671305537223816, "step": 3920 }, { "epoch": 0.677119228118539, "grad_norm": 4.47017765045166, "learning_rate": 8.364456428412874e-08, "logits/chosen": -2.937316417694092, "logits/rejected": -2.9325146675109863, "logps/chosen": -71.93880462646484, "logps/rejected": -79.04782104492188, "loss": 0.6691, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.20680996775627136, "rewards/margins": 0.055847086012363434, "rewards/rejected": -0.2626570761203766, "step": 3930 }, { "epoch": 0.6788421778084079, "grad_norm": 4.382420539855957, "learning_rate": 8.353318584515705e-08, "logits/chosen": -2.7662198543548584, "logits/rejected": -2.7476911544799805, "logps/chosen": -72.57732391357422, "logps/rejected": -78.52955627441406, "loss": 0.6678, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19077327847480774, "rewards/margins": 0.05754157155752182, "rewards/rejected": -0.24831485748291016, "step": 3940 }, { "epoch": 0.680565127498277, "grad_norm": 4.424745082855225, "learning_rate": 8.342150416300375e-08, "logits/chosen": -2.7651352882385254, "logits/rejected": -2.7198243141174316, "logps/chosen": -75.12512969970703, "logps/rejected": -77.98560333251953, "loss": 0.6601, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.19651785492897034, "rewards/margins": 0.07387096434831619, "rewards/rejected": -0.27038878202438354, "step": 3950 }, { "epoch": 0.6822880771881461, "grad_norm": 4.411109924316406, "learning_rate": 8.330952024761493e-08, "logits/chosen": -2.8689446449279785, "logits/rejected": -2.869525194168091, "logps/chosen": -72.38044738769531, "logps/rejected": -87.84962463378906, "loss": 0.656, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.19677282869815826, "rewards/margins": 0.0877741351723671, "rewards/rejected": -0.28454697132110596, "step": 3960 }, { "epoch": 0.6840110268780152, "grad_norm": 4.500481605529785, "learning_rate": 8.319723511166973e-08, "logits/chosen": -2.8778367042541504, "logits/rejected": -2.8542728424072266, "logps/chosen": -74.01570129394531, "logps/rejected": -80.1430435180664, "loss": 0.6614, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19942805171012878, "rewards/margins": 0.07196511328220367, "rewards/rejected": -0.2713931202888489, "step": 3970 }, { "epoch": 0.6857339765678843, "grad_norm": 4.250563621520996, "learning_rate": 8.308464977057131e-08, "logits/chosen": -2.7574124336242676, "logits/rejected": -2.7379393577575684, "logps/chosen": -74.24089813232422, "logps/rejected": -81.76484680175781, "loss": 0.659, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20901378989219666, "rewards/margins": 0.07665853202342987, "rewards/rejected": -0.2856723368167877, "step": 3980 }, { "epoch": 0.6874569262577532, "grad_norm": 4.266327381134033, "learning_rate": 8.297176524243754e-08, "logits/chosen": -2.862718105316162, "logits/rejected": -2.8492565155029297, "logps/chosen": -74.35490417480469, "logps/rejected": -82.00604248046875, "loss": 0.6684, "rewards/accuracies": 0.625, "rewards/chosen": -0.21138229966163635, "rewards/margins": 0.05601108819246292, "rewards/rejected": -0.26739341020584106, "step": 3990 }, { "epoch": 0.6891798759476223, "grad_norm": 4.685428619384766, "learning_rate": 8.285858254809193e-08, "logits/chosen": -2.930467367172241, "logits/rejected": -2.8961825370788574, "logps/chosen": -78.38148498535156, "logps/rejected": -80.21168518066406, "loss": 0.6653, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.21177248656749725, "rewards/margins": 0.06271915137767792, "rewards/rejected": -0.27449163794517517, "step": 4000 }, { "epoch": 0.6891798759476223, "eval_logits/chosen": -2.923618793487549, "eval_logits/rejected": -2.9177629947662354, "eval_logps/chosen": -74.3039779663086, "eval_logps/rejected": -82.26918029785156, "eval_loss": 0.6780802607536316, "eval_rewards/accuracies": 0.6057156324386597, "eval_rewards/chosen": -0.15592080354690552, "eval_rewards/margins": 0.034969717264175415, "eval_rewards/rejected": -0.19089052081108093, "eval_runtime": 360.2025, "eval_samples_per_second": 11.949, "eval_steps_per_second": 1.494, "step": 4000 }, { "epoch": 0.6909028256374914, "grad_norm": 4.136673927307129, "learning_rate": 8.274510271105428e-08, "logits/chosen": -2.8100674152374268, "logits/rejected": -2.8002233505249023, "logps/chosen": -69.55921936035156, "logps/rejected": -82.14431762695312, "loss": 0.6583, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.21539802849292755, "rewards/margins": 0.07587047666311264, "rewards/rejected": -0.291268527507782, "step": 4010 }, { "epoch": 0.6926257753273605, "grad_norm": 4.985778331756592, "learning_rate": 8.26313267575315e-08, "logits/chosen": -2.8186466693878174, "logits/rejected": -2.8054795265197754, "logps/chosen": -73.369873046875, "logps/rejected": -80.44734954833984, "loss": 0.6661, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2118435651063919, "rewards/margins": 0.061372630298137665, "rewards/rejected": -0.273216187953949, "step": 4020 }, { "epoch": 0.6943487250172296, "grad_norm": 4.250854969024658, "learning_rate": 8.251725571640831e-08, "logits/chosen": -2.8195624351501465, "logits/rejected": -2.7978997230529785, "logps/chosen": -75.46442413330078, "logps/rejected": -83.16146087646484, "loss": 0.6568, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1941043883562088, "rewards/margins": 0.07958976924419403, "rewards/rejected": -0.27369412779808044, "step": 4030 }, { "epoch": 0.6960716747070985, "grad_norm": 4.681697368621826, "learning_rate": 8.240289061923791e-08, "logits/chosen": -2.904080867767334, "logits/rejected": -2.8924145698547363, "logps/chosen": -77.6021728515625, "logps/rejected": -81.96739196777344, "loss": 0.6646, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.21409961581230164, "rewards/margins": 0.06508499383926392, "rewards/rejected": -0.27918460965156555, "step": 4040 }, { "epoch": 0.6977946243969676, "grad_norm": 4.321097373962402, "learning_rate": 8.228823250023268e-08, "logits/chosen": -2.812398672103882, "logits/rejected": -2.7948527336120605, "logps/chosen": -70.7357406616211, "logps/rejected": -81.06127166748047, "loss": 0.6577, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.20170316100120544, "rewards/margins": 0.08067599684000015, "rewards/rejected": -0.2823791801929474, "step": 4050 }, { "epoch": 0.6995175740868367, "grad_norm": 4.3770551681518555, "learning_rate": 8.21732823962548e-08, "logits/chosen": -2.8143866062164307, "logits/rejected": -2.782468557357788, "logps/chosen": -79.16787719726562, "logps/rejected": -81.5352783203125, "loss": 0.6644, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.22808189690113068, "rewards/margins": 0.06586854159832001, "rewards/rejected": -0.2939504384994507, "step": 4060 }, { "epoch": 0.7012405237767058, "grad_norm": 4.432036876678467, "learning_rate": 8.205804134680696e-08, "logits/chosen": -2.816871166229248, "logits/rejected": -2.7946834564208984, "logps/chosen": -76.47846984863281, "logps/rejected": -82.78707122802734, "loss": 0.6607, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.22646565735340118, "rewards/margins": 0.07262485474348068, "rewards/rejected": -0.2990904748439789, "step": 4070 }, { "epoch": 0.7029634734665747, "grad_norm": 4.366625785827637, "learning_rate": 8.194251039402279e-08, "logits/chosen": -2.782827377319336, "logits/rejected": -2.775156259536743, "logps/chosen": -73.0945053100586, "logps/rejected": -82.41346740722656, "loss": 0.6611, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.21194684505462646, "rewards/margins": 0.0719664990901947, "rewards/rejected": -0.28391334414482117, "step": 4080 }, { "epoch": 0.7046864231564438, "grad_norm": 4.243492603302002, "learning_rate": 8.182669058265762e-08, "logits/chosen": -2.8675835132598877, "logits/rejected": -2.837397813796997, "logps/chosen": -75.27025604248047, "logps/rejected": -82.02632904052734, "loss": 0.6589, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1979300081729889, "rewards/margins": 0.07707500457763672, "rewards/rejected": -0.2750049829483032, "step": 4090 }, { "epoch": 0.7064093728463129, "grad_norm": 4.171043872833252, "learning_rate": 8.17105829600789e-08, "logits/chosen": -2.8199868202209473, "logits/rejected": -2.791888952255249, "logps/chosen": -71.82962799072266, "logps/rejected": -77.13992309570312, "loss": 0.6532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18986184895038605, "rewards/margins": 0.08909063041210175, "rewards/rejected": -0.2789524495601654, "step": 4100 }, { "epoch": 0.7064093728463129, "eval_logits/chosen": -2.906397819519043, "eval_logits/rejected": -2.900517702102661, "eval_logps/chosen": -74.83631134033203, "eval_logps/rejected": -82.92964172363281, "eval_loss": 0.677558422088623, "eval_rewards/accuracies": 0.6124535202980042, "eval_rewards/chosen": -0.16124409437179565, "eval_rewards/margins": 0.03625102713704109, "eval_rewards/rejected": -0.19749511778354645, "eval_runtime": 359.7757, "eval_samples_per_second": 11.963, "eval_steps_per_second": 1.495, "step": 4100 }, { "epoch": 0.708132322536182, "grad_norm": 4.226618766784668, "learning_rate": 8.159418857625685e-08, "logits/chosen": -2.787879705429077, "logits/rejected": -2.776662826538086, "logps/chosen": -79.5539779663086, "logps/rejected": -79.31879425048828, "loss": 0.6755, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.23491552472114563, "rewards/margins": 0.04324030503630638, "rewards/rejected": -0.2781558632850647, "step": 4110 }, { "epoch": 0.709855272226051, "grad_norm": 4.805150032043457, "learning_rate": 8.147750848375478e-08, "logits/chosen": -2.7447409629821777, "logits/rejected": -2.7471377849578857, "logps/chosen": -73.45333862304688, "logps/rejected": -84.03813171386719, "loss": 0.6688, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.23488454520702362, "rewards/margins": 0.05762225389480591, "rewards/rejected": -0.29250678420066833, "step": 4120 }, { "epoch": 0.71157822191592, "grad_norm": 4.33480978012085, "learning_rate": 8.13605437377198e-08, "logits/chosen": -2.8447792530059814, "logits/rejected": -2.822270393371582, "logps/chosen": -73.93051147460938, "logps/rejected": -81.64012145996094, "loss": 0.6513, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.19962568581104279, "rewards/margins": 0.09144510328769684, "rewards/rejected": -0.2910707890987396, "step": 4130 }, { "epoch": 0.7133011716057891, "grad_norm": 4.122574806213379, "learning_rate": 8.124329539587311e-08, "logits/chosen": -2.8442020416259766, "logits/rejected": -2.8172695636749268, "logps/chosen": -78.91192626953125, "logps/rejected": -77.90412902832031, "loss": 0.678, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.23386287689208984, "rewards/margins": 0.03814762085676193, "rewards/rejected": -0.2720105051994324, "step": 4140 }, { "epoch": 0.7150241212956582, "grad_norm": 4.575338840484619, "learning_rate": 8.112576451850046e-08, "logits/chosen": -2.865945816040039, "logits/rejected": -2.84014630317688, "logps/chosen": -79.77760314941406, "logps/rejected": -81.44580841064453, "loss": 0.6583, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.21935932338237762, "rewards/margins": 0.07686015963554382, "rewards/rejected": -0.29621952772140503, "step": 4150 }, { "epoch": 0.7167470709855273, "grad_norm": 4.371513366699219, "learning_rate": 8.100795216844264e-08, "logits/chosen": -2.760072708129883, "logits/rejected": -2.7301230430603027, "logps/chosen": -74.31935119628906, "logps/rejected": -79.0527114868164, "loss": 0.6607, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2264113873243332, "rewards/margins": 0.07299733906984329, "rewards/rejected": -0.2994087338447571, "step": 4160 }, { "epoch": 0.7184700206753962, "grad_norm": 4.953804016113281, "learning_rate": 8.088985941108584e-08, "logits/chosen": -2.812103509902954, "logits/rejected": -2.804077625274658, "logps/chosen": -73.244140625, "logps/rejected": -82.40103912353516, "loss": 0.6621, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.21848125755786896, "rewards/margins": 0.06967557966709137, "rewards/rejected": -0.28815680742263794, "step": 4170 }, { "epoch": 0.7201929703652653, "grad_norm": 4.888758182525635, "learning_rate": 8.077148731435188e-08, "logits/chosen": -2.8084912300109863, "logits/rejected": -2.7935614585876465, "logps/chosen": -76.97647094726562, "logps/rejected": -85.00114440917969, "loss": 0.6621, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2163088619709015, "rewards/margins": 0.0708131492137909, "rewards/rejected": -0.2871219515800476, "step": 4180 }, { "epoch": 0.7219159200551344, "grad_norm": 5.011810302734375, "learning_rate": 8.065283694868883e-08, "logits/chosen": -2.7904458045959473, "logits/rejected": -2.7769782543182373, "logps/chosen": -74.02743530273438, "logps/rejected": -83.00337219238281, "loss": 0.6595, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22959303855895996, "rewards/margins": 0.07523669302463531, "rewards/rejected": -0.3048296868801117, "step": 4190 }, { "epoch": 0.7236388697450035, "grad_norm": 4.535158157348633, "learning_rate": 8.053390938706102e-08, "logits/chosen": -2.8413708209991455, "logits/rejected": -2.8256707191467285, "logps/chosen": -77.91133117675781, "logps/rejected": -79.3990478515625, "loss": 0.6733, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.22204849123954773, "rewards/margins": 0.046053197234869, "rewards/rejected": -0.26810169219970703, "step": 4200 }, { "epoch": 0.7236388697450035, "eval_logits/chosen": -2.8949339389801025, "eval_logits/rejected": -2.889023542404175, "eval_logps/chosen": -75.9118881225586, "eval_logps/rejected": -84.16388702392578, "eval_loss": 0.6769329309463501, "eval_rewards/accuracies": 0.6087360382080078, "eval_rewards/chosen": -0.17199990153312683, "eval_rewards/margins": 0.03783779591321945, "eval_rewards/rejected": -0.20983768999576569, "eval_runtime": 359.9545, "eval_samples_per_second": 11.957, "eval_steps_per_second": 1.495, "step": 4200 }, { "epoch": 0.7253618194348725, "grad_norm": 4.928792953491211, "learning_rate": 8.041470570493958e-08, "logits/chosen": -2.823927640914917, "logits/rejected": -2.802546977996826, "logps/chosen": -79.63623046875, "logps/rejected": -82.85804748535156, "loss": 0.6692, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.24339476227760315, "rewards/margins": 0.0574357807636261, "rewards/rejected": -0.30083051323890686, "step": 4210 }, { "epoch": 0.7270847691247415, "grad_norm": 4.286323547363281, "learning_rate": 8.029522698029257e-08, "logits/chosen": -2.681990623474121, "logits/rejected": -2.663794994354248, "logps/chosen": -74.3034439086914, "logps/rejected": -81.97090148925781, "loss": 0.6616, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22084763646125793, "rewards/margins": 0.07181050628423691, "rewards/rejected": -0.29265812039375305, "step": 4220 }, { "epoch": 0.7288077188146106, "grad_norm": 4.317086219787598, "learning_rate": 8.017547429357531e-08, "logits/chosen": -2.8607168197631836, "logits/rejected": -2.8414225578308105, "logps/chosen": -78.4534683227539, "logps/rejected": -85.2705078125, "loss": 0.658, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.23147408664226532, "rewards/margins": 0.08023453503847122, "rewards/rejected": -0.31170862913131714, "step": 4230 }, { "epoch": 0.7305306685044797, "grad_norm": 4.675597190856934, "learning_rate": 8.005544872772054e-08, "logits/chosen": -2.7655177116394043, "logits/rejected": -2.7412803173065186, "logps/chosen": -77.64450073242188, "logps/rejected": -87.1747055053711, "loss": 0.6567, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2290038764476776, "rewards/margins": 0.08320339769124985, "rewards/rejected": -0.31220728158950806, "step": 4240 }, { "epoch": 0.7322536181943488, "grad_norm": 4.733819484710693, "learning_rate": 7.993515136812874e-08, "logits/chosen": -2.779085636138916, "logits/rejected": -2.7630178928375244, "logps/chosen": -78.02629089355469, "logps/rejected": -81.59638977050781, "loss": 0.668, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.2297748625278473, "rewards/margins": 0.059504471719264984, "rewards/rejected": -0.28927934169769287, "step": 4250 }, { "epoch": 0.7339765678842178, "grad_norm": 4.640880107879639, "learning_rate": 7.981458330265815e-08, "logits/chosen": -2.7458930015563965, "logits/rejected": -2.731210231781006, "logps/chosen": -76.95973205566406, "logps/rejected": -84.32952880859375, "loss": 0.6629, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22431564331054688, "rewards/margins": 0.07044248282909393, "rewards/rejected": -0.294758141040802, "step": 4260 }, { "epoch": 0.7356995175740868, "grad_norm": 4.803411960601807, "learning_rate": 7.969374562161509e-08, "logits/chosen": -2.7621819972991943, "logits/rejected": -2.737330675125122, "logps/chosen": -71.40404510498047, "logps/rejected": -78.76124572753906, "loss": 0.6554, "rewards/accuracies": 0.71875, "rewards/chosen": -0.19795607030391693, "rewards/margins": 0.08418522775173187, "rewards/rejected": -0.2821413278579712, "step": 4270 }, { "epoch": 0.7374224672639559, "grad_norm": 4.555239200592041, "learning_rate": 7.957263941774402e-08, "logits/chosen": -2.7401576042175293, "logits/rejected": -2.714449882507324, "logps/chosen": -74.34765625, "logps/rejected": -82.82798767089844, "loss": 0.6584, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.22630128264427185, "rewards/margins": 0.08142432570457458, "rewards/rejected": -0.30772560834884644, "step": 4280 }, { "epoch": 0.739145416953825, "grad_norm": 4.382991313934326, "learning_rate": 7.945126578621763e-08, "logits/chosen": -2.778887987136841, "logits/rejected": -2.755769729614258, "logps/chosen": -79.17945861816406, "logps/rejected": -82.90766906738281, "loss": 0.6621, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2393958568572998, "rewards/margins": 0.07353595644235611, "rewards/rejected": -0.3129318356513977, "step": 4290 }, { "epoch": 0.740868366643694, "grad_norm": 4.42786979675293, "learning_rate": 7.932962582462707e-08, "logits/chosen": -2.796304225921631, "logits/rejected": -2.7852556705474854, "logps/chosen": -76.05867767333984, "logps/rejected": -83.70006561279297, "loss": 0.6618, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2350235879421234, "rewards/margins": 0.07132681459188461, "rewards/rejected": -0.3063504099845886, "step": 4300 }, { "epoch": 0.740868366643694, "eval_logits/chosen": -2.885286569595337, "eval_logits/rejected": -2.8793962001800537, "eval_logps/chosen": -76.69395446777344, "eval_logps/rejected": -85.07231140136719, "eval_loss": 0.6764479875564575, "eval_rewards/accuracies": 0.6057156324386597, "eval_rewards/chosen": -0.17982058227062225, "eval_rewards/margins": 0.039101339876651764, "eval_rewards/rejected": -0.21892189979553223, "eval_runtime": 359.6614, "eval_samples_per_second": 11.967, "eval_steps_per_second": 1.496, "step": 4300 }, { "epoch": 0.742591316333563, "grad_norm": 4.963508605957031, "learning_rate": 7.920772063297185e-08, "logits/chosen": -2.7422854900360107, "logits/rejected": -2.7238478660583496, "logps/chosen": -77.0846176147461, "logps/rejected": -82.02983856201172, "loss": 0.6602, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2303268015384674, "rewards/margins": 0.07409859448671341, "rewards/rejected": -0.30442532896995544, "step": 4310 }, { "epoch": 0.7443142660234321, "grad_norm": 4.418318271636963, "learning_rate": 7.908555131365e-08, "logits/chosen": -2.8225841522216797, "logits/rejected": -2.7945821285247803, "logps/chosen": -82.75288391113281, "logps/rejected": -89.62847137451172, "loss": 0.6444, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.24110881984233856, "rewards/margins": 0.10840406268835068, "rewards/rejected": -0.34951287508010864, "step": 4320 }, { "epoch": 0.7460372157133012, "grad_norm": 4.678439140319824, "learning_rate": 7.896311897144809e-08, "logits/chosen": -2.729358434677124, "logits/rejected": -2.6988437175750732, "logps/chosen": -81.18726348876953, "logps/rejected": -80.73506164550781, "loss": 0.6713, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.25696468353271484, "rewards/margins": 0.05245956778526306, "rewards/rejected": -0.3094242513179779, "step": 4330 }, { "epoch": 0.7477601654031703, "grad_norm": 4.287242412567139, "learning_rate": 7.884042471353122e-08, "logits/chosen": -2.7333014011383057, "logits/rejected": -2.715153694152832, "logps/chosen": -73.27593994140625, "logps/rejected": -83.72303771972656, "loss": 0.6543, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.24279603362083435, "rewards/margins": 0.08945179730653763, "rewards/rejected": -0.3322478234767914, "step": 4340 }, { "epoch": 0.7494831150930393, "grad_norm": 5.045931816101074, "learning_rate": 7.8717469649433e-08, "logits/chosen": -2.7423102855682373, "logits/rejected": -2.727919578552246, "logps/chosen": -73.34207916259766, "logps/rejected": -83.40766906738281, "loss": 0.6583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.22332239151000977, "rewards/margins": 0.07879458367824554, "rewards/rejected": -0.3021170198917389, "step": 4350 }, { "epoch": 0.7512060647829083, "grad_norm": 4.683145999908447, "learning_rate": 7.859425489104556e-08, "logits/chosen": -2.785099506378174, "logits/rejected": -2.7638652324676514, "logps/chosen": -80.40673065185547, "logps/rejected": -84.97071838378906, "loss": 0.6658, "rewards/accuracies": 0.65625, "rewards/chosen": -0.267350971698761, "rewards/margins": 0.06585561484098434, "rewards/rejected": -0.3332065939903259, "step": 4360 }, { "epoch": 0.7529290144727774, "grad_norm": 4.6525797843933105, "learning_rate": 7.847078155260942e-08, "logits/chosen": -2.8002583980560303, "logits/rejected": -2.7860169410705566, "logps/chosen": -79.53623962402344, "logps/rejected": -86.96631622314453, "loss": 0.6603, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.24577634036540985, "rewards/margins": 0.07589911669492722, "rewards/rejected": -0.32167547941207886, "step": 4370 }, { "epoch": 0.7546519641626465, "grad_norm": 4.781350135803223, "learning_rate": 7.834705075070352e-08, "logits/chosen": -2.8228507041931152, "logits/rejected": -2.8000426292419434, "logps/chosen": -79.38736724853516, "logps/rejected": -82.51011657714844, "loss": 0.6719, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2508380115032196, "rewards/margins": 0.0524689257144928, "rewards/rejected": -0.30330690741539, "step": 4380 }, { "epoch": 0.7563749138525155, "grad_norm": 4.321199893951416, "learning_rate": 7.8223063604235e-08, "logits/chosen": -2.75453782081604, "logits/rejected": -2.7411556243896484, "logps/chosen": -78.19532775878906, "logps/rejected": -86.12273406982422, "loss": 0.6592, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.24958685040473938, "rewards/margins": 0.07666632533073425, "rewards/rejected": -0.32625311613082886, "step": 4390 }, { "epoch": 0.7580978635423845, "grad_norm": 4.8006591796875, "learning_rate": 7.809882123442921e-08, "logits/chosen": -2.7704663276672363, "logits/rejected": -2.7536025047302246, "logps/chosen": -81.3585433959961, "logps/rejected": -86.84334564208984, "loss": 0.6625, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2445327341556549, "rewards/margins": 0.06969234347343445, "rewards/rejected": -0.3142250180244446, "step": 4400 }, { "epoch": 0.7580978635423845, "eval_logits/chosen": -2.8745179176330566, "eval_logits/rejected": -2.8686115741729736, "eval_logps/chosen": -78.0712890625, "eval_logps/rejected": -86.64641571044922, "eval_loss": 0.6756924986839294, "eval_rewards/accuracies": 0.6052509546279907, "eval_rewards/chosen": -0.19359391927719116, "eval_rewards/margins": 0.04106910154223442, "eval_rewards/rejected": -0.2346630096435547, "eval_runtime": 359.8765, "eval_samples_per_second": 11.96, "eval_steps_per_second": 1.495, "step": 4400 }, { "epoch": 0.7598208132322536, "grad_norm": 4.7382097244262695, "learning_rate": 7.797432476481942e-08, "logits/chosen": -2.7340545654296875, "logits/rejected": -2.7181458473205566, "logps/chosen": -81.9361572265625, "logps/rejected": -89.33692932128906, "loss": 0.6622, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2704450488090515, "rewards/margins": 0.07261786609888077, "rewards/rejected": -0.3430629074573517, "step": 4410 }, { "epoch": 0.7615437629221227, "grad_norm": 5.843469142913818, "learning_rate": 7.784957532123681e-08, "logits/chosen": -2.8479888439178467, "logits/rejected": -2.805508852005005, "logps/chosen": -80.79518127441406, "logps/rejected": -83.56358337402344, "loss": 0.6634, "rewards/accuracies": 0.625, "rewards/chosen": -0.2597918212413788, "rewards/margins": 0.07114500552415848, "rewards/rejected": -0.33093681931495667, "step": 4420 }, { "epoch": 0.7632667126119917, "grad_norm": 3.8952701091766357, "learning_rate": 7.772457403180022e-08, "logits/chosen": -2.740231513977051, "logits/rejected": -2.7093002796173096, "logps/chosen": -79.71388244628906, "logps/rejected": -83.66574096679688, "loss": 0.6669, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.25931215286254883, "rewards/margins": 0.06413620710372925, "rewards/rejected": -0.3234483599662781, "step": 4430 }, { "epoch": 0.7649896623018608, "grad_norm": 5.432117938995361, "learning_rate": 7.759932202690592e-08, "logits/chosen": -2.8032429218292236, "logits/rejected": -2.773001194000244, "logps/chosen": -86.66961669921875, "logps/rejected": -91.77088928222656, "loss": 0.6515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2726757824420929, "rewards/margins": 0.09480321407318115, "rewards/rejected": -0.36747899651527405, "step": 4440 }, { "epoch": 0.7667126119917298, "grad_norm": 4.670837879180908, "learning_rate": 7.747382043921741e-08, "logits/chosen": -2.676553964614868, "logits/rejected": -2.6650469303131104, "logps/chosen": -81.74214172363281, "logps/rejected": -87.35073852539062, "loss": 0.663, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2602686882019043, "rewards/margins": 0.0713680237531662, "rewards/rejected": -0.3316367268562317, "step": 4450 }, { "epoch": 0.7684355616815989, "grad_norm": 5.040444850921631, "learning_rate": 7.734807040365525e-08, "logits/chosen": -2.7904114723205566, "logits/rejected": -2.783573627471924, "logps/chosen": -78.90962219238281, "logps/rejected": -85.83683776855469, "loss": 0.6638, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.2617763876914978, "rewards/margins": 0.06999503076076508, "rewards/rejected": -0.3317714333534241, "step": 4460 }, { "epoch": 0.770158511371468, "grad_norm": 5.055511951446533, "learning_rate": 7.722207305738668e-08, "logits/chosen": -2.8303465843200684, "logits/rejected": -2.8013291358947754, "logps/chosen": -82.27478790283203, "logps/rejected": -85.03372192382812, "loss": 0.656, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.26352792978286743, "rewards/margins": 0.08442261070013046, "rewards/rejected": -0.3479505479335785, "step": 4470 }, { "epoch": 0.771881461061337, "grad_norm": 5.498683929443359, "learning_rate": 7.709582953981541e-08, "logits/chosen": -2.853652238845825, "logits/rejected": -2.8468830585479736, "logps/chosen": -78.6220703125, "logps/rejected": -86.2085952758789, "loss": 0.6668, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.276112824678421, "rewards/margins": 0.06350714713335037, "rewards/rejected": -0.3396199345588684, "step": 4480 }, { "epoch": 0.7736044107512061, "grad_norm": 5.022222518920898, "learning_rate": 7.696934099257128e-08, "logits/chosen": -2.744422197341919, "logits/rejected": -2.718071937561035, "logps/chosen": -79.3916015625, "logps/rejected": -85.3674087524414, "loss": 0.6663, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2825077176094055, "rewards/margins": 0.07279295474290848, "rewards/rejected": -0.3553006649017334, "step": 4490 }, { "epoch": 0.7753273604410751, "grad_norm": 5.13820743560791, "learning_rate": 7.684260855949997e-08, "logits/chosen": -2.7579360008239746, "logits/rejected": -2.7426867485046387, "logps/chosen": -83.91657257080078, "logps/rejected": -88.30168151855469, "loss": 0.6605, "rewards/accuracies": 0.65625, "rewards/chosen": -0.275812566280365, "rewards/margins": 0.07757066935300827, "rewards/rejected": -0.35338321328163147, "step": 4500 }, { "epoch": 0.7753273604410751, "eval_logits/chosen": -2.864949941635132, "eval_logits/rejected": -2.8590142726898193, "eval_logps/chosen": -79.67756652832031, "eval_logps/rejected": -88.53424072265625, "eval_loss": 0.6745909452438354, "eval_rewards/accuracies": 0.6066449880599976, "eval_rewards/chosen": -0.20965667068958282, "eval_rewards/margins": 0.043884556740522385, "eval_rewards/rejected": -0.2535412013530731, "eval_runtime": 359.4763, "eval_samples_per_second": 11.973, "eval_steps_per_second": 1.497, "step": 4500 }, { "epoch": 0.7770503101309442, "grad_norm": 5.928679466247559, "learning_rate": 7.671563338665262e-08, "logits/chosen": -2.8647067546844482, "logits/rejected": -2.8383216857910156, "logps/chosen": -80.19285583496094, "logps/rejected": -91.64847564697266, "loss": 0.6525, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.26760441064834595, "rewards/margins": 0.09270481765270233, "rewards/rejected": -0.3603092133998871, "step": 4510 }, { "epoch": 0.7787732598208132, "grad_norm": 5.4021406173706055, "learning_rate": 7.65884166222755e-08, "logits/chosen": -2.8022427558898926, "logits/rejected": -2.781902313232422, "logps/chosen": -81.70062255859375, "logps/rejected": -84.92115783691406, "loss": 0.6701, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.29102572798728943, "rewards/margins": 0.05737787485122681, "rewards/rejected": -0.34840360283851624, "step": 4520 }, { "epoch": 0.7804962095106823, "grad_norm": 5.309082508087158, "learning_rate": 7.646095941679962e-08, "logits/chosen": -2.7916605472564697, "logits/rejected": -2.782623767852783, "logps/chosen": -81.60151672363281, "logps/rejected": -86.02545928955078, "loss": 0.6752, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2782711982727051, "rewards/margins": 0.0471639521420002, "rewards/rejected": -0.32543516159057617, "step": 4530 }, { "epoch": 0.7822191592005513, "grad_norm": 6.1212687492370605, "learning_rate": 7.633326292283028e-08, "logits/chosen": -2.704936981201172, "logits/rejected": -2.6931707859039307, "logps/chosen": -79.34037780761719, "logps/rejected": -85.31973266601562, "loss": 0.6595, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2660858631134033, "rewards/margins": 0.0777621865272522, "rewards/rejected": -0.3438480496406555, "step": 4540 }, { "epoch": 0.7839421088904204, "grad_norm": 5.877651691436768, "learning_rate": 7.620532829513672e-08, "logits/chosen": -2.802441120147705, "logits/rejected": -2.7796969413757324, "logps/chosen": -82.77192687988281, "logps/rejected": -89.22770690917969, "loss": 0.6521, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.26413458585739136, "rewards/margins": 0.09422699362039566, "rewards/rejected": -0.35836154222488403, "step": 4550 }, { "epoch": 0.7856650585802895, "grad_norm": 4.796073913574219, "learning_rate": 7.607715669064162e-08, "logits/chosen": -2.711344003677368, "logits/rejected": -2.674924612045288, "logps/chosen": -81.03084564208984, "logps/rejected": -88.92893981933594, "loss": 0.6442, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2481379508972168, "rewards/margins": 0.11026115715503693, "rewards/rejected": -0.35839909315109253, "step": 4560 }, { "epoch": 0.7873880082701585, "grad_norm": 5.433099746704102, "learning_rate": 7.594874926841069e-08, "logits/chosen": -2.7318739891052246, "logits/rejected": -2.707764148712158, "logps/chosen": -79.96495056152344, "logps/rejected": -89.24745178222656, "loss": 0.6575, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2887458801269531, "rewards/margins": 0.08619584143161774, "rewards/rejected": -0.37494176626205444, "step": 4570 }, { "epoch": 0.7891109579600276, "grad_norm": 5.255424976348877, "learning_rate": 7.582010718964212e-08, "logits/chosen": -2.701725721359253, "logits/rejected": -2.675565242767334, "logps/chosen": -78.10517883300781, "logps/rejected": -84.2778549194336, "loss": 0.655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2384556531906128, "rewards/margins": 0.08670667558908463, "rewards/rejected": -0.3251623511314392, "step": 4580 }, { "epoch": 0.7908339076498966, "grad_norm": 5.248478412628174, "learning_rate": 7.569123161765611e-08, "logits/chosen": -2.7991232872009277, "logits/rejected": -2.7724077701568604, "logps/chosen": -83.50713348388672, "logps/rejected": -86.54850006103516, "loss": 0.6708, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2862285375595093, "rewards/margins": 0.05719224363565445, "rewards/rejected": -0.3434208035469055, "step": 4590 }, { "epoch": 0.7925568573397657, "grad_norm": 5.3829522132873535, "learning_rate": 7.556212371788441e-08, "logits/chosen": -2.682486057281494, "logits/rejected": -2.6631035804748535, "logps/chosen": -82.19569396972656, "logps/rejected": -94.45573425292969, "loss": 0.6437, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2883058190345764, "rewards/margins": 0.11274129152297974, "rewards/rejected": -0.40104714035987854, "step": 4600 }, { "epoch": 0.7925568573397657, "eval_logits/chosen": -2.8572607040405273, "eval_logits/rejected": -2.851304531097412, "eval_logps/chosen": -81.13443756103516, "eval_logps/rejected": -90.21497344970703, "eval_loss": 0.6737370491027832, "eval_rewards/accuracies": 0.6071096658706665, "eval_rewards/chosen": -0.22422541677951813, "eval_rewards/margins": 0.04612297564744949, "eval_rewards/rejected": -0.2703484296798706, "eval_runtime": 360.2206, "eval_samples_per_second": 11.948, "eval_steps_per_second": 1.494, "step": 4600 }, { "epoch": 0.7942798070296347, "grad_norm": 4.989021301269531, "learning_rate": 7.543278465785973e-08, "logits/chosen": -2.7107458114624023, "logits/rejected": -2.700726270675659, "logps/chosen": -82.72367858886719, "logps/rejected": -87.36424255371094, "loss": 0.6745, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.31585732102394104, "rewards/margins": 0.04831356555223465, "rewards/rejected": -0.3641708493232727, "step": 4610 }, { "epoch": 0.7960027567195038, "grad_norm": 6.328110694885254, "learning_rate": 7.530321560720508e-08, "logits/chosen": -2.74727201461792, "logits/rejected": -2.7238094806671143, "logps/chosen": -83.11624908447266, "logps/rejected": -92.23828887939453, "loss": 0.6525, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.273975133895874, "rewards/margins": 0.09328394383192062, "rewards/rejected": -0.36725908517837524, "step": 4620 }, { "epoch": 0.7977257064093728, "grad_norm": 5.385335922241211, "learning_rate": 7.517341773762341e-08, "logits/chosen": -2.780961513519287, "logits/rejected": -2.747697114944458, "logps/chosen": -86.0129165649414, "logps/rejected": -93.71475219726562, "loss": 0.6565, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2872922420501709, "rewards/margins": 0.08390410244464874, "rewards/rejected": -0.37119635939598083, "step": 4630 }, { "epoch": 0.7994486560992419, "grad_norm": 5.627150535583496, "learning_rate": 7.504339222288683e-08, "logits/chosen": -2.7807700634002686, "logits/rejected": -2.7660746574401855, "logps/chosen": -87.76912689208984, "logps/rejected": -95.35968017578125, "loss": 0.6655, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3301452398300171, "rewards/margins": 0.06794091314077377, "rewards/rejected": -0.39808616042137146, "step": 4640 }, { "epoch": 0.801171605789111, "grad_norm": 5.463457107543945, "learning_rate": 7.491314023882607e-08, "logits/chosen": -2.781244993209839, "logits/rejected": -2.7495713233947754, "logps/chosen": -84.5881576538086, "logps/rejected": -89.88493347167969, "loss": 0.6561, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.304373562335968, "rewards/margins": 0.08668698370456696, "rewards/rejected": -0.39106056094169617, "step": 4650 }, { "epoch": 0.80289455547898, "grad_norm": 6.089754104614258, "learning_rate": 7.478266296331988e-08, "logits/chosen": -2.7720773220062256, "logits/rejected": -2.754800319671631, "logps/chosen": -82.82508087158203, "logps/rejected": -91.43245697021484, "loss": 0.6551, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.28517812490463257, "rewards/margins": 0.0863594189286232, "rewards/rejected": -0.37153756618499756, "step": 4660 }, { "epoch": 0.8046175051688491, "grad_norm": 4.796602725982666, "learning_rate": 7.46519615762843e-08, "logits/chosen": -2.763443946838379, "logits/rejected": -2.7395286560058594, "logps/chosen": -80.69648742675781, "logps/rejected": -85.73301696777344, "loss": 0.6568, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2826913595199585, "rewards/margins": 0.08366372436285019, "rewards/rejected": -0.3663550913333893, "step": 4670 }, { "epoch": 0.8063404548587181, "grad_norm": 5.143124580383301, "learning_rate": 7.452103725966201e-08, "logits/chosen": -2.750540256500244, "logits/rejected": -2.7209665775299072, "logps/chosen": -82.34010314941406, "logps/rejected": -89.6563949584961, "loss": 0.6594, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2984951436519623, "rewards/margins": 0.08116915076971054, "rewards/rejected": -0.3796643614768982, "step": 4680 }, { "epoch": 0.8080634045485872, "grad_norm": 5.349360942840576, "learning_rate": 7.438989119741173e-08, "logits/chosen": -2.7934322357177734, "logits/rejected": -2.7713704109191895, "logps/chosen": -86.65959930419922, "logps/rejected": -95.86109924316406, "loss": 0.6485, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3089648187160492, "rewards/margins": 0.10664431750774384, "rewards/rejected": -0.41560912132263184, "step": 4690 }, { "epoch": 0.8097863542384562, "grad_norm": 5.843964576721191, "learning_rate": 7.425852457549736e-08, "logits/chosen": -2.763597249984741, "logits/rejected": -2.755293130874634, "logps/chosen": -83.36461639404297, "logps/rejected": -97.21564483642578, "loss": 0.6526, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.31630265712738037, "rewards/margins": 0.09895575791597366, "rewards/rejected": -0.4152584671974182, "step": 4700 }, { "epoch": 0.8097863542384562, "eval_logits/chosen": -2.8489065170288086, "eval_logits/rejected": -2.8429276943206787, "eval_logps/chosen": -82.56464385986328, "eval_logps/rejected": -91.90455627441406, "eval_loss": 0.6727486848831177, "eval_rewards/accuracies": 0.606877326965332, "eval_rewards/chosen": -0.23852740228176117, "eval_rewards/margins": 0.0487169586122036, "eval_rewards/rejected": -0.2872443199157715, "eval_runtime": 359.9733, "eval_samples_per_second": 11.956, "eval_steps_per_second": 1.495, "step": 4700 }, { "epoch": 0.8115093039283253, "grad_norm": 5.262428283691406, "learning_rate": 7.41269385818774e-08, "logits/chosen": -2.810303211212158, "logits/rejected": -2.7742409706115723, "logps/chosen": -88.20140075683594, "logps/rejected": -92.91087341308594, "loss": 0.653, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.3025413155555725, "rewards/margins": 0.1027384027838707, "rewards/rejected": -0.4052796959877014, "step": 4710 }, { "epoch": 0.8132322536181944, "grad_norm": 5.286714553833008, "learning_rate": 7.399513440649412e-08, "logits/chosen": -2.6862411499023438, "logits/rejected": -2.657238483428955, "logps/chosen": -86.468017578125, "logps/rejected": -93.57646179199219, "loss": 0.6442, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.29665708541870117, "rewards/margins": 0.11476937681436539, "rewards/rejected": -0.41142645478248596, "step": 4720 }, { "epoch": 0.8149552033080634, "grad_norm": 5.766880989074707, "learning_rate": 7.386311324126282e-08, "logits/chosen": -2.680243968963623, "logits/rejected": -2.656158924102783, "logps/chosen": -86.38379669189453, "logps/rejected": -90.43648529052734, "loss": 0.667, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3263756334781647, "rewards/margins": 0.06575731933116913, "rewards/rejected": -0.3921329379081726, "step": 4730 }, { "epoch": 0.8166781529979324, "grad_norm": 5.2600297927856445, "learning_rate": 7.373087628006106e-08, "logits/chosen": -2.7628931999206543, "logits/rejected": -2.751981258392334, "logps/chosen": -83.48346710205078, "logps/rejected": -92.15267181396484, "loss": 0.6531, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2904190421104431, "rewards/margins": 0.09240089356899261, "rewards/rejected": -0.38281992077827454, "step": 4740 }, { "epoch": 0.8184011026878015, "grad_norm": 5.543763637542725, "learning_rate": 7.359842471871787e-08, "logits/chosen": -2.7339413166046143, "logits/rejected": -2.7162299156188965, "logps/chosen": -83.31120300292969, "logps/rejected": -92.21416473388672, "loss": 0.6511, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.27919134497642517, "rewards/margins": 0.09701523184776306, "rewards/rejected": -0.37620657682418823, "step": 4750 }, { "epoch": 0.8201240523776706, "grad_norm": 6.084500789642334, "learning_rate": 7.346575975500291e-08, "logits/chosen": -2.73199725151062, "logits/rejected": -2.7106871604919434, "logps/chosen": -84.28748321533203, "logps/rejected": -91.7631607055664, "loss": 0.657, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.3181541860103607, "rewards/margins": 0.08389735966920853, "rewards/rejected": -0.40205153822898865, "step": 4760 }, { "epoch": 0.8218470020675396, "grad_norm": 5.495306491851807, "learning_rate": 7.333288258861567e-08, "logits/chosen": -2.7941107749938965, "logits/rejected": -2.773500919342041, "logps/chosen": -92.41073608398438, "logps/rejected": -96.80774688720703, "loss": 0.6574, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3282061517238617, "rewards/margins": 0.08787950128316879, "rewards/rejected": -0.4160856604576111, "step": 4770 }, { "epoch": 0.8235699517574087, "grad_norm": 6.325778007507324, "learning_rate": 7.31997944211746e-08, "logits/chosen": -2.6705031394958496, "logits/rejected": -2.651705741882324, "logps/chosen": -81.9596939086914, "logps/rejected": -96.26366424560547, "loss": 0.6408, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.30489587783813477, "rewards/margins": 0.11830363422632217, "rewards/rejected": -0.42319950461387634, "step": 4780 }, { "epoch": 0.8252929014472777, "grad_norm": 4.711288928985596, "learning_rate": 7.306649645620623e-08, "logits/chosen": -2.7664904594421387, "logits/rejected": -2.7429041862487793, "logps/chosen": -91.65912628173828, "logps/rejected": -91.39378356933594, "loss": 0.6751, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.3386782109737396, "rewards/margins": 0.0473986491560936, "rewards/rejected": -0.3860768675804138, "step": 4790 }, { "epoch": 0.8270158511371468, "grad_norm": 5.897047996520996, "learning_rate": 7.293298989913435e-08, "logits/chosen": -2.6948139667510986, "logits/rejected": -2.6794960498809814, "logps/chosen": -82.49443054199219, "logps/rejected": -88.50199127197266, "loss": 0.6604, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3049885630607605, "rewards/margins": 0.07673478126525879, "rewards/rejected": -0.3817233443260193, "step": 4800 }, { "epoch": 0.8270158511371468, "eval_logits/chosen": -2.840977191925049, "eval_logits/rejected": -2.8350658416748047, "eval_logps/chosen": -83.65935516357422, "eval_logps/rejected": -93.16956329345703, "eval_loss": 0.6720952987670898, "eval_rewards/accuracies": 0.6089683771133423, "eval_rewards/chosen": -0.2494746446609497, "eval_rewards/margins": 0.05041969567537308, "eval_rewards/rejected": -0.2998943626880646, "eval_runtime": 360.2892, "eval_samples_per_second": 11.946, "eval_steps_per_second": 1.493, "step": 4800 }, { "epoch": 0.8287388008270159, "grad_norm": 5.640036106109619, "learning_rate": 7.279927595726899e-08, "logits/chosen": -2.7054238319396973, "logits/rejected": -2.6721575260162354, "logps/chosen": -88.3663558959961, "logps/rejected": -93.75032043457031, "loss": 0.6497, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.3115006685256958, "rewards/margins": 0.1022612601518631, "rewards/rejected": -0.4137619137763977, "step": 4810 }, { "epoch": 0.8304617505168849, "grad_norm": 5.218100547790527, "learning_rate": 7.266535583979565e-08, "logits/chosen": -2.7404797077178955, "logits/rejected": -2.7312655448913574, "logps/chosen": -82.75204467773438, "logps/rejected": -92.75565338134766, "loss": 0.6541, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3159467577934265, "rewards/margins": 0.09343425929546356, "rewards/rejected": -0.40938109159469604, "step": 4820 }, { "epoch": 0.832184700206754, "grad_norm": 4.796080589294434, "learning_rate": 7.253123075776428e-08, "logits/chosen": -2.7532737255096436, "logits/rejected": -2.731318950653076, "logps/chosen": -86.41776275634766, "logps/rejected": -91.73139953613281, "loss": 0.6688, "rewards/accuracies": 0.5625, "rewards/chosen": -0.32840853929519653, "rewards/margins": 0.06262843310832977, "rewards/rejected": -0.3910369277000427, "step": 4830 }, { "epoch": 0.833907649896623, "grad_norm": 5.210707664489746, "learning_rate": 7.239690192407829e-08, "logits/chosen": -2.7101588249206543, "logits/rejected": -2.680236339569092, "logps/chosen": -88.89765930175781, "logps/rejected": -96.16397094726562, "loss": 0.6465, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.32727310061454773, "rewards/margins": 0.110235795378685, "rewards/rejected": -0.43750882148742676, "step": 4840 }, { "epoch": 0.8356305995864921, "grad_norm": 5.94487190246582, "learning_rate": 7.226237055348368e-08, "logits/chosen": -2.7761588096618652, "logits/rejected": -2.7590034008026123, "logps/chosen": -85.37464141845703, "logps/rejected": -95.1301498413086, "loss": 0.6502, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3124055862426758, "rewards/margins": 0.10277421772480011, "rewards/rejected": -0.4151798188686371, "step": 4850 }, { "epoch": 0.8373535492763611, "grad_norm": 6.092998504638672, "learning_rate": 7.2127637862558e-08, "logits/chosen": -2.783973217010498, "logits/rejected": -2.7518773078918457, "logps/chosen": -91.34198760986328, "logps/rejected": -90.67448425292969, "loss": 0.6636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3288485109806061, "rewards/margins": 0.07213427126407623, "rewards/rejected": -0.4009827673435211, "step": 4860 }, { "epoch": 0.8390764989662302, "grad_norm": 6.207915782928467, "learning_rate": 7.199270506969934e-08, "logits/chosen": -2.7969024181365967, "logits/rejected": -2.771622657775879, "logps/chosen": -91.00338745117188, "logps/rejected": -94.34010314941406, "loss": 0.6543, "rewards/accuracies": 0.65625, "rewards/chosen": -0.33533793687820435, "rewards/margins": 0.09314209222793579, "rewards/rejected": -0.4284800589084625, "step": 4870 }, { "epoch": 0.8407994486560992, "grad_norm": 5.539631366729736, "learning_rate": 7.185757339511533e-08, "logits/chosen": -2.6904730796813965, "logits/rejected": -2.6575820446014404, "logps/chosen": -84.82585906982422, "logps/rejected": -90.90750885009766, "loss": 0.6566, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3164503872394562, "rewards/margins": 0.08641557395458221, "rewards/rejected": -0.4028659760951996, "step": 4880 }, { "epoch": 0.8425223983459683, "grad_norm": 5.719715595245361, "learning_rate": 7.172224406081215e-08, "logits/chosen": -2.7437620162963867, "logits/rejected": -2.7225348949432373, "logps/chosen": -88.09601593017578, "logps/rejected": -96.74874114990234, "loss": 0.6477, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.32675427198410034, "rewards/margins": 0.1053960919380188, "rewards/rejected": -0.4321504235267639, "step": 4890 }, { "epoch": 0.8442453480358374, "grad_norm": 5.439476490020752, "learning_rate": 7.158671829058332e-08, "logits/chosen": -2.719465732574463, "logits/rejected": -2.6880860328674316, "logps/chosen": -87.46734619140625, "logps/rejected": -92.49640655517578, "loss": 0.6664, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.34264951944351196, "rewards/margins": 0.07987688481807709, "rewards/rejected": -0.42252641916275024, "step": 4900 }, { "epoch": 0.8442453480358374, "eval_logits/chosen": -2.8323721885681152, "eval_logits/rejected": -2.826402425765991, "eval_logps/chosen": -84.92655181884766, "eval_logps/rejected": -94.65947723388672, "eval_loss": 0.6712405681610107, "eval_rewards/accuracies": 0.604786217212677, "eval_rewards/chosen": -0.2621465027332306, "eval_rewards/margins": 0.052647095173597336, "eval_rewards/rejected": -0.3147936165332794, "eval_runtime": 360.6945, "eval_samples_per_second": 11.933, "eval_steps_per_second": 1.492, "step": 4900 }, { "epoch": 0.8459682977257064, "grad_norm": 6.13719367980957, "learning_rate": 7.145099730999888e-08, "logits/chosen": -2.8455302715301514, "logits/rejected": -2.81510591506958, "logps/chosen": -89.68118286132812, "logps/rejected": -94.53526306152344, "loss": 0.6571, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3334195017814636, "rewards/margins": 0.08758939802646637, "rewards/rejected": -0.4210088849067688, "step": 4910 }, { "epoch": 0.8476912474155754, "grad_norm": 5.903871536254883, "learning_rate": 7.131508234639405e-08, "logits/chosen": -2.683255195617676, "logits/rejected": -2.6657886505126953, "logps/chosen": -86.96060180664062, "logps/rejected": -98.50045013427734, "loss": 0.643, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3312074542045593, "rewards/margins": 0.11914181709289551, "rewards/rejected": -0.45034927129745483, "step": 4920 }, { "epoch": 0.8494141971054445, "grad_norm": 5.683137893676758, "learning_rate": 7.117897462885836e-08, "logits/chosen": -2.853965997695923, "logits/rejected": -2.8416576385498047, "logps/chosen": -88.86614990234375, "logps/rejected": -102.5183334350586, "loss": 0.6575, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.35723018646240234, "rewards/margins": 0.08792047947645187, "rewards/rejected": -0.4451506733894348, "step": 4930 }, { "epoch": 0.8511371467953136, "grad_norm": 5.63087272644043, "learning_rate": 7.104267538822435e-08, "logits/chosen": -2.7871501445770264, "logits/rejected": -2.758396625518799, "logps/chosen": -90.38795471191406, "logps/rejected": -97.97694396972656, "loss": 0.6483, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.3446611762046814, "rewards/margins": 0.10284022986888885, "rewards/rejected": -0.44750142097473145, "step": 4940 }, { "epoch": 0.8528600964851827, "grad_norm": 6.1002044677734375, "learning_rate": 7.090618585705657e-08, "logits/chosen": -2.5923728942871094, "logits/rejected": -2.578038215637207, "logps/chosen": -85.99813079833984, "logps/rejected": -94.92589569091797, "loss": 0.6521, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3117091953754425, "rewards/margins": 0.09636653959751129, "rewards/rejected": -0.4080757200717926, "step": 4950 }, { "epoch": 0.8545830461750517, "grad_norm": 5.581972122192383, "learning_rate": 7.076950726964034e-08, "logits/chosen": -2.7105612754821777, "logits/rejected": -2.7103323936462402, "logps/chosen": -84.18190002441406, "logps/rejected": -98.76054382324219, "loss": 0.6481, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3192813992500305, "rewards/margins": 0.10413311421871185, "rewards/rejected": -0.42341452836990356, "step": 4960 }, { "epoch": 0.8563059958649207, "grad_norm": 6.258480072021484, "learning_rate": 7.063264086197066e-08, "logits/chosen": -2.6861400604248047, "logits/rejected": -2.6755423545837402, "logps/chosen": -84.96774291992188, "logps/rejected": -96.87010192871094, "loss": 0.6457, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.33594292402267456, "rewards/margins": 0.1142006665468216, "rewards/rejected": -0.4501435160636902, "step": 4970 }, { "epoch": 0.8580289455547898, "grad_norm": 6.162121295928955, "learning_rate": 7.049558787174099e-08, "logits/chosen": -2.7526276111602783, "logits/rejected": -2.7450337409973145, "logps/chosen": -85.32099914550781, "logps/rejected": -96.36564636230469, "loss": 0.6561, "rewards/accuracies": 0.625, "rewards/chosen": -0.3246348798274994, "rewards/margins": 0.09016862511634827, "rewards/rejected": -0.41480350494384766, "step": 4980 }, { "epoch": 0.8597518952446589, "grad_norm": 5.951625347137451, "learning_rate": 7.035834953833208e-08, "logits/chosen": -2.6703968048095703, "logits/rejected": -2.6620566844940186, "logps/chosen": -85.4616928100586, "logps/rejected": -97.10552978515625, "loss": 0.6581, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3367730677127838, "rewards/margins": 0.08541145920753479, "rewards/rejected": -0.42218446731567383, "step": 4990 }, { "epoch": 0.8614748449345279, "grad_norm": 6.248295783996582, "learning_rate": 7.022092710280073e-08, "logits/chosen": -2.6949782371520996, "logits/rejected": -2.6747851371765137, "logps/chosen": -88.69740295410156, "logps/rejected": -96.07904052734375, "loss": 0.6499, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3289342522621155, "rewards/margins": 0.10480217635631561, "rewards/rejected": -0.4337364137172699, "step": 5000 }, { "epoch": 0.8614748449345279, "eval_logits/chosen": -2.817162275314331, "eval_logits/rejected": -2.8111133575439453, "eval_logps/chosen": -85.77025604248047, "eval_logps/rejected": -95.64825439453125, "eval_loss": 0.6707120537757874, "eval_rewards/accuracies": 0.5954925417900085, "eval_rewards/chosen": -0.2705835700035095, "eval_rewards/margins": 0.054097648710012436, "eval_rewards/rejected": -0.32468119263648987, "eval_runtime": 360.2067, "eval_samples_per_second": 11.949, "eval_steps_per_second": 1.494, "step": 5000 }, { "epoch": 0.8631977946243969, "grad_norm": 7.2535319328308105, "learning_rate": 7.008332180786861e-08, "logits/chosen": -2.6919608116149902, "logits/rejected": -2.6655592918395996, "logps/chosen": -89.97347259521484, "logps/rejected": -97.08216094970703, "loss": 0.6554, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.36922687292099, "rewards/margins": 0.09045498073101044, "rewards/rejected": -0.45968183875083923, "step": 5010 }, { "epoch": 0.864920744314266, "grad_norm": 6.6150126457214355, "learning_rate": 6.994553489791103e-08, "logits/chosen": -2.6872353553771973, "logits/rejected": -2.662476062774658, "logps/chosen": -87.15821075439453, "logps/rejected": -95.46469116210938, "loss": 0.6541, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3365879952907562, "rewards/margins": 0.09809447824954987, "rewards/rejected": -0.4346825182437897, "step": 5020 }, { "epoch": 0.8666436940041351, "grad_norm": 7.26581335067749, "learning_rate": 6.980756761894559e-08, "logits/chosen": -2.6435627937316895, "logits/rejected": -2.6336257457733154, "logps/chosen": -86.87774658203125, "logps/rejected": -96.81118774414062, "loss": 0.6585, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.34260568022727966, "rewards/margins": 0.08591968566179276, "rewards/rejected": -0.4285253882408142, "step": 5030 }, { "epoch": 0.8683666436940042, "grad_norm": 6.305263519287109, "learning_rate": 6.966942121862102e-08, "logits/chosen": -2.8530077934265137, "logits/rejected": -2.8140368461608887, "logps/chosen": -92.75401306152344, "logps/rejected": -93.6357421875, "loss": 0.663, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3559655547142029, "rewards/margins": 0.07592367380857468, "rewards/rejected": -0.43188923597335815, "step": 5040 }, { "epoch": 0.8700895933838731, "grad_norm": 6.878271102905273, "learning_rate": 6.953109694620587e-08, "logits/chosen": -2.733891725540161, "logits/rejected": -2.7108194828033447, "logps/chosen": -90.18342590332031, "logps/rejected": -98.98524475097656, "loss": 0.6534, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.34579095244407654, "rewards/margins": 0.09699489176273346, "rewards/rejected": -0.4427858293056488, "step": 5050 }, { "epoch": 0.8718125430737422, "grad_norm": 5.571224212646484, "learning_rate": 6.939259605257717e-08, "logits/chosen": -2.597719430923462, "logits/rejected": -2.5918784141540527, "logps/chosen": -85.84352111816406, "logps/rejected": -94.81729125976562, "loss": 0.6632, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3537163734436035, "rewards/margins": 0.07410316169261932, "rewards/rejected": -0.42781955003738403, "step": 5060 }, { "epoch": 0.8735354927636113, "grad_norm": 6.979639053344727, "learning_rate": 6.925391979020918e-08, "logits/chosen": -2.6971375942230225, "logits/rejected": -2.67332124710083, "logps/chosen": -88.00436401367188, "logps/rejected": -93.44691467285156, "loss": 0.6519, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.35256052017211914, "rewards/margins": 0.10063686221837997, "rewards/rejected": -0.4531974196434021, "step": 5070 }, { "epoch": 0.8752584424534804, "grad_norm": 5.353731155395508, "learning_rate": 6.911506941316199e-08, "logits/chosen": -2.883151054382324, "logits/rejected": -2.8466620445251465, "logps/chosen": -90.44625091552734, "logps/rejected": -94.98185729980469, "loss": 0.646, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3294186592102051, "rewards/margins": 0.11147884279489517, "rewards/rejected": -0.44089746475219727, "step": 5080 }, { "epoch": 0.8769813921433495, "grad_norm": 6.316625595092773, "learning_rate": 6.89760461770703e-08, "logits/chosen": -2.7551684379577637, "logits/rejected": -2.724618673324585, "logps/chosen": -88.63999938964844, "logps/rejected": -98.74547576904297, "loss": 0.6384, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3434928357601166, "rewards/margins": 0.12895774841308594, "rewards/rejected": -0.4724505543708801, "step": 5090 }, { "epoch": 0.8787043418332184, "grad_norm": 8.141860008239746, "learning_rate": 6.88368513391319e-08, "logits/chosen": -2.5979244709014893, "logits/rejected": -2.5645573139190674, "logps/chosen": -91.8878402709961, "logps/rejected": -96.7591323852539, "loss": 0.6628, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3682095408439636, "rewards/margins": 0.07615621387958527, "rewards/rejected": -0.44436579942703247, "step": 5100 }, { "epoch": 0.8787043418332184, "eval_logits/chosen": -2.8094429969787598, "eval_logits/rejected": -2.8035061359405518, "eval_logps/chosen": -87.14309692382812, "eval_logps/rejected": -97.29234313964844, "eval_loss": 0.6696966886520386, "eval_rewards/accuracies": 0.5968866348266602, "eval_rewards/chosen": -0.28431203961372375, "eval_rewards/margins": 0.05681019648909569, "eval_rewards/rejected": -0.34112221002578735, "eval_runtime": 360.5514, "eval_samples_per_second": 11.937, "eval_steps_per_second": 1.492, "step": 5100 }, { "epoch": 0.8804272915230875, "grad_norm": 7.131643772125244, "learning_rate": 6.869748615809644e-08, "logits/chosen": -2.7321696281433105, "logits/rejected": -2.6981236934661865, "logps/chosen": -95.87647247314453, "logps/rejected": -99.60868835449219, "loss": 0.6653, "rewards/accuracies": 0.625, "rewards/chosen": -0.3800361156463623, "rewards/margins": 0.07765528559684753, "rewards/rejected": -0.45769137144088745, "step": 5110 }, { "epoch": 0.8821502412129566, "grad_norm": 6.144846439361572, "learning_rate": 6.855795189425398e-08, "logits/chosen": -2.7883951663970947, "logits/rejected": -2.7596137523651123, "logps/chosen": -91.65258026123047, "logps/rejected": -94.02896881103516, "loss": 0.6695, "rewards/accuracies": 0.59375, "rewards/chosen": -0.37779197096824646, "rewards/margins": 0.05963408201932907, "rewards/rejected": -0.4374260902404785, "step": 5120 }, { "epoch": 0.8838731909028257, "grad_norm": 6.057769298553467, "learning_rate": 6.841824980942361e-08, "logits/chosen": -2.804337739944458, "logits/rejected": -2.7736172676086426, "logps/chosen": -90.45610046386719, "logps/rejected": -95.85780334472656, "loss": 0.6565, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.368899941444397, "rewards/margins": 0.0899074450135231, "rewards/rejected": -0.4588073790073395, "step": 5130 }, { "epoch": 0.8855961405926946, "grad_norm": 5.70902156829834, "learning_rate": 6.827838116694204e-08, "logits/chosen": -2.7242846488952637, "logits/rejected": -2.6901285648345947, "logps/chosen": -92.74956512451172, "logps/rejected": -96.52131652832031, "loss": 0.6574, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3493127226829529, "rewards/margins": 0.08618960529565811, "rewards/rejected": -0.435502290725708, "step": 5140 }, { "epoch": 0.8873190902825637, "grad_norm": 6.267994403839111, "learning_rate": 6.81383472316522e-08, "logits/chosen": -2.7582485675811768, "logits/rejected": -2.749330997467041, "logps/chosen": -86.73070526123047, "logps/rejected": -98.75907897949219, "loss": 0.6494, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.33600908517837524, "rewards/margins": 0.10284741222858429, "rewards/rejected": -0.4388565123081207, "step": 5150 }, { "epoch": 0.8890420399724328, "grad_norm": 6.955835819244385, "learning_rate": 6.79981492698917e-08, "logits/chosen": -2.7331669330596924, "logits/rejected": -2.698927640914917, "logps/chosen": -86.40743255615234, "logps/rejected": -93.25131225585938, "loss": 0.6508, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.34990599751472473, "rewards/margins": 0.10248558223247528, "rewards/rejected": -0.4523916244506836, "step": 5160 }, { "epoch": 0.8907649896623019, "grad_norm": 6.757615089416504, "learning_rate": 6.785778854948155e-08, "logits/chosen": -2.7200756072998047, "logits/rejected": -2.6970667839050293, "logps/chosen": -87.436767578125, "logps/rejected": -98.19063568115234, "loss": 0.6336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3214018940925598, "rewards/margins": 0.14203134179115295, "rewards/rejected": -0.46343326568603516, "step": 5170 }, { "epoch": 0.892487939352171, "grad_norm": 7.490974426269531, "learning_rate": 6.771726633971452e-08, "logits/chosen": -2.8090243339538574, "logits/rejected": -2.7990384101867676, "logps/chosen": -94.57676696777344, "logps/rejected": -101.28718566894531, "loss": 0.664, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3917374312877655, "rewards/margins": 0.07604822516441345, "rewards/rejected": -0.46778565645217896, "step": 5180 }, { "epoch": 0.8942108890420399, "grad_norm": 7.311649799346924, "learning_rate": 6.757658391134377e-08, "logits/chosen": -2.691488027572632, "logits/rejected": -2.6800453662872314, "logps/chosen": -90.04177856445312, "logps/rejected": -96.65260314941406, "loss": 0.6734, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3720157742500305, "rewards/margins": 0.05560746788978577, "rewards/rejected": -0.4276232123374939, "step": 5190 }, { "epoch": 0.895933838731909, "grad_norm": 5.600946426391602, "learning_rate": 6.743574253657136e-08, "logits/chosen": -2.757327079772949, "logits/rejected": -2.7360892295837402, "logps/chosen": -87.47918701171875, "logps/rejected": -99.45861053466797, "loss": 0.6513, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.35400596261024475, "rewards/margins": 0.09994912892580032, "rewards/rejected": -0.4539550244808197, "step": 5200 }, { "epoch": 0.895933838731909, "eval_logits/chosen": -2.8031022548675537, "eval_logits/rejected": -2.7971508502960205, "eval_logps/chosen": -87.38237762451172, "eval_logps/rejected": -97.62223815917969, "eval_loss": 0.6693427562713623, "eval_rewards/accuracies": 0.5952602028846741, "eval_rewards/chosen": -0.28670480847358704, "eval_rewards/margins": 0.0577162504196167, "eval_rewards/rejected": -0.3444210886955261, "eval_runtime": 360.2553, "eval_samples_per_second": 11.947, "eval_steps_per_second": 1.493, "step": 5200 }, { "epoch": 0.8976567884217781, "grad_norm": 6.237802982330322, "learning_rate": 6.729474348903667e-08, "logits/chosen": -2.713639497756958, "logits/rejected": -2.698694944381714, "logps/chosen": -87.16832733154297, "logps/rejected": -93.84297180175781, "loss": 0.6605, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.35529080033302307, "rewards/margins": 0.083133764564991, "rewards/rejected": -0.43842458724975586, "step": 5210 }, { "epoch": 0.8993797381116472, "grad_norm": 5.577020168304443, "learning_rate": 6.715358804380495e-08, "logits/chosen": -2.7461514472961426, "logits/rejected": -2.733440637588501, "logps/chosen": -86.21382904052734, "logps/rejected": -96.14476776123047, "loss": 0.6684, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3360275626182556, "rewards/margins": 0.06560958921909332, "rewards/rejected": -0.40163716673851013, "step": 5220 }, { "epoch": 0.9011026878015161, "grad_norm": 6.821839809417725, "learning_rate": 6.701227747735576e-08, "logits/chosen": -2.6805355548858643, "logits/rejected": -2.669785976409912, "logps/chosen": -90.50724029541016, "logps/rejected": -104.52873229980469, "loss": 0.6384, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3524440824985504, "rewards/margins": 0.13172096014022827, "rewards/rejected": -0.4841650128364563, "step": 5230 }, { "epoch": 0.9028256374913852, "grad_norm": 6.107116222381592, "learning_rate": 6.687081306757142e-08, "logits/chosen": -2.676337718963623, "logits/rejected": -2.654982328414917, "logps/chosen": -88.69650268554688, "logps/rejected": -105.2641830444336, "loss": 0.6248, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3592351973056793, "rewards/margins": 0.15993066132068634, "rewards/rejected": -0.5191658735275269, "step": 5240 }, { "epoch": 0.9045485871812543, "grad_norm": 5.749131679534912, "learning_rate": 6.67291960937255e-08, "logits/chosen": -2.6306498050689697, "logits/rejected": -2.6170966625213623, "logps/chosen": -87.65843200683594, "logps/rejected": -96.63198852539062, "loss": 0.6574, "rewards/accuracies": 0.625, "rewards/chosen": -0.36680710315704346, "rewards/margins": 0.09071727842092514, "rewards/rejected": -0.45752429962158203, "step": 5250 }, { "epoch": 0.9062715368711234, "grad_norm": 8.217474937438965, "learning_rate": 6.658742783647119e-08, "logits/chosen": -2.7182517051696777, "logits/rejected": -2.6786859035491943, "logps/chosen": -91.67625427246094, "logps/rejected": -96.38301086425781, "loss": 0.6523, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.34890884160995483, "rewards/margins": 0.09981432557106018, "rewards/rejected": -0.4487232267856598, "step": 5260 }, { "epoch": 0.9079944865609925, "grad_norm": 6.370172023773193, "learning_rate": 6.644550957782975e-08, "logits/chosen": -2.811823606491089, "logits/rejected": -2.7978882789611816, "logps/chosen": -88.22168731689453, "logps/rejected": -98.79576110839844, "loss": 0.6468, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.3666745722293854, "rewards/margins": 0.10982397943735123, "rewards/rejected": -0.4764985144138336, "step": 5270 }, { "epoch": 0.9097174362508614, "grad_norm": 6.619154930114746, "learning_rate": 6.630344260117892e-08, "logits/chosen": -2.715485095977783, "logits/rejected": -2.6864123344421387, "logps/chosen": -93.1703872680664, "logps/rejected": -98.26622009277344, "loss": 0.6578, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3880094289779663, "rewards/margins": 0.0879502147436142, "rewards/rejected": -0.4759596884250641, "step": 5280 }, { "epoch": 0.9114403859407305, "grad_norm": 6.778704643249512, "learning_rate": 6.61612281912413e-08, "logits/chosen": -2.6618595123291016, "logits/rejected": -2.6204473972320557, "logps/chosen": -91.83499145507812, "logps/rejected": -97.12114715576172, "loss": 0.6565, "rewards/accuracies": 0.65625, "rewards/chosen": -0.35979264974594116, "rewards/margins": 0.09102566540241241, "rewards/rejected": -0.4508183002471924, "step": 5290 }, { "epoch": 0.9131633356305996, "grad_norm": 6.4996337890625, "learning_rate": 6.601886763407278e-08, "logits/chosen": -2.6700820922851562, "logits/rejected": -2.6571602821350098, "logps/chosen": -90.00111389160156, "logps/rejected": -100.92208099365234, "loss": 0.6475, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.34902939200401306, "rewards/margins": 0.10871565341949463, "rewards/rejected": -0.4577450752258301, "step": 5300 }, { "epoch": 0.9131633356305996, "eval_logits/chosen": -2.7942566871643066, "eval_logits/rejected": -2.7882344722747803, "eval_logps/chosen": -87.72479248046875, "eval_logps/rejected": -98.02127075195312, "eval_loss": 0.6691641211509705, "eval_rewards/accuracies": 0.5987453460693359, "eval_rewards/chosen": -0.2901288866996765, "eval_rewards/margins": 0.05828263610601425, "eval_rewards/rejected": -0.34841153025627136, "eval_runtime": 360.1225, "eval_samples_per_second": 11.951, "eval_steps_per_second": 1.494, "step": 5300 }, { "epoch": 0.9148862853204687, "grad_norm": 5.797268390655518, "learning_rate": 6.587636221705082e-08, "logits/chosen": -2.7919394969940186, "logits/rejected": -2.7814671993255615, "logps/chosen": -96.14761352539062, "logps/rejected": -98.30670166015625, "loss": 0.6632, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3801385760307312, "rewards/margins": 0.07896140217781067, "rewards/rejected": -0.45910006761550903, "step": 5310 }, { "epoch": 0.9166092350103378, "grad_norm": 6.824923038482666, "learning_rate": 6.573371322886288e-08, "logits/chosen": -2.7079017162323, "logits/rejected": -2.6571567058563232, "logps/chosen": -94.98011779785156, "logps/rejected": -98.27288055419922, "loss": 0.648, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.37426260113716125, "rewards/margins": 0.10587947070598602, "rewards/rejected": -0.48014211654663086, "step": 5320 }, { "epoch": 0.9183321847002067, "grad_norm": 5.241591453552246, "learning_rate": 6.559092195949476e-08, "logits/chosen": -2.6528475284576416, "logits/rejected": -2.654944896697998, "logps/chosen": -88.13108825683594, "logps/rejected": -101.67539978027344, "loss": 0.6494, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3727738559246063, "rewards/margins": 0.10972573608160019, "rewards/rejected": -0.4824995994567871, "step": 5330 }, { "epoch": 0.9200551343900758, "grad_norm": 6.645832061767578, "learning_rate": 6.544798970021886e-08, "logits/chosen": -2.6900384426116943, "logits/rejected": -2.6418492794036865, "logps/chosen": -93.05720520019531, "logps/rejected": -97.80923461914062, "loss": 0.6524, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3669526278972626, "rewards/margins": 0.09738953411579132, "rewards/rejected": -0.4643422067165375, "step": 5340 }, { "epoch": 0.9217780840799449, "grad_norm": 6.164369583129883, "learning_rate": 6.530491774358266e-08, "logits/chosen": -2.642723560333252, "logits/rejected": -2.6087820529937744, "logps/chosen": -90.41754913330078, "logps/rejected": -98.78402709960938, "loss": 0.6562, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.377750962972641, "rewards/margins": 0.09037534892559052, "rewards/rejected": -0.4681262969970703, "step": 5350 }, { "epoch": 0.923501033769814, "grad_norm": 7.229325294494629, "learning_rate": 6.516170738339683e-08, "logits/chosen": -2.7518601417541504, "logits/rejected": -2.7438347339630127, "logps/chosen": -88.50611877441406, "logps/rejected": -104.66204833984375, "loss": 0.6399, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.35292014479637146, "rewards/margins": 0.1288849264383316, "rewards/rejected": -0.4818050265312195, "step": 5360 }, { "epoch": 0.9252239834596829, "grad_norm": 7.459567546844482, "learning_rate": 6.501835991472373e-08, "logits/chosen": -2.6610798835754395, "logits/rejected": -2.6450963020324707, "logps/chosen": -87.69316101074219, "logps/rejected": -96.68251037597656, "loss": 0.6652, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.35768431425094604, "rewards/margins": 0.07509956508874893, "rewards/rejected": -0.43278390169143677, "step": 5370 }, { "epoch": 0.926946933149552, "grad_norm": 6.591744899749756, "learning_rate": 6.487487663386553e-08, "logits/chosen": -2.76351261138916, "logits/rejected": -2.726323366165161, "logps/chosen": -93.70985412597656, "logps/rejected": -99.14210510253906, "loss": 0.6465, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3631172776222229, "rewards/margins": 0.11200074106454849, "rewards/rejected": -0.47511807084083557, "step": 5380 }, { "epoch": 0.9286698828394211, "grad_norm": 6.302175521850586, "learning_rate": 6.473125883835259e-08, "logits/chosen": -2.6813721656799316, "logits/rejected": -2.6422839164733887, "logps/chosen": -98.21910858154297, "logps/rejected": -98.19676208496094, "loss": 0.6668, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.39651817083358765, "rewards/margins": 0.06931675970554352, "rewards/rejected": -0.46583491563796997, "step": 5390 }, { "epoch": 0.9303928325292902, "grad_norm": 7.194708824157715, "learning_rate": 6.458750782693171e-08, "logits/chosen": -2.6976680755615234, "logits/rejected": -2.6807875633239746, "logps/chosen": -88.2069091796875, "logps/rejected": -101.64945983886719, "loss": 0.6494, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3710055947303772, "rewards/margins": 0.10657094419002533, "rewards/rejected": -0.4775765538215637, "step": 5400 }, { "epoch": 0.9303928325292902, "eval_logits/chosen": -2.788670778274536, "eval_logits/rejected": -2.782686471939087, "eval_logps/chosen": -88.10902404785156, "eval_logps/rejected": -98.53675079345703, "eval_loss": 0.6686705350875854, "eval_rewards/accuracies": 0.6015334725379944, "eval_rewards/chosen": -0.29397135972976685, "eval_rewards/margins": 0.059594981372356415, "eval_rewards/rejected": -0.35356634855270386, "eval_runtime": 360.3657, "eval_samples_per_second": 11.943, "eval_steps_per_second": 1.493, "step": 5400 }, { "epoch": 0.9321157822191593, "grad_norm": 7.650205612182617, "learning_rate": 6.444362489955433e-08, "logits/chosen": -2.709641933441162, "logits/rejected": -2.6979405879974365, "logps/chosen": -95.5630874633789, "logps/rejected": -100.59437561035156, "loss": 0.6632, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.38923701643943787, "rewards/margins": 0.07583045214414597, "rewards/rejected": -0.46506747603416443, "step": 5410 }, { "epoch": 0.9338387319090282, "grad_norm": 7.145179748535156, "learning_rate": 6.429961135736483e-08, "logits/chosen": -2.7654125690460205, "logits/rejected": -2.7453110218048096, "logps/chosen": -91.00215911865234, "logps/rejected": -99.94932556152344, "loss": 0.6576, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.37283921241760254, "rewards/margins": 0.08913250267505646, "rewards/rejected": -0.4619717001914978, "step": 5420 }, { "epoch": 0.9355616815988973, "grad_norm": 8.034782409667969, "learning_rate": 6.415546850268881e-08, "logits/chosen": -2.699660539627075, "logits/rejected": -2.688311815261841, "logps/chosen": -93.40458679199219, "logps/rejected": -101.03926849365234, "loss": 0.6553, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.39806491136550903, "rewards/margins": 0.0948207750916481, "rewards/rejected": -0.49288567900657654, "step": 5430 }, { "epoch": 0.9372846312887664, "grad_norm": 6.361593723297119, "learning_rate": 6.401119763902118e-08, "logits/chosen": -2.700340747833252, "logits/rejected": -2.6840767860412598, "logps/chosen": -93.75382232666016, "logps/rejected": -105.26869201660156, "loss": 0.6406, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.3755238950252533, "rewards/margins": 0.12471933662891388, "rewards/rejected": -0.5002433061599731, "step": 5440 }, { "epoch": 0.9390075809786355, "grad_norm": 8.197189331054688, "learning_rate": 6.386680007101444e-08, "logits/chosen": -2.6286511421203613, "logits/rejected": -2.618351936340332, "logps/chosen": -95.17610168457031, "logps/rejected": -103.09773254394531, "loss": 0.6528, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3959697186946869, "rewards/margins": 0.1002323180437088, "rewards/rejected": -0.4962020516395569, "step": 5450 }, { "epoch": 0.9407305306685044, "grad_norm": 7.160200595855713, "learning_rate": 6.372227710446696e-08, "logits/chosen": -2.70961594581604, "logits/rejected": -2.6946840286254883, "logps/chosen": -91.92683410644531, "logps/rejected": -97.2591552734375, "loss": 0.6656, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.38083431124687195, "rewards/margins": 0.07164783775806427, "rewards/rejected": -0.4524821639060974, "step": 5460 }, { "epoch": 0.9424534803583735, "grad_norm": 7.070869445800781, "learning_rate": 6.357763004631103e-08, "logits/chosen": -2.66728138923645, "logits/rejected": -2.6487419605255127, "logps/chosen": -90.99283599853516, "logps/rejected": -96.91069030761719, "loss": 0.6711, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.3963419795036316, "rewards/margins": 0.0679207444190979, "rewards/rejected": -0.46426278352737427, "step": 5470 }, { "epoch": 0.9441764300482426, "grad_norm": 5.866538047790527, "learning_rate": 6.343286020460114e-08, "logits/chosen": -2.695925235748291, "logits/rejected": -2.6727564334869385, "logps/chosen": -91.25939178466797, "logps/rejected": -102.32554626464844, "loss": 0.6423, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3831619322299957, "rewards/margins": 0.1236589327454567, "rewards/rejected": -0.506820797920227, "step": 5480 }, { "epoch": 0.9458993797381117, "grad_norm": 7.357505798339844, "learning_rate": 6.328796888850211e-08, "logits/chosen": -2.6660616397857666, "logits/rejected": -2.639159679412842, "logps/chosen": -93.99574279785156, "logps/rejected": -99.41559600830078, "loss": 0.6594, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.38430607318878174, "rewards/margins": 0.08772842586040497, "rewards/rejected": -0.4720345139503479, "step": 5490 }, { "epoch": 0.9476223294279807, "grad_norm": 7.893945217132568, "learning_rate": 6.314295740827728e-08, "logits/chosen": -2.7434821128845215, "logits/rejected": -2.727250099182129, "logps/chosen": -90.81407928466797, "logps/rejected": -104.48640441894531, "loss": 0.6412, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.37003234028816223, "rewards/margins": 0.12817493081092834, "rewards/rejected": -0.4982072412967682, "step": 5500 }, { "epoch": 0.9476223294279807, "eval_logits/chosen": -2.779411792755127, "eval_logits/rejected": -2.773380756378174, "eval_logps/chosen": -88.95333862304688, "eval_logps/rejected": -99.52513122558594, "eval_loss": 0.6681700944900513, "eval_rewards/accuracies": 0.5996747016906738, "eval_rewards/chosen": -0.3024143576622009, "eval_rewards/margins": 0.061035752296447754, "eval_rewards/rejected": -0.36345013976097107, "eval_runtime": 359.9936, "eval_samples_per_second": 11.956, "eval_steps_per_second": 1.494, "step": 5500 }, { "epoch": 0.9493452791178497, "grad_norm": 6.8784027099609375, "learning_rate": 6.299782707527664e-08, "logits/chosen": -2.657125949859619, "logits/rejected": -2.6420211791992188, "logps/chosen": -93.97222900390625, "logps/rejected": -100.09397888183594, "loss": 0.6679, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4074481427669525, "rewards/margins": 0.0661727637052536, "rewards/rejected": -0.47362083196640015, "step": 5510 }, { "epoch": 0.9510682288077188, "grad_norm": 7.177425384521484, "learning_rate": 6.285257920192492e-08, "logits/chosen": -2.6515066623687744, "logits/rejected": -2.627837657928467, "logps/chosen": -92.08159637451172, "logps/rejected": -99.34222412109375, "loss": 0.6451, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.36257150769233704, "rewards/margins": 0.11540888249874115, "rewards/rejected": -0.477980375289917, "step": 5520 }, { "epoch": 0.9527911784975879, "grad_norm": 7.7968549728393555, "learning_rate": 6.270721510170987e-08, "logits/chosen": -2.692481517791748, "logits/rejected": -2.6599838733673096, "logps/chosen": -98.7787857055664, "logps/rejected": -100.84993743896484, "loss": 0.6579, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.38872066140174866, "rewards/margins": 0.08904504776000977, "rewards/rejected": -0.4777657091617584, "step": 5530 }, { "epoch": 0.954514128187457, "grad_norm": 7.713639736175537, "learning_rate": 6.25617360891702e-08, "logits/chosen": -2.675360679626465, "logits/rejected": -2.6800215244293213, "logps/chosen": -90.1320571899414, "logps/rejected": -106.236083984375, "loss": 0.6447, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.37784865498542786, "rewards/margins": 0.1174989715218544, "rewards/rejected": -0.4953475892543793, "step": 5540 }, { "epoch": 0.956237077877326, "grad_norm": 6.391312122344971, "learning_rate": 6.241614347988388e-08, "logits/chosen": -2.556227922439575, "logits/rejected": -2.541020154953003, "logps/chosen": -92.57128143310547, "logps/rejected": -100.6026840209961, "loss": 0.6694, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3995343744754791, "rewards/margins": 0.0657266154885292, "rewards/rejected": -0.46526098251342773, "step": 5550 }, { "epoch": 0.957960027567195, "grad_norm": 7.574127197265625, "learning_rate": 6.227043859045603e-08, "logits/chosen": -2.679574489593506, "logits/rejected": -2.6547327041625977, "logps/chosen": -89.28507995605469, "logps/rejected": -96.2481918334961, "loss": 0.6562, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3739834725856781, "rewards/margins": 0.09359666705131531, "rewards/rejected": -0.4675801396369934, "step": 5560 }, { "epoch": 0.9596829772570641, "grad_norm": 7.61766242980957, "learning_rate": 6.212462273850721e-08, "logits/chosen": -2.7123208045959473, "logits/rejected": -2.7033305168151855, "logps/chosen": -87.42466735839844, "logps/rejected": -104.3492202758789, "loss": 0.6509, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.36758172512054443, "rewards/margins": 0.10654322057962418, "rewards/rejected": -0.4741249680519104, "step": 5570 }, { "epoch": 0.9614059269469332, "grad_norm": 5.746629238128662, "learning_rate": 6.197869724266139e-08, "logits/chosen": -2.5936295986175537, "logits/rejected": -2.5804390907287598, "logps/chosen": -92.78849029541016, "logps/rejected": -103.72259521484375, "loss": 0.6507, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3961091935634613, "rewards/margins": 0.10325653851032257, "rewards/rejected": -0.49936574697494507, "step": 5580 }, { "epoch": 0.9631288766368022, "grad_norm": 9.365887641906738, "learning_rate": 6.183266342253406e-08, "logits/chosen": -2.758596420288086, "logits/rejected": -2.7179176807403564, "logps/chosen": -95.80296325683594, "logps/rejected": -103.4892578125, "loss": 0.6583, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.389718234539032, "rewards/margins": 0.09365256130695343, "rewards/rejected": -0.483370840549469, "step": 5590 }, { "epoch": 0.9648518263266712, "grad_norm": 7.137005805969238, "learning_rate": 6.168652259872033e-08, "logits/chosen": -2.7192625999450684, "logits/rejected": -2.7171120643615723, "logps/chosen": -91.826171875, "logps/rejected": -106.61534118652344, "loss": 0.6531, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3932226002216339, "rewards/margins": 0.09984948486089706, "rewards/rejected": -0.4930720925331116, "step": 5600 }, { "epoch": 0.9648518263266712, "eval_logits/chosen": -2.774271011352539, "eval_logits/rejected": -2.7682688236236572, "eval_logps/chosen": -88.65845489501953, "eval_logps/rejected": -99.27580261230469, "eval_loss": 0.6679658889770508, "eval_rewards/accuracies": 0.6045538783073425, "eval_rewards/chosen": -0.2994656264781952, "eval_rewards/margins": 0.06149120256304741, "eval_rewards/rejected": -0.3609568178653717, "eval_runtime": 360.2429, "eval_samples_per_second": 11.947, "eval_steps_per_second": 1.493, "step": 5600 }, { "epoch": 0.9665747760165403, "grad_norm": 6.414742469787598, "learning_rate": 6.154027609278288e-08, "logits/chosen": -2.728668451309204, "logits/rejected": -2.712214231491089, "logps/chosen": -90.20558166503906, "logps/rejected": -106.26041412353516, "loss": 0.6449, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.37805065512657166, "rewards/margins": 0.11896820366382599, "rewards/rejected": -0.4970189034938812, "step": 5610 }, { "epoch": 0.9682977257064094, "grad_norm": 8.864455223083496, "learning_rate": 6.139392522724017e-08, "logits/chosen": -2.6541590690612793, "logits/rejected": -2.633671998977661, "logps/chosen": -96.14595031738281, "logps/rejected": -100.43621063232422, "loss": 0.6595, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.411262571811676, "rewards/margins": 0.08785083144903183, "rewards/rejected": -0.49911341071128845, "step": 5620 }, { "epoch": 0.9700206753962785, "grad_norm": 6.868548393249512, "learning_rate": 6.12474713255543e-08, "logits/chosen": -2.7163074016571045, "logits/rejected": -2.688546657562256, "logps/chosen": -95.85604858398438, "logps/rejected": -97.7147216796875, "loss": 0.6711, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3994835913181305, "rewards/margins": 0.06611791998147964, "rewards/rejected": -0.46560150384902954, "step": 5630 }, { "epoch": 0.9717436250861475, "grad_norm": 7.378805160522461, "learning_rate": 6.110091571211919e-08, "logits/chosen": -2.6807336807250977, "logits/rejected": -2.6671555042266846, "logps/chosen": -90.3287124633789, "logps/rejected": -102.43013763427734, "loss": 0.6536, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3638276755809784, "rewards/margins": 0.09953577816486359, "rewards/rejected": -0.4633634686470032, "step": 5640 }, { "epoch": 0.9734665747760165, "grad_norm": 6.890263557434082, "learning_rate": 6.095425971224856e-08, "logits/chosen": -2.6029608249664307, "logits/rejected": -2.595309019088745, "logps/chosen": -89.53279876708984, "logps/rejected": -102.14915466308594, "loss": 0.6472, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3752228021621704, "rewards/margins": 0.12423346191644669, "rewards/rejected": -0.4994562566280365, "step": 5650 }, { "epoch": 0.9751895244658856, "grad_norm": 7.2957963943481445, "learning_rate": 6.080750465216388e-08, "logits/chosen": -2.6116573810577393, "logits/rejected": -2.6080095767974854, "logps/chosen": -90.91778564453125, "logps/rejected": -102.00828552246094, "loss": 0.6466, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3905393183231354, "rewards/margins": 0.11953528970479965, "rewards/rejected": -0.5100746154785156, "step": 5660 }, { "epoch": 0.9769124741557547, "grad_norm": 7.507359981536865, "learning_rate": 6.06606518589825e-08, "logits/chosen": -2.648078680038452, "logits/rejected": -2.6287286281585693, "logps/chosen": -89.63655090332031, "logps/rejected": -99.84394836425781, "loss": 0.6459, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3559412956237793, "rewards/margins": 0.11415255069732666, "rewards/rejected": -0.47009381651878357, "step": 5670 }, { "epoch": 0.9786354238456237, "grad_norm": 7.136532306671143, "learning_rate": 6.05137026607055e-08, "logits/chosen": -2.692141532897949, "logits/rejected": -2.654029369354248, "logps/chosen": -94.61227416992188, "logps/rejected": -103.2687759399414, "loss": 0.6459, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.40233784914016724, "rewards/margins": 0.1138642430305481, "rewards/rejected": -0.5162020921707153, "step": 5680 }, { "epoch": 0.9803583735354927, "grad_norm": 7.768203258514404, "learning_rate": 6.036665838620579e-08, "logits/chosen": -2.721024990081787, "logits/rejected": -2.691474676132202, "logps/chosen": -96.23184967041016, "logps/rejected": -101.6405258178711, "loss": 0.6441, "rewards/accuracies": 0.65625, "rewards/chosen": -0.38233327865600586, "rewards/margins": 0.11741151660680771, "rewards/rejected": -0.49974480271339417, "step": 5690 }, { "epoch": 0.9820813232253618, "grad_norm": 8.61497974395752, "learning_rate": 6.021952036521611e-08, "logits/chosen": -2.5390217304229736, "logits/rejected": -2.5234203338623047, "logps/chosen": -97.38616180419922, "logps/rejected": -103.48213958740234, "loss": 0.652, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3960030674934387, "rewards/margins": 0.10574601590633392, "rewards/rejected": -0.5017490983009338, "step": 5700 }, { "epoch": 0.9820813232253618, "eval_logits/chosen": -2.7663583755493164, "eval_logits/rejected": -2.760427951812744, "eval_logps/chosen": -89.92341613769531, "eval_logps/rejected": -100.78007507324219, "eval_loss": 0.6671208739280701, "eval_rewards/accuracies": 0.6040892004966736, "eval_rewards/chosen": -0.3121151626110077, "eval_rewards/margins": 0.06388425081968307, "eval_rewards/rejected": -0.37599942088127136, "eval_runtime": 360.2055, "eval_samples_per_second": 11.949, "eval_steps_per_second": 1.494, "step": 5700 }, { "epoch": 0.9838042729152309, "grad_norm": 7.650971412658691, "learning_rate": 6.007228992831685e-08, "logits/chosen": -2.7118613719940186, "logits/rejected": -2.686074733734131, "logps/chosen": -95.1128921508789, "logps/rejected": -105.84073638916016, "loss": 0.6409, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.399996817111969, "rewards/margins": 0.1325821727514267, "rewards/rejected": -0.5325790643692017, "step": 5710 }, { "epoch": 0.9855272226051, "grad_norm": 7.109969615936279, "learning_rate": 5.992496840692423e-08, "logits/chosen": -2.682396411895752, "logits/rejected": -2.6602180004119873, "logps/chosen": -91.23838806152344, "logps/rejected": -103.3809585571289, "loss": 0.6328, "rewards/accuracies": 0.75, "rewards/chosen": -0.39226388931274414, "rewards/margins": 0.1431894153356552, "rewards/rejected": -0.5354533195495605, "step": 5720 }, { "epoch": 0.987250172294969, "grad_norm": 6.852600574493408, "learning_rate": 5.97775571332781e-08, "logits/chosen": -2.7351083755493164, "logits/rejected": -2.725668430328369, "logps/chosen": -90.90850830078125, "logps/rejected": -101.49790954589844, "loss": 0.6515, "rewards/accuracies": 0.625, "rewards/chosen": -0.3918391466140747, "rewards/margins": 0.10430805385112762, "rewards/rejected": -0.49614715576171875, "step": 5730 }, { "epoch": 0.988973121984838, "grad_norm": 6.903168201446533, "learning_rate": 5.963005744042997e-08, "logits/chosen": -2.6414573192596436, "logits/rejected": -2.61142635345459, "logps/chosen": -102.45942687988281, "logps/rejected": -108.10794830322266, "loss": 0.6517, "rewards/accuracies": 0.65625, "rewards/chosen": -0.432508647441864, "rewards/margins": 0.10413438081741333, "rewards/rejected": -0.5366430878639221, "step": 5740 }, { "epoch": 0.9906960716747071, "grad_norm": 7.210351467132568, "learning_rate": 5.94824706622309e-08, "logits/chosen": -2.7761495113372803, "logits/rejected": -2.7487683296203613, "logps/chosen": -96.80901336669922, "logps/rejected": -99.75078582763672, "loss": 0.6561, "rewards/accuracies": 0.65625, "rewards/chosen": -0.40144777297973633, "rewards/margins": 0.09334973990917206, "rewards/rejected": -0.4947974681854248, "step": 5750 }, { "epoch": 0.9924190213645762, "grad_norm": 7.566880702972412, "learning_rate": 5.933479813331951e-08, "logits/chosen": -2.6539306640625, "logits/rejected": -2.628377914428711, "logps/chosen": -97.50645446777344, "logps/rejected": -107.49908447265625, "loss": 0.6541, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.410966694355011, "rewards/margins": 0.10833420604467392, "rewards/rejected": -0.5193008184432983, "step": 5760 }, { "epoch": 0.9941419710544452, "grad_norm": 7.751824855804443, "learning_rate": 5.918704118910984e-08, "logits/chosen": -2.6133615970611572, "logits/rejected": -2.594430923461914, "logps/chosen": -88.68501281738281, "logps/rejected": -102.99845886230469, "loss": 0.643, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3683035373687744, "rewards/margins": 0.12121020257472992, "rewards/rejected": -0.48951372504234314, "step": 5770 }, { "epoch": 0.9958649207443143, "grad_norm": 7.475341320037842, "learning_rate": 5.903920116577931e-08, "logits/chosen": -2.6427292823791504, "logits/rejected": -2.623394727706909, "logps/chosen": -91.09282684326172, "logps/rejected": -105.16011047363281, "loss": 0.6413, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.38629263639450073, "rewards/margins": 0.1256280243396759, "rewards/rejected": -0.511920690536499, "step": 5780 }, { "epoch": 0.9975878704341833, "grad_norm": 6.766174793243408, "learning_rate": 5.889127940025662e-08, "logits/chosen": -2.583322048187256, "logits/rejected": -2.55000638961792, "logps/chosen": -87.63530731201172, "logps/rejected": -101.72434997558594, "loss": 0.6448, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3616120517253876, "rewards/margins": 0.11629859358072281, "rewards/rejected": -0.4779106080532074, "step": 5790 }, { "epoch": 0.9993108201240524, "grad_norm": 7.446423530578613, "learning_rate": 5.874327723020972e-08, "logits/chosen": -2.668041467666626, "logits/rejected": -2.6409144401550293, "logps/chosen": -89.25969696044922, "logps/rejected": -104.5272445678711, "loss": 0.6355, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.3850882649421692, "rewards/margins": 0.1383116990327835, "rewards/rejected": -0.5233998894691467, "step": 5800 }, { "epoch": 0.9993108201240524, "eval_logits/chosen": -2.754894495010376, "eval_logits/rejected": -2.7488529682159424, "eval_logps/chosen": -91.43659210205078, "eval_logps/rejected": -102.54092407226562, "eval_loss": 0.6662805080413818, "eval_rewards/accuracies": 0.6057156324386597, "eval_rewards/chosen": -0.3272469639778137, "eval_rewards/margins": 0.06636104732751846, "eval_rewards/rejected": -0.3936080038547516, "eval_runtime": 359.9403, "eval_samples_per_second": 11.958, "eval_steps_per_second": 1.495, "step": 5800 }, { "epoch": 1.0010337698139213, "grad_norm": 7.47341775894165, "learning_rate": 5.85951959940336e-08, "logits/chosen": -2.723707675933838, "logits/rejected": -2.708026170730591, "logps/chosen": -94.59834289550781, "logps/rejected": -106.0676040649414, "loss": 0.644, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4052727222442627, "rewards/margins": 0.11942293494939804, "rewards/rejected": -0.524695634841919, "step": 5810 }, { "epoch": 1.0027567195037905, "grad_norm": 6.446918487548828, "learning_rate": 5.8447037030838295e-08, "logits/chosen": -2.7201502323150635, "logits/rejected": -2.6872313022613525, "logps/chosen": -93.95738220214844, "logps/rejected": -105.8100357055664, "loss": 0.6283, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.4093136787414551, "rewards/margins": 0.15548773109912872, "rewards/rejected": -0.5648013949394226, "step": 5820 }, { "epoch": 1.0044796691936595, "grad_norm": 7.902465343475342, "learning_rate": 5.829880168043672e-08, "logits/chosen": -2.610410213470459, "logits/rejected": -2.580620765686035, "logps/chosen": -93.01567077636719, "logps/rejected": -104.25047302246094, "loss": 0.6459, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.39519113302230835, "rewards/margins": 0.11866496503353119, "rewards/rejected": -0.513856053352356, "step": 5830 }, { "epoch": 1.0062026188835287, "grad_norm": 6.444275379180908, "learning_rate": 5.8150491283332556e-08, "logits/chosen": -2.639526128768921, "logits/rejected": -2.6196341514587402, "logps/chosen": -91.62089538574219, "logps/rejected": -108.6521987915039, "loss": 0.6335, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.392328679561615, "rewards/margins": 0.14849011600017548, "rewards/rejected": -0.5408187508583069, "step": 5840 }, { "epoch": 1.0079255685733977, "grad_norm": 7.668493747711182, "learning_rate": 5.800210718070815e-08, "logits/chosen": -2.698667287826538, "logits/rejected": -2.678873300552368, "logps/chosen": -90.6561050415039, "logps/rejected": -110.56428527832031, "loss": 0.6251, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.3974132537841797, "rewards/margins": 0.16324903070926666, "rewards/rejected": -0.5606622695922852, "step": 5850 }, { "epoch": 1.0096485182632666, "grad_norm": 6.586671352386475, "learning_rate": 5.785365071441235e-08, "logits/chosen": -2.565764904022217, "logits/rejected": -2.555816650390625, "logps/chosen": -93.14619445800781, "logps/rejected": -110.57508850097656, "loss": 0.6331, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41372841596603394, "rewards/margins": 0.15966841578483582, "rewards/rejected": -0.5733968019485474, "step": 5860 }, { "epoch": 1.0113714679531358, "grad_norm": 9.994606018066406, "learning_rate": 5.7705123226948425e-08, "logits/chosen": -2.6034083366394043, "logits/rejected": -2.583834648132324, "logps/chosen": -99.65670013427734, "logps/rejected": -112.95194244384766, "loss": 0.635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.43475866317749023, "rewards/margins": 0.14449259638786316, "rewards/rejected": -0.5792513489723206, "step": 5870 }, { "epoch": 1.0130944176430048, "grad_norm": 8.589122772216797, "learning_rate": 5.7556526061461874e-08, "logits/chosen": -2.670046329498291, "logits/rejected": -2.6585898399353027, "logps/chosen": -94.46307373046875, "logps/rejected": -103.35002136230469, "loss": 0.6515, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.39721113443374634, "rewards/margins": 0.10498642921447754, "rewards/rejected": -0.5021975636482239, "step": 5880 }, { "epoch": 1.014817367332874, "grad_norm": 8.38769245147705, "learning_rate": 5.740786056172833e-08, "logits/chosen": -2.6955230236053467, "logits/rejected": -2.681936502456665, "logps/chosen": -96.41594696044922, "logps/rejected": -111.12618255615234, "loss": 0.6465, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.41250571608543396, "rewards/margins": 0.11666438728570938, "rewards/rejected": -0.529170036315918, "step": 5890 }, { "epoch": 1.016540317022743, "grad_norm": 7.229883193969727, "learning_rate": 5.7259128072141324e-08, "logits/chosen": -2.609078884124756, "logits/rejected": -2.5904786586761475, "logps/chosen": -98.62474060058594, "logps/rejected": -111.3572769165039, "loss": 0.6362, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.42958712577819824, "rewards/margins": 0.13729210197925568, "rewards/rejected": -0.5668792128562927, "step": 5900 }, { "epoch": 1.016540317022743, "eval_logits/chosen": -2.7389726638793945, "eval_logits/rejected": -2.7328853607177734, "eval_logps/chosen": -93.74750518798828, "eval_logps/rejected": -105.1657943725586, "eval_loss": 0.6653571724891663, "eval_rewards/accuracies": 0.6043215394020081, "eval_rewards/chosen": -0.3503560423851013, "eval_rewards/margins": 0.06950072199106216, "eval_rewards/rejected": -0.4198567569255829, "eval_runtime": 360.5595, "eval_samples_per_second": 11.937, "eval_steps_per_second": 1.492, "step": 5900 }, { "epoch": 1.018263266712612, "grad_norm": 8.26405143737793, "learning_rate": 5.7110329937700216e-08, "logits/chosen": -2.6024017333984375, "logits/rejected": -2.6053740978240967, "logps/chosen": -94.26959228515625, "logps/rejected": -109.2395248413086, "loss": 0.6541, "rewards/accuracies": 0.625, "rewards/chosen": -0.41757115721702576, "rewards/margins": 0.09765584766864777, "rewards/rejected": -0.5152269601821899, "step": 5910 }, { "epoch": 1.019986216402481, "grad_norm": 7.7567667961120605, "learning_rate": 5.696146750399802e-08, "logits/chosen": -2.6418375968933105, "logits/rejected": -2.6241607666015625, "logps/chosen": -94.14632415771484, "logps/rejected": -106.68177795410156, "loss": 0.6513, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4266262948513031, "rewards/margins": 0.10511897504329681, "rewards/rejected": -0.5317453145980835, "step": 5920 }, { "epoch": 1.02170916609235, "grad_norm": 6.675997734069824, "learning_rate": 5.681254211720915e-08, "logits/chosen": -2.659510374069214, "logits/rejected": -2.6527857780456543, "logps/chosen": -97.44358825683594, "logps/rejected": -108.7308120727539, "loss": 0.6548, "rewards/accuracies": 0.65625, "rewards/chosen": -0.47215375304222107, "rewards/margins": 0.10348248481750488, "rewards/rejected": -0.5756362676620483, "step": 5930 }, { "epoch": 1.0234321157822193, "grad_norm": 7.2116899490356445, "learning_rate": 5.6663555124077354e-08, "logits/chosen": -2.6509711742401123, "logits/rejected": -2.613877296447754, "logps/chosen": -95.33317565917969, "logps/rejected": -108.7905502319336, "loss": 0.6332, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.41096001863479614, "rewards/margins": 0.1462816298007965, "rewards/rejected": -0.5572416186332703, "step": 5940 }, { "epoch": 1.0251550654720882, "grad_norm": 7.672728061676025, "learning_rate": 5.651450787190351e-08, "logits/chosen": -2.529658079147339, "logits/rejected": -2.5256409645080566, "logps/chosen": -95.80914306640625, "logps/rejected": -106.40169525146484, "loss": 0.6514, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.42103952169418335, "rewards/margins": 0.10326598584651947, "rewards/rejected": -0.5243055820465088, "step": 5950 }, { "epoch": 1.0268780151619572, "grad_norm": 7.341794967651367, "learning_rate": 5.6365401708533353e-08, "logits/chosen": -2.589254379272461, "logits/rejected": -2.541999340057373, "logps/chosen": -103.13002014160156, "logps/rejected": -105.22901916503906, "loss": 0.6673, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4480670094490051, "rewards/margins": 0.08621273934841156, "rewards/rejected": -0.5342798233032227, "step": 5960 }, { "epoch": 1.0286009648518264, "grad_norm": 8.059379577636719, "learning_rate": 5.6216237982345426e-08, "logits/chosen": -2.5731124877929688, "logits/rejected": -2.5565097332000732, "logps/chosen": -94.64891052246094, "logps/rejected": -106.91328430175781, "loss": 0.6368, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.42751556634902954, "rewards/margins": 0.14247098565101624, "rewards/rejected": -0.5699866414070129, "step": 5970 }, { "epoch": 1.0303239145416954, "grad_norm": 7.262861251831055, "learning_rate": 5.606701804223879e-08, "logits/chosen": -2.5789666175842285, "logits/rejected": -2.5790278911590576, "logps/chosen": -92.66761016845703, "logps/rejected": -111.3015365600586, "loss": 0.6325, "rewards/accuracies": 0.6875, "rewards/chosen": -0.40097588300704956, "rewards/margins": 0.146892249584198, "rewards/rejected": -0.5478681325912476, "step": 5980 }, { "epoch": 1.0320468642315643, "grad_norm": 7.83336877822876, "learning_rate": 5.5917743237620865e-08, "logits/chosen": -2.6466517448425293, "logits/rejected": -2.613440990447998, "logps/chosen": -97.26774597167969, "logps/rejected": -106.04005432128906, "loss": 0.6457, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.42194968461990356, "rewards/margins": 0.11849778890609741, "rewards/rejected": -0.540447473526001, "step": 5990 }, { "epoch": 1.0337698139214335, "grad_norm": 7.2638068199157715, "learning_rate": 5.576841491839517e-08, "logits/chosen": -2.626448154449463, "logits/rejected": -2.6101455688476562, "logps/chosen": -96.3993911743164, "logps/rejected": -106.6590347290039, "loss": 0.6587, "rewards/accuracies": 0.6875, "rewards/chosen": -0.44295287132263184, "rewards/margins": 0.10090150684118271, "rewards/rejected": -0.5438543558120728, "step": 6000 }, { "epoch": 1.0337698139214335, "eval_logits/chosen": -2.732137680053711, "eval_logits/rejected": -2.7260348796844482, "eval_logps/chosen": -93.24308013916016, "eval_logps/rejected": -104.63257598876953, "eval_loss": 0.6654485464096069, "eval_rewards/accuracies": 0.6075743436813354, "eval_rewards/chosen": -0.3453117907047272, "eval_rewards/margins": 0.06921263039112091, "eval_rewards/rejected": -0.4145244359970093, "eval_runtime": 360.5197, "eval_samples_per_second": 11.938, "eval_steps_per_second": 1.492, "step": 6000 }, { "epoch": 1.0354927636113025, "grad_norm": 8.994678497314453, "learning_rate": 5.561903443494922e-08, "logits/chosen": -2.632159948348999, "logits/rejected": -2.616946220397949, "logps/chosen": -97.70941162109375, "logps/rejected": -104.2746810913086, "loss": 0.6661, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4467419981956482, "rewards/margins": 0.08194559812545776, "rewards/rejected": -0.528687596321106, "step": 6010 }, { "epoch": 1.0372157133011717, "grad_norm": 7.749782562255859, "learning_rate": 5.546960313814221e-08, "logits/chosen": -2.781144380569458, "logits/rejected": -2.7513694763183594, "logps/chosen": -99.9555435180664, "logps/rejected": -105.5430908203125, "loss": 0.6597, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4556824564933777, "rewards/margins": 0.1007528156042099, "rewards/rejected": -0.556435227394104, "step": 6020 }, { "epoch": 1.0389386629910407, "grad_norm": 6.4451680183410645, "learning_rate": 5.532012237929288e-08, "logits/chosen": -2.4611928462982178, "logits/rejected": -2.44938588142395, "logps/chosen": -94.34858703613281, "logps/rejected": -104.7852783203125, "loss": 0.6572, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4451059401035309, "rewards/margins": 0.09958353638648987, "rewards/rejected": -0.5446893572807312, "step": 6030 }, { "epoch": 1.0406616126809096, "grad_norm": 7.477050304412842, "learning_rate": 5.517059351016723e-08, "logits/chosen": -2.642655849456787, "logits/rejected": -2.624969244003296, "logps/chosen": -96.06242370605469, "logps/rejected": -105.86860656738281, "loss": 0.6458, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.4235418438911438, "rewards/margins": 0.11838100850582123, "rewards/rejected": -0.5419228076934814, "step": 6040 }, { "epoch": 1.0423845623707788, "grad_norm": 8.111739158630371, "learning_rate": 5.502101788296634e-08, "logits/chosen": -2.5507383346557617, "logits/rejected": -2.5269408226013184, "logps/chosen": -104.08492279052734, "logps/rejected": -112.29386901855469, "loss": 0.6354, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.44742804765701294, "rewards/margins": 0.13851603865623474, "rewards/rejected": -0.5859440565109253, "step": 6050 }, { "epoch": 1.0441075120606478, "grad_norm": 7.987109661102295, "learning_rate": 5.487139685031413e-08, "logits/chosen": -2.6121532917022705, "logits/rejected": -2.590766429901123, "logps/chosen": -98.3913803100586, "logps/rejected": -106.94645690917969, "loss": 0.6555, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.44896596670150757, "rewards/margins": 0.10085372626781464, "rewards/rejected": -0.5498196482658386, "step": 6060 }, { "epoch": 1.045830461750517, "grad_norm": 7.485147476196289, "learning_rate": 5.4721731765245116e-08, "logits/chosen": -2.6229617595672607, "logits/rejected": -2.601656675338745, "logps/chosen": -96.12904357910156, "logps/rejected": -107.45579528808594, "loss": 0.6375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4218669831752777, "rewards/margins": 0.1354655772447586, "rewards/rejected": -0.5573326349258423, "step": 6070 }, { "epoch": 1.047553411440386, "grad_norm": 7.552558422088623, "learning_rate": 5.4572023981192184e-08, "logits/chosen": -2.5975961685180664, "logits/rejected": -2.579049587249756, "logps/chosen": -102.36415100097656, "logps/rejected": -110.1705322265625, "loss": 0.6395, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4328855872154236, "rewards/margins": 0.13304778933525085, "rewards/rejected": -0.565933346748352, "step": 6080 }, { "epoch": 1.049276361130255, "grad_norm": 7.410242080688477, "learning_rate": 5.442227485197435e-08, "logits/chosen": -2.657362461090088, "logits/rejected": -2.6392769813537598, "logps/chosen": -92.25110626220703, "logps/rejected": -108.11601257324219, "loss": 0.626, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4084300100803375, "rewards/margins": 0.1592157930135727, "rewards/rejected": -0.5676458477973938, "step": 6090 }, { "epoch": 1.050999310820124, "grad_norm": 7.989211082458496, "learning_rate": 5.4272485731784536e-08, "logits/chosen": -2.6268820762634277, "logits/rejected": -2.6230292320251465, "logps/chosen": -101.92762756347656, "logps/rejected": -113.89131164550781, "loss": 0.6337, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.45144420862197876, "rewards/margins": 0.14876195788383484, "rewards/rejected": -0.6002060770988464, "step": 6100 }, { "epoch": 1.050999310820124, "eval_logits/chosen": -2.7237114906311035, "eval_logits/rejected": -2.717660903930664, "eval_logps/chosen": -93.63307189941406, "eval_logps/rejected": -105.14704895019531, "eval_loss": 0.664944052696228, "eval_rewards/accuracies": 0.6078066825866699, "eval_rewards/chosen": -0.34921181201934814, "eval_rewards/margins": 0.07045748829841614, "eval_rewards/rejected": -0.4196692109107971, "eval_runtime": 360.0647, "eval_samples_per_second": 11.953, "eval_steps_per_second": 1.494, "step": 6100 }, { "epoch": 1.052722260509993, "grad_norm": 7.026992321014404, "learning_rate": 5.4122657975177254e-08, "logits/chosen": -2.627516508102417, "logits/rejected": -2.6128859519958496, "logps/chosen": -96.61692810058594, "logps/rejected": -112.77938079833984, "loss": 0.635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.44917988777160645, "rewards/margins": 0.14372874796390533, "rewards/rejected": -0.5929085612297058, "step": 6110 }, { "epoch": 1.0544452101998623, "grad_norm": 7.159167766571045, "learning_rate": 5.397279293705648e-08, "logits/chosen": -2.726543664932251, "logits/rejected": -2.680231809616089, "logps/chosen": -95.74651336669922, "logps/rejected": -113.47621154785156, "loss": 0.6163, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.41424790024757385, "rewards/margins": 0.18573865294456482, "rewards/rejected": -0.5999865531921387, "step": 6120 }, { "epoch": 1.0561681598897312, "grad_norm": 7.718677997589111, "learning_rate": 5.3822891972663266e-08, "logits/chosen": -2.6282782554626465, "logits/rejected": -2.604907989501953, "logps/chosen": -98.44243621826172, "logps/rejected": -112.24897766113281, "loss": 0.6371, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.44050803780555725, "rewards/margins": 0.14252620935440063, "rewards/rejected": -0.5830342769622803, "step": 6130 }, { "epoch": 1.0578911095796002, "grad_norm": 8.668318748474121, "learning_rate": 5.36729564375636e-08, "logits/chosen": -2.653858184814453, "logits/rejected": -2.6416361331939697, "logps/chosen": -97.1240005493164, "logps/rejected": -110.01481628417969, "loss": 0.6425, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.44045019149780273, "rewards/margins": 0.13490334153175354, "rewards/rejected": -0.5753535032272339, "step": 6140 }, { "epoch": 1.0596140592694694, "grad_norm": 8.170172691345215, "learning_rate": 5.352298768763606e-08, "logits/chosen": -2.5736560821533203, "logits/rejected": -2.560098171234131, "logps/chosen": -98.64427185058594, "logps/rejected": -106.1328125, "loss": 0.6553, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.45636358857154846, "rewards/margins": 0.10507573932409286, "rewards/rejected": -0.5614393949508667, "step": 6150 }, { "epoch": 1.0613370089593384, "grad_norm": 8.330756187438965, "learning_rate": 5.33729870790596e-08, "logits/chosen": -2.523110866546631, "logits/rejected": -2.5138514041900635, "logps/chosen": -93.60323333740234, "logps/rejected": -106.94438171386719, "loss": 0.6474, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4262743890285492, "rewards/margins": 0.11767958104610443, "rewards/rejected": -0.5439538955688477, "step": 6160 }, { "epoch": 1.0630599586492075, "grad_norm": 6.593761920928955, "learning_rate": 5.322295596830125e-08, "logits/chosen": -2.647068500518799, "logits/rejected": -2.6324515342712402, "logps/chosen": -108.75308990478516, "logps/rejected": -118.23289489746094, "loss": 0.6441, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4929245412349701, "rewards/margins": 0.12468580156564713, "rewards/rejected": -0.6176103353500366, "step": 6170 }, { "epoch": 1.0647829083390765, "grad_norm": 9.397297859191895, "learning_rate": 5.3072895712103925e-08, "logits/chosen": -2.6558566093444824, "logits/rejected": -2.633418560028076, "logps/chosen": -103.27950286865234, "logps/rejected": -110.69441223144531, "loss": 0.6459, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.4512435793876648, "rewards/margins": 0.12590502202510834, "rewards/rejected": -0.5771486163139343, "step": 6180 }, { "epoch": 1.0665058580289455, "grad_norm": 8.016385078430176, "learning_rate": 5.292280766747408e-08, "logits/chosen": -2.631197690963745, "logits/rejected": -2.607970952987671, "logps/chosen": -103.00118255615234, "logps/rejected": -114.66705322265625, "loss": 0.631, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.4482520520687103, "rewards/margins": 0.16839279234409332, "rewards/rejected": -0.6166448593139648, "step": 6190 }, { "epoch": 1.0682288077188147, "grad_norm": 8.2015962600708, "learning_rate": 5.277269319166944e-08, "logits/chosen": -2.5226476192474365, "logits/rejected": -2.503005027770996, "logps/chosen": -99.58888244628906, "logps/rejected": -114.26560974121094, "loss": 0.6372, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.45074397325515747, "rewards/margins": 0.13954012095928192, "rewards/rejected": -0.5902840495109558, "step": 6200 }, { "epoch": 1.0682288077188147, "eval_logits/chosen": -2.714384078979492, "eval_logits/rejected": -2.7082998752593994, "eval_logps/chosen": -95.46115112304688, "eval_logps/rejected": -107.26507568359375, "eval_loss": 0.6640035510063171, "eval_rewards/accuracies": 0.6089683771133423, "eval_rewards/chosen": -0.36749252676963806, "eval_rewards/margins": 0.07335695624351501, "eval_rewards/rejected": -0.4408494830131531, "eval_runtime": 360.7146, "eval_samples_per_second": 11.932, "eval_steps_per_second": 1.491, "step": 6200 }, { "epoch": 1.0699517574086836, "grad_norm": 8.162054061889648, "learning_rate": 5.2622553642186765e-08, "logits/chosen": -2.62589693069458, "logits/rejected": -2.5983972549438477, "logps/chosen": -102.60786437988281, "logps/rejected": -114.27645111083984, "loss": 0.6398, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.47782596945762634, "rewards/margins": 0.13597813248634338, "rewards/rejected": -0.6138041615486145, "step": 6210 }, { "epoch": 1.0716747070985528, "grad_norm": 7.845183372497559, "learning_rate": 5.24723903767496e-08, "logits/chosen": -2.613713026046753, "logits/rejected": -2.5951294898986816, "logps/chosen": -97.50817108154297, "logps/rejected": -110.98738861083984, "loss": 0.6403, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4536914825439453, "rewards/margins": 0.1307128369808197, "rewards/rejected": -0.5844042897224426, "step": 6220 }, { "epoch": 1.0733976567884218, "grad_norm": 7.181242942810059, "learning_rate": 5.232220475329586e-08, "logits/chosen": -2.747974395751953, "logits/rejected": -2.729741096496582, "logps/chosen": -101.5085220336914, "logps/rejected": -120.6954116821289, "loss": 0.6058, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4267581105232239, "rewards/margins": 0.2146727293729782, "rewards/rejected": -0.6414308547973633, "step": 6230 }, { "epoch": 1.0751206064782908, "grad_norm": 8.449043273925781, "learning_rate": 5.217199812996574e-08, "logits/chosen": -2.564089059829712, "logits/rejected": -2.5353121757507324, "logps/chosen": -102.2982406616211, "logps/rejected": -113.02010345458984, "loss": 0.6421, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.4591143727302551, "rewards/margins": 0.13406124711036682, "rewards/rejected": -0.5931755900382996, "step": 6240 }, { "epoch": 1.07684355616816, "grad_norm": 17.723363876342773, "learning_rate": 5.202177186508929e-08, "logits/chosen": -2.6494898796081543, "logits/rejected": -2.627321720123291, "logps/chosen": -100.54495239257812, "logps/rejected": -107.69929504394531, "loss": 0.6615, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4481717050075531, "rewards/margins": 0.08987609297037125, "rewards/rejected": -0.5380477905273438, "step": 6250 }, { "epoch": 1.078566505858029, "grad_norm": 7.934974193572998, "learning_rate": 5.18715273171742e-08, "logits/chosen": -2.52590012550354, "logits/rejected": -2.506361722946167, "logps/chosen": -103.57615661621094, "logps/rejected": -110.61024475097656, "loss": 0.6537, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.49651750922203064, "rewards/margins": 0.10577519983053207, "rewards/rejected": -0.6022926568984985, "step": 6260 }, { "epoch": 1.080289455547898, "grad_norm": 7.386178493499756, "learning_rate": 5.1721265844893467e-08, "logits/chosen": -2.783388137817383, "logits/rejected": -2.782752513885498, "logps/chosen": -99.8791275024414, "logps/rejected": -114.23184967041016, "loss": 0.6441, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4697844386100769, "rewards/margins": 0.12615224719047546, "rewards/rejected": -0.5959367156028748, "step": 6270 }, { "epoch": 1.082012405237767, "grad_norm": 9.967061996459961, "learning_rate": 5.157098880707318e-08, "logits/chosen": -2.4792990684509277, "logits/rejected": -2.457472562789917, "logps/chosen": -101.3305892944336, "logps/rejected": -111.230224609375, "loss": 0.6487, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4691835343837738, "rewards/margins": 0.11668656766414642, "rewards/rejected": -0.5858700275421143, "step": 6280 }, { "epoch": 1.083735354927636, "grad_norm": 7.863672256469727, "learning_rate": 5.1420697562680136e-08, "logits/chosen": -2.5599868297576904, "logits/rejected": -2.5322113037109375, "logps/chosen": -95.88512420654297, "logps/rejected": -111.76493072509766, "loss": 0.6337, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.44506701827049255, "rewards/margins": 0.1488201916217804, "rewards/rejected": -0.593887209892273, "step": 6290 }, { "epoch": 1.0854583046175053, "grad_norm": 8.78534984588623, "learning_rate": 5.1270393470809636e-08, "logits/chosen": -2.62137508392334, "logits/rejected": -2.596475839614868, "logps/chosen": -103.24137878417969, "logps/rejected": -113.2978515625, "loss": 0.6555, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5080513954162598, "rewards/margins": 0.10447593778371811, "rewards/rejected": -0.6125272512435913, "step": 6300 }, { "epoch": 1.0854583046175053, "eval_logits/chosen": -2.7071239948272705, "eval_logits/rejected": -2.7009353637695312, "eval_logps/chosen": -96.7947998046875, "eval_logps/rejected": -108.81401062011719, "eval_loss": 0.6633469462394714, "eval_rewards/accuracies": 0.6110594868659973, "eval_rewards/chosen": -0.38082900643348694, "eval_rewards/margins": 0.07550989836454391, "eval_rewards/rejected": -0.45633891224861145, "eval_runtime": 360.6678, "eval_samples_per_second": 11.933, "eval_steps_per_second": 1.492, "step": 6300 }, { "epoch": 1.0871812543073742, "grad_norm": 10.046746253967285, "learning_rate": 5.112007789067316e-08, "logits/chosen": -2.604231119155884, "logits/rejected": -2.5762839317321777, "logps/chosen": -96.14618682861328, "logps/rejected": -107.18050384521484, "loss": 0.6572, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.456892728805542, "rewards/margins": 0.09939368069171906, "rewards/rejected": -0.5562864542007446, "step": 6310 }, { "epoch": 1.0889042039972432, "grad_norm": 8.157792091369629, "learning_rate": 5.09697521815861e-08, "logits/chosen": -2.563054323196411, "logits/rejected": -2.5432045459747314, "logps/chosen": -103.923095703125, "logps/rejected": -114.36000061035156, "loss": 0.6489, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4920113682746887, "rewards/margins": 0.11835269629955292, "rewards/rejected": -0.6103640794754028, "step": 6320 }, { "epoch": 1.0906271536871124, "grad_norm": 8.477439880371094, "learning_rate": 5.0819417702955367e-08, "logits/chosen": -2.5750272274017334, "logits/rejected": -2.5546774864196777, "logps/chosen": -98.45436096191406, "logps/rejected": -118.94361877441406, "loss": 0.6105, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.4516433775424957, "rewards/margins": 0.20171110332012177, "rewards/rejected": -0.6533544659614563, "step": 6330 }, { "epoch": 1.0923501033769814, "grad_norm": 7.878482818603516, "learning_rate": 5.066907581426726e-08, "logits/chosen": -2.5957183837890625, "logits/rejected": -2.5751445293426514, "logps/chosen": -101.18866729736328, "logps/rejected": -111.032470703125, "loss": 0.6435, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.456013023853302, "rewards/margins": 0.12570513784885406, "rewards/rejected": -0.5817180871963501, "step": 6340 }, { "epoch": 1.0940730530668505, "grad_norm": 7.387296676635742, "learning_rate": 5.051872787507505e-08, "logits/chosen": -2.560865879058838, "logits/rejected": -2.5533196926116943, "logps/chosen": -101.19855499267578, "logps/rejected": -116.6091079711914, "loss": 0.6344, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.45683735609054565, "rewards/margins": 0.1462439000606537, "rewards/rejected": -0.6030812859535217, "step": 6350 }, { "epoch": 1.0957960027567195, "grad_norm": 8.644234657287598, "learning_rate": 5.036837524498672e-08, "logits/chosen": -2.5833213329315186, "logits/rejected": -2.5798540115356445, "logps/chosen": -97.86479187011719, "logps/rejected": -113.30020904541016, "loss": 0.6367, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4439619481563568, "rewards/margins": 0.14320224523544312, "rewards/rejected": -0.5871641635894775, "step": 6360 }, { "epoch": 1.0975189524465885, "grad_norm": 8.76852798461914, "learning_rate": 5.021801928365269e-08, "logits/chosen": -2.6103196144104004, "logits/rejected": -2.6085126399993896, "logps/chosen": -99.3055191040039, "logps/rejected": -111.53495025634766, "loss": 0.6546, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4616120457649231, "rewards/margins": 0.10931645333766937, "rewards/rejected": -0.5709284543991089, "step": 6370 }, { "epoch": 1.0992419021364577, "grad_norm": 9.487778663635254, "learning_rate": 5.006766135075349e-08, "logits/chosen": -2.643077850341797, "logits/rejected": -2.6314942836761475, "logps/chosen": -101.48836517333984, "logps/rejected": -119.13397216796875, "loss": 0.6386, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.48753833770751953, "rewards/margins": 0.13859817385673523, "rewards/rejected": -0.6261364817619324, "step": 6380 }, { "epoch": 1.1009648518263266, "grad_norm": 7.828840732574463, "learning_rate": 4.991730280598747e-08, "logits/chosen": -2.682213544845581, "logits/rejected": -2.6493566036224365, "logps/chosen": -101.81409454345703, "logps/rejected": -111.9026870727539, "loss": 0.6355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.45730310678482056, "rewards/margins": 0.1517023742198944, "rewards/rejected": -0.6090055108070374, "step": 6390 }, { "epoch": 1.1026878015161956, "grad_norm": 8.809715270996094, "learning_rate": 4.976694500905857e-08, "logits/chosen": -2.61970591545105, "logits/rejected": -2.5968809127807617, "logps/chosen": -103.23225402832031, "logps/rejected": -115.15157318115234, "loss": 0.6406, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.45538854598999023, "rewards/margins": 0.13073064386844635, "rewards/rejected": -0.5861191749572754, "step": 6400 }, { "epoch": 1.1026878015161956, "eval_logits/chosen": -2.700328826904297, "eval_logits/rejected": -2.6940526962280273, "eval_logps/chosen": -97.13938903808594, "eval_logps/rejected": -109.29045104980469, "eval_loss": 0.6628761291503906, "eval_rewards/accuracies": 0.6108271479606628, "eval_rewards/chosen": -0.3842748999595642, "eval_rewards/margins": 0.07682836055755615, "eval_rewards/rejected": -0.461103230714798, "eval_runtime": 360.0734, "eval_samples_per_second": 11.953, "eval_steps_per_second": 1.494, "step": 6400 }, { "epoch": 1.1044107512060648, "grad_norm": 9.525028228759766, "learning_rate": 4.961658931966387e-08, "logits/chosen": -2.704301357269287, "logits/rejected": -2.6803505420684814, "logps/chosen": -102.91910552978516, "logps/rejected": -113.65352630615234, "loss": 0.6471, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4855470657348633, "rewards/margins": 0.1335912048816681, "rewards/rejected": -0.619138240814209, "step": 6410 }, { "epoch": 1.1061337008959338, "grad_norm": 9.287670135498047, "learning_rate": 4.94662370974815e-08, "logits/chosen": -2.645318031311035, "logits/rejected": -2.6280157566070557, "logps/chosen": -103.72320556640625, "logps/rejected": -114.6949691772461, "loss": 0.6492, "rewards/accuracies": 0.59375, "rewards/chosen": -0.48932743072509766, "rewards/margins": 0.11837329715490341, "rewards/rejected": -0.6077008247375488, "step": 6420 }, { "epoch": 1.107856650585803, "grad_norm": 9.23127555847168, "learning_rate": 4.9315889702158156e-08, "logits/chosen": -2.558070182800293, "logits/rejected": -2.5443100929260254, "logps/chosen": -96.07037353515625, "logps/rejected": -101.66596221923828, "loss": 0.666, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.45678821206092834, "rewards/margins": 0.07751981914043427, "rewards/rejected": -0.5343080163002014, "step": 6430 }, { "epoch": 1.109579600275672, "grad_norm": 8.366679191589355, "learning_rate": 4.9165548493296894e-08, "logits/chosen": -2.568415880203247, "logits/rejected": -2.551879405975342, "logps/chosen": -102.79166412353516, "logps/rejected": -113.081787109375, "loss": 0.6582, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.49532589316368103, "rewards/margins": 0.10356497764587402, "rewards/rejected": -0.5988908410072327, "step": 6440 }, { "epoch": 1.111302549965541, "grad_norm": 8.183588981628418, "learning_rate": 4.9015214830444874e-08, "logits/chosen": -2.582038402557373, "logits/rejected": -2.550384759902954, "logps/chosen": -102.41912841796875, "logps/rejected": -114.42729187011719, "loss": 0.6341, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4609449505805969, "rewards/margins": 0.16104629635810852, "rewards/rejected": -0.6219911575317383, "step": 6450 }, { "epoch": 1.11302549965541, "grad_norm": 9.3693208694458, "learning_rate": 4.886489007308094e-08, "logits/chosen": -2.539620876312256, "logits/rejected": -2.5406689643859863, "logps/chosen": -99.10316467285156, "logps/rejected": -126.43525695800781, "loss": 0.6064, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.4722733497619629, "rewards/margins": 0.21284833550453186, "rewards/rejected": -0.6851217150688171, "step": 6460 }, { "epoch": 1.114748449345279, "grad_norm": 8.055909156799316, "learning_rate": 4.8714575580603515e-08, "logits/chosen": -2.51084303855896, "logits/rejected": -2.4892048835754395, "logps/chosen": -98.69096374511719, "logps/rejected": -110.8814926147461, "loss": 0.6468, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4592776298522949, "rewards/margins": 0.13193976879119873, "rewards/rejected": -0.5912173986434937, "step": 6470 }, { "epoch": 1.1164713990351482, "grad_norm": 9.206692695617676, "learning_rate": 4.856427271231805e-08, "logits/chosen": -2.5963027477264404, "logits/rejected": -2.5617220401763916, "logps/chosen": -100.11074829101562, "logps/rejected": -107.81233215332031, "loss": 0.639, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.43541914224624634, "rewards/margins": 0.13812783360481262, "rewards/rejected": -0.5735469460487366, "step": 6480 }, { "epoch": 1.1181943487250172, "grad_norm": 8.638041496276855, "learning_rate": 4.841398282742503e-08, "logits/chosen": -2.7634589672088623, "logits/rejected": -2.751997947692871, "logps/chosen": -103.34661865234375, "logps/rejected": -116.1749038696289, "loss": 0.6466, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.49726349115371704, "rewards/margins": 0.12319080531597137, "rewards/rejected": -0.6204543113708496, "step": 6490 }, { "epoch": 1.1199172984148862, "grad_norm": 9.092048645019531, "learning_rate": 4.8263707285007393e-08, "logits/chosen": -2.607485294342041, "logits/rejected": -2.6008620262145996, "logps/chosen": -104.70477294921875, "logps/rejected": -114.68504333496094, "loss": 0.6445, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.48498040437698364, "rewards/margins": 0.13475123047828674, "rewards/rejected": -0.6197316646575928, "step": 6500 }, { "epoch": 1.1199172984148862, "eval_logits/chosen": -2.6922686100006104, "eval_logits/rejected": -2.686023235321045, "eval_logps/chosen": -97.65068817138672, "eval_logps/rejected": -109.87677001953125, "eval_loss": 0.6626461148262024, "eval_rewards/accuracies": 0.6096654534339905, "eval_rewards/chosen": -0.38938793540000916, "eval_rewards/margins": 0.0775785744190216, "eval_rewards/rejected": -0.46696653962135315, "eval_runtime": 359.8452, "eval_samples_per_second": 11.961, "eval_steps_per_second": 1.495, "step": 6500 }, { "epoch": 1.1216402481047554, "grad_norm": 8.9795560836792, "learning_rate": 4.811344744401849e-08, "logits/chosen": -2.6200554370880127, "logits/rejected": -2.576789617538452, "logps/chosen": -104.20268249511719, "logps/rejected": -114.16617584228516, "loss": 0.6395, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.4741840362548828, "rewards/margins": 0.1373899281024933, "rewards/rejected": -0.6115739941596985, "step": 6510 }, { "epoch": 1.1233631977946243, "grad_norm": 10.132994651794434, "learning_rate": 4.796320466326961e-08, "logits/chosen": -2.5829601287841797, "logits/rejected": -2.55981707572937, "logps/chosen": -98.28492736816406, "logps/rejected": -107.94759368896484, "loss": 0.6482, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.446734756231308, "rewards/margins": 0.11816434562206268, "rewards/rejected": -0.5648991465568542, "step": 6520 }, { "epoch": 1.1250861474844935, "grad_norm": 8.41073226928711, "learning_rate": 4.7812980301417786e-08, "logits/chosen": -2.5432591438293457, "logits/rejected": -2.5165724754333496, "logps/chosen": -100.28233337402344, "logps/rejected": -114.78585052490234, "loss": 0.6264, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4665835499763489, "rewards/margins": 0.16683372855186462, "rewards/rejected": -0.6334173083305359, "step": 6530 }, { "epoch": 1.1268090971743625, "grad_norm": 7.00799036026001, "learning_rate": 4.766277571695348e-08, "logits/chosen": -2.590916633605957, "logits/rejected": -2.5647997856140137, "logps/chosen": -109.76686096191406, "logps/rejected": -120.7196273803711, "loss": 0.6495, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5239251255989075, "rewards/margins": 0.12022165954113007, "rewards/rejected": -0.644146740436554, "step": 6540 }, { "epoch": 1.1285320468642315, "grad_norm": 9.350703239440918, "learning_rate": 4.751259226818835e-08, "logits/chosen": -2.505497455596924, "logits/rejected": -2.483640432357788, "logps/chosen": -99.55348205566406, "logps/rejected": -111.34088134765625, "loss": 0.652, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4800754487514496, "rewards/margins": 0.11636264622211456, "rewards/rejected": -0.5964381098747253, "step": 6550 }, { "epoch": 1.1302549965541007, "grad_norm": 8.498332023620605, "learning_rate": 4.736243131324284e-08, "logits/chosen": -2.617701292037964, "logits/rejected": -2.5795741081237793, "logps/chosen": -102.66838073730469, "logps/rejected": -115.81105041503906, "loss": 0.6255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.46664175391197205, "rewards/margins": 0.17283225059509277, "rewards/rejected": -0.6394740343093872, "step": 6560 }, { "epoch": 1.1319779462439696, "grad_norm": 9.389053344726562, "learning_rate": 4.7212294210034075e-08, "logits/chosen": -2.6099629402160645, "logits/rejected": -2.5814719200134277, "logps/chosen": -101.36234283447266, "logps/rejected": -114.8719711303711, "loss": 0.6357, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.477983295917511, "rewards/margins": 0.14376316964626312, "rewards/rejected": -0.6217464208602905, "step": 6570 }, { "epoch": 1.1337008959338388, "grad_norm": 8.522909164428711, "learning_rate": 4.70621823162634e-08, "logits/chosen": -2.620063304901123, "logits/rejected": -2.5842626094818115, "logps/chosen": -103.08598327636719, "logps/rejected": -108.29972839355469, "loss": 0.6546, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.48057666420936584, "rewards/margins": 0.10240229219198227, "rewards/rejected": -0.5829789042472839, "step": 6580 }, { "epoch": 1.1354238456237078, "grad_norm": 10.206613540649414, "learning_rate": 4.6912096989404264e-08, "logits/chosen": -2.57188081741333, "logits/rejected": -2.5505259037017822, "logps/chosen": -99.1366958618164, "logps/rejected": -112.9898681640625, "loss": 0.6419, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.4629674553871155, "rewards/margins": 0.13212160766124725, "rewards/rejected": -0.5950890779495239, "step": 6590 }, { "epoch": 1.1371467953135768, "grad_norm": 9.817237854003906, "learning_rate": 4.6762039586689795e-08, "logits/chosen": -2.66290020942688, "logits/rejected": -2.647390604019165, "logps/chosen": -102.1463394165039, "logps/rejected": -112.91239929199219, "loss": 0.6438, "rewards/accuracies": 0.625, "rewards/chosen": -0.4651399552822113, "rewards/margins": 0.12810549139976501, "rewards/rejected": -0.5932454466819763, "step": 6600 }, { "epoch": 1.1371467953135768, "eval_logits/chosen": -2.687708854675293, "eval_logits/rejected": -2.6813712120056152, "eval_logps/chosen": -97.78389739990234, "eval_logps/rejected": -110.0129165649414, "eval_loss": 0.6627377271652222, "eval_rewards/accuracies": 0.607342004776001, "eval_rewards/chosen": -0.3907199501991272, "eval_rewards/margins": 0.07760793715715408, "eval_rewards/rejected": -0.4683278203010559, "eval_runtime": 359.7714, "eval_samples_per_second": 11.963, "eval_steps_per_second": 1.495, "step": 6600 }, { "epoch": 1.138869745003446, "grad_norm": 8.432068824768066, "learning_rate": 4.661201146510068e-08, "logits/chosen": -2.617776393890381, "logits/rejected": -2.600355863571167, "logps/chosen": -98.23858642578125, "logps/rejected": -113.22637939453125, "loss": 0.6321, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.46272364258766174, "rewards/margins": 0.15199866890907288, "rewards/rejected": -0.6147223114967346, "step": 6610 }, { "epoch": 1.140592694693315, "grad_norm": 9.28195571899414, "learning_rate": 4.646201398135273e-08, "logits/chosen": -2.5815634727478027, "logits/rejected": -2.560616970062256, "logps/chosen": -104.71697998046875, "logps/rejected": -115.4776382446289, "loss": 0.6548, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.523261308670044, "rewards/margins": 0.10992765426635742, "rewards/rejected": -0.6331889629364014, "step": 6620 }, { "epoch": 1.1423156443831841, "grad_norm": 8.086362838745117, "learning_rate": 4.6312048491884784e-08, "logits/chosen": -2.5562520027160645, "logits/rejected": -2.5318968296051025, "logps/chosen": -100.54441833496094, "logps/rejected": -106.32527923583984, "loss": 0.6566, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.47232919931411743, "rewards/margins": 0.0990000069141388, "rewards/rejected": -0.5713291764259338, "step": 6630 }, { "epoch": 1.144038594073053, "grad_norm": 7.732243061065674, "learning_rate": 4.6162116352846295e-08, "logits/chosen": -2.6114213466644287, "logits/rejected": -2.577406644821167, "logps/chosen": -102.87176513671875, "logps/rejected": -110.48823547363281, "loss": 0.6529, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.49765920639038086, "rewards/margins": 0.11124183237552643, "rewards/rejected": -0.6089010238647461, "step": 6640 }, { "epoch": 1.145761543762922, "grad_norm": 9.960616111755371, "learning_rate": 4.6012218920085124e-08, "logits/chosen": -2.6517696380615234, "logits/rejected": -2.6302433013916016, "logps/chosen": -99.01194763183594, "logps/rejected": -117.2528076171875, "loss": 0.6276, "rewards/accuracies": 0.71875, "rewards/chosen": -0.46898430585861206, "rewards/margins": 0.17000864446163177, "rewards/rejected": -0.638992965221405, "step": 6650 }, { "epoch": 1.1474844934527912, "grad_norm": 7.823163986206055, "learning_rate": 4.586235754913532e-08, "logits/chosen": -2.6741387844085693, "logits/rejected": -2.6565566062927246, "logps/chosen": -98.02713775634766, "logps/rejected": -115.93632507324219, "loss": 0.6311, "rewards/accuracies": 0.65625, "rewards/chosen": -0.46266403794288635, "rewards/margins": 0.1550600379705429, "rewards/rejected": -0.6177240610122681, "step": 6660 }, { "epoch": 1.1492074431426602, "grad_norm": 9.069050788879395, "learning_rate": 4.5712533595204785e-08, "logits/chosen": -2.5663323402404785, "logits/rejected": -2.5679333209991455, "logps/chosen": -100.11381530761719, "logps/rejected": -125.8458480834961, "loss": 0.6122, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5020694732666016, "rewards/margins": 0.20346996188163757, "rewards/rejected": -0.7055394649505615, "step": 6670 }, { "epoch": 1.1509303928325294, "grad_norm": 9.263232231140137, "learning_rate": 4.5562748413163086e-08, "logits/chosen": -2.5756216049194336, "logits/rejected": -2.546689748764038, "logps/chosen": -101.28739166259766, "logps/rejected": -112.49143981933594, "loss": 0.6429, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4746217131614685, "rewards/margins": 0.12963800132274628, "rewards/rejected": -0.604259729385376, "step": 6680 }, { "epoch": 1.1526533425223984, "grad_norm": 7.940311431884766, "learning_rate": 4.5413003357529115e-08, "logits/chosen": -2.5793943405151367, "logits/rejected": -2.559263229370117, "logps/chosen": -101.38301086425781, "logps/rejected": -119.2028579711914, "loss": 0.6141, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4422333240509033, "rewards/margins": 0.19144585728645325, "rewards/rejected": -0.6336792707443237, "step": 6690 }, { "epoch": 1.1543762922122673, "grad_norm": 9.524909973144531, "learning_rate": 4.5263299782459e-08, "logits/chosen": -2.581721544265747, "logits/rejected": -2.5713274478912354, "logps/chosen": -104.95369720458984, "logps/rejected": -118.57572937011719, "loss": 0.6411, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4882546365261078, "rewards/margins": 0.13852766156196594, "rewards/rejected": -0.6267822980880737, "step": 6700 }, { "epoch": 1.1543762922122673, "eval_logits/chosen": -2.6791462898254395, "eval_logits/rejected": -2.672869920730591, "eval_logps/chosen": -98.66951751708984, "eval_logps/rejected": -111.08659362792969, "eval_loss": 0.6621683835983276, "eval_rewards/accuracies": 0.6122211813926697, "eval_rewards/chosen": -0.39957618713378906, "eval_rewards/margins": 0.07948849350214005, "eval_rewards/rejected": -0.4790646433830261, "eval_runtime": 359.782, "eval_samples_per_second": 11.963, "eval_steps_per_second": 1.495, "step": 6700 }, { "epoch": 1.1560992419021365, "grad_norm": 7.029930591583252, "learning_rate": 4.5113639041733654e-08, "logits/chosen": -2.5752780437469482, "logits/rejected": -2.5508055686950684, "logps/chosen": -105.0222396850586, "logps/rejected": -118.96745300292969, "loss": 0.6333, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.48097047209739685, "rewards/margins": 0.17322038114070892, "rewards/rejected": -0.6541908383369446, "step": 6710 }, { "epoch": 1.1578221915920055, "grad_norm": 9.368919372558594, "learning_rate": 4.496402248874671e-08, "logits/chosen": -2.564828634262085, "logits/rejected": -2.550750255584717, "logps/chosen": -97.69122314453125, "logps/rejected": -116.96647644042969, "loss": 0.6179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.46276330947875977, "rewards/margins": 0.19141033291816711, "rewards/rejected": -0.6541736721992493, "step": 6720 }, { "epoch": 1.1595451412818747, "grad_norm": 9.316107749938965, "learning_rate": 4.4814451476492146e-08, "logits/chosen": -2.524674892425537, "logits/rejected": -2.5094120502471924, "logps/chosen": -105.376708984375, "logps/rejected": -116.1399154663086, "loss": 0.6477, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4867584705352783, "rewards/margins": 0.12187260389328003, "rewards/rejected": -0.6086310744285583, "step": 6730 }, { "epoch": 1.1612680909717437, "grad_norm": 10.971795082092285, "learning_rate": 4.466492735755218e-08, "logits/chosen": -2.587489128112793, "logits/rejected": -2.579017400741577, "logps/chosen": -102.76058197021484, "logps/rejected": -120.98915100097656, "loss": 0.627, "rewards/accuracies": 0.625, "rewards/chosen": -0.48327407240867615, "rewards/margins": 0.16570714116096497, "rewards/rejected": -0.6489812731742859, "step": 6740 }, { "epoch": 1.1629910406616126, "grad_norm": 9.019688606262207, "learning_rate": 4.4515451484084875e-08, "logits/chosen": -2.622640609741211, "logits/rejected": -2.608363151550293, "logps/chosen": -104.0699462890625, "logps/rejected": -121.1629409790039, "loss": 0.6334, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5161177515983582, "rewards/margins": 0.15863685309886932, "rewards/rejected": -0.6747546195983887, "step": 6750 }, { "epoch": 1.1647139903514818, "grad_norm": 9.12403678894043, "learning_rate": 4.436602520781213e-08, "logits/chosen": -2.5706498622894287, "logits/rejected": -2.5449025630950928, "logps/chosen": -101.08219146728516, "logps/rejected": -112.35832214355469, "loss": 0.6398, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4902876317501068, "rewards/margins": 0.14592550694942474, "rewards/rejected": -0.6362131834030151, "step": 6760 }, { "epoch": 1.1664369400413508, "grad_norm": 8.566431999206543, "learning_rate": 4.4216649880007214e-08, "logits/chosen": -2.586684465408325, "logits/rejected": -2.579789876937866, "logps/chosen": -100.56122589111328, "logps/rejected": -118.85711669921875, "loss": 0.6268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.46726298332214355, "rewards/margins": 0.1650046408176422, "rewards/rejected": -0.6322677135467529, "step": 6770 }, { "epoch": 1.1681598897312198, "grad_norm": 10.997673988342285, "learning_rate": 4.4067326851482754e-08, "logits/chosen": -2.5771098136901855, "logits/rejected": -2.5475375652313232, "logps/chosen": -108.99159240722656, "logps/rejected": -116.57635498046875, "loss": 0.6568, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5248969197273254, "rewards/margins": 0.1076025515794754, "rewards/rejected": -0.6324995756149292, "step": 6780 }, { "epoch": 1.169882839421089, "grad_norm": 8.650938987731934, "learning_rate": 4.391805747257837e-08, "logits/chosen": -2.5931596755981445, "logits/rejected": -2.577589511871338, "logps/chosen": -98.49267578125, "logps/rejected": -118.1158218383789, "loss": 0.6256, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.47968751192092896, "rewards/margins": 0.17027659714221954, "rewards/rejected": -0.6499640941619873, "step": 6790 }, { "epoch": 1.171605789110958, "grad_norm": 6.909446716308594, "learning_rate": 4.3768843093148576e-08, "logits/chosen": -2.5084850788116455, "logits/rejected": -2.4819085597991943, "logps/chosen": -101.25658416748047, "logps/rejected": -116.91668701171875, "loss": 0.6224, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.4883821904659271, "rewards/margins": 0.17982549965381622, "rewards/rejected": -0.6682077050209045, "step": 6800 }, { "epoch": 1.171605789110958, "eval_logits/chosen": -2.66884446144104, "eval_logits/rejected": -2.662496566772461, "eval_logps/chosen": -100.33702850341797, "eval_logps/rejected": -112.9988021850586, "eval_loss": 0.6614071130752563, "eval_rewards/accuracies": 0.6115241646766663, "eval_rewards/chosen": -0.4162512421607971, "eval_rewards/margins": 0.08193553239107132, "eval_rewards/rejected": -0.4981868267059326, "eval_runtime": 359.807, "eval_samples_per_second": 11.962, "eval_steps_per_second": 1.495, "step": 6800 }, { "epoch": 1.173328738800827, "grad_norm": 8.090224266052246, "learning_rate": 4.361968506255046e-08, "logits/chosen": -2.56339168548584, "logits/rejected": -2.5459580421447754, "logps/chosen": -104.08646392822266, "logps/rejected": -118.2791519165039, "loss": 0.6255, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.48382797837257385, "rewards/margins": 0.18149612843990326, "rewards/rejected": -0.6653240919113159, "step": 6810 }, { "epoch": 1.175051688490696, "grad_norm": 10.138392448425293, "learning_rate": 4.347058472963162e-08, "logits/chosen": -2.6586554050445557, "logits/rejected": -2.630859136581421, "logps/chosen": -106.34110260009766, "logps/rejected": -117.0941390991211, "loss": 0.6443, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5131762027740479, "rewards/margins": 0.1275903284549713, "rewards/rejected": -0.6407665014266968, "step": 6820 }, { "epoch": 1.176774638180565, "grad_norm": 8.95630168914795, "learning_rate": 4.3321543442717796e-08, "logits/chosen": -2.6672542095184326, "logits/rejected": -2.660428285598755, "logps/chosen": -102.14369201660156, "logps/rejected": -114.99869537353516, "loss": 0.6439, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4824022650718689, "rewards/margins": 0.1374589502811432, "rewards/rejected": -0.6198612451553345, "step": 6830 }, { "epoch": 1.1784975878704342, "grad_norm": 8.868681907653809, "learning_rate": 4.3172562549600866e-08, "logits/chosen": -2.621293067932129, "logits/rejected": -2.5976157188415527, "logps/chosen": -108.63401794433594, "logps/rejected": -121.0826644897461, "loss": 0.6456, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5170341730117798, "rewards/margins": 0.1398032307624817, "rewards/rejected": -0.6568374633789062, "step": 6840 }, { "epoch": 1.1802205375603032, "grad_norm": 8.628061294555664, "learning_rate": 4.3023643397526496e-08, "logits/chosen": -2.4805750846862793, "logits/rejected": -2.4513936042785645, "logps/chosen": -103.87449645996094, "logps/rejected": -119.28651428222656, "loss": 0.6333, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.49710139632225037, "rewards/margins": 0.15935452282428741, "rewards/rejected": -0.656455934047699, "step": 6850 }, { "epoch": 1.1819434872501722, "grad_norm": 9.598288536071777, "learning_rate": 4.287478733318204e-08, "logits/chosen": -2.5880210399627686, "logits/rejected": -2.553973913192749, "logps/chosen": -112.11553955078125, "logps/rejected": -120.29103088378906, "loss": 0.6355, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5300083756446838, "rewards/margins": 0.14968501031398773, "rewards/rejected": -0.6796934008598328, "step": 6860 }, { "epoch": 1.1836664369400414, "grad_norm": 9.507317543029785, "learning_rate": 4.272599570268437e-08, "logits/chosen": -2.5000433921813965, "logits/rejected": -2.4891629219055176, "logps/chosen": -104.8589859008789, "logps/rejected": -119.49824523925781, "loss": 0.6399, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5377354621887207, "rewards/margins": 0.13954982161521912, "rewards/rejected": -0.6772853136062622, "step": 6870 }, { "epoch": 1.1853893866299103, "grad_norm": 8.982834815979004, "learning_rate": 4.257726985156763e-08, "logits/chosen": -2.6045875549316406, "logits/rejected": -2.5753495693206787, "logps/chosen": -107.98268127441406, "logps/rejected": -120.47676086425781, "loss": 0.6346, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5371975302696228, "rewards/margins": 0.15595701336860657, "rewards/rejected": -0.6931546330451965, "step": 6880 }, { "epoch": 1.1871123363197795, "grad_norm": 8.530778884887695, "learning_rate": 4.2428611124771177e-08, "logits/chosen": -2.5295612812042236, "logits/rejected": -2.5056262016296387, "logps/chosen": -101.05653381347656, "logps/rejected": -117.50874328613281, "loss": 0.633, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.48746204376220703, "rewards/margins": 0.16373209655284882, "rewards/rejected": -0.6511940956115723, "step": 6890 }, { "epoch": 1.1888352860096485, "grad_norm": 9.275869369506836, "learning_rate": 4.2280020866627286e-08, "logits/chosen": -2.5908186435699463, "logits/rejected": -2.5705745220184326, "logps/chosen": -104.81771087646484, "logps/rejected": -118.31611633300781, "loss": 0.6437, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5126312971115112, "rewards/margins": 0.13616414368152618, "rewards/rejected": -0.648795485496521, "step": 6900 }, { "epoch": 1.1888352860096485, "eval_logits/chosen": -2.6617848873138428, "eval_logits/rejected": -2.655430793762207, "eval_logps/chosen": -101.02922821044922, "eval_logps/rejected": -113.82203674316406, "eval_loss": 0.6610434055328369, "eval_rewards/accuracies": 0.6105948090553284, "eval_rewards/chosen": -0.4231734275817871, "eval_rewards/margins": 0.08324573189020157, "eval_rewards/rejected": -0.5064191222190857, "eval_runtime": 360.3467, "eval_samples_per_second": 11.944, "eval_steps_per_second": 1.493, "step": 6900 }, { "epoch": 1.1905582356995175, "grad_norm": 9.048493385314941, "learning_rate": 4.213150042084914e-08, "logits/chosen": -2.6164145469665527, "logits/rejected": -2.593142032623291, "logps/chosen": -106.26557922363281, "logps/rejected": -123.49296569824219, "loss": 0.6194, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.49379563331604004, "rewards/margins": 0.18906860053539276, "rewards/rejected": -0.6828643083572388, "step": 6910 }, { "epoch": 1.1922811853893867, "grad_norm": 9.733365058898926, "learning_rate": 4.198305113051852e-08, "logits/chosen": -2.6699588298797607, "logits/rejected": -2.6404507160186768, "logps/chosen": -109.824951171875, "logps/rejected": -120.6965560913086, "loss": 0.6499, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.537209689617157, "rewards/margins": 0.13205452263355255, "rewards/rejected": -0.6692641973495483, "step": 6920 }, { "epoch": 1.1940041350792556, "grad_norm": 9.200611114501953, "learning_rate": 4.183467433807385e-08, "logits/chosen": -2.495748519897461, "logits/rejected": -2.4691426753997803, "logps/chosen": -107.3687744140625, "logps/rejected": -122.67182922363281, "loss": 0.6207, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.4973491132259369, "rewards/margins": 0.18685005605220795, "rewards/rejected": -0.6841990947723389, "step": 6930 }, { "epoch": 1.1957270847691248, "grad_norm": 11.490638732910156, "learning_rate": 4.168637138529783e-08, "logits/chosen": -2.492746591567993, "logits/rejected": -2.450110673904419, "logps/chosen": -106.14799499511719, "logps/rejected": -115.3590316772461, "loss": 0.6308, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4818505346775055, "rewards/margins": 0.16582930088043213, "rewards/rejected": -0.64767986536026, "step": 6940 }, { "epoch": 1.1974500344589938, "grad_norm": 8.843438148498535, "learning_rate": 4.153814361330552e-08, "logits/chosen": -2.5467190742492676, "logits/rejected": -2.530759572982788, "logps/chosen": -103.19229888916016, "logps/rejected": -122.7353515625, "loss": 0.6129, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.4933549761772156, "rewards/margins": 0.20461630821228027, "rewards/rejected": -0.6979712247848511, "step": 6950 }, { "epoch": 1.1991729841488628, "grad_norm": 9.876995086669922, "learning_rate": 4.138999236253205e-08, "logits/chosen": -2.7103123664855957, "logits/rejected": -2.674586057662964, "logps/chosen": -109.5539321899414, "logps/rejected": -114.24405670166016, "loss": 0.648, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.542354941368103, "rewards/margins": 0.12227793782949448, "rewards/rejected": -0.6646329164505005, "step": 6960 }, { "epoch": 1.200895933838732, "grad_norm": 12.366546630859375, "learning_rate": 4.1241918972720626e-08, "logits/chosen": -2.6229166984558105, "logits/rejected": -2.5920138359069824, "logps/chosen": -104.89115142822266, "logps/rejected": -114.06379699707031, "loss": 0.644, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5034934282302856, "rewards/margins": 0.13324648141860962, "rewards/rejected": -0.63673996925354, "step": 6970 }, { "epoch": 1.202618883528601, "grad_norm": 8.891714096069336, "learning_rate": 4.1093924782910256e-08, "logits/chosen": -2.532543420791626, "logits/rejected": -2.5042622089385986, "logps/chosen": -105.50868225097656, "logps/rejected": -124.03361511230469, "loss": 0.6174, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5128332376480103, "rewards/margins": 0.19743052124977112, "rewards/rejected": -0.7102638483047485, "step": 6980 }, { "epoch": 1.20434183321847, "grad_norm": 9.686083793640137, "learning_rate": 4.094601113142385e-08, "logits/chosen": -2.4580886363983154, "logits/rejected": -2.447352170944214, "logps/chosen": -110.0320053100586, "logps/rejected": -122.3864517211914, "loss": 0.6381, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5304861068725586, "rewards/margins": 0.15075218677520752, "rewards/rejected": -0.6812382936477661, "step": 6990 }, { "epoch": 1.206064782908339, "grad_norm": 9.574707984924316, "learning_rate": 4.07981793558559e-08, "logits/chosen": -2.7277112007141113, "logits/rejected": -2.700732707977295, "logps/chosen": -114.8912582397461, "logps/rejected": -130.61427307128906, "loss": 0.6268, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5859689712524414, "rewards/margins": 0.17327004671096802, "rewards/rejected": -0.7592389583587646, "step": 7000 }, { "epoch": 1.206064782908339, "eval_logits/chosen": -2.6553220748901367, "eval_logits/rejected": -2.648987054824829, "eval_logps/chosen": -102.90448760986328, "eval_logps/rejected": -115.96162414550781, "eval_loss": 0.660365104675293, "eval_rewards/accuracies": 0.6089683771133423, "eval_rewards/chosen": -0.44192585349082947, "eval_rewards/margins": 0.08588908612728119, "eval_rewards/rejected": -0.5278149247169495, "eval_runtime": 359.8432, "eval_samples_per_second": 11.961, "eval_steps_per_second": 1.495, "step": 7000 }, { "epoch": 1.207787732598208, "grad_norm": 7.930838108062744, "learning_rate": 4.065043079306057e-08, "logits/chosen": -2.558248996734619, "logits/rejected": -2.5334038734436035, "logps/chosen": -110.71012115478516, "logps/rejected": -125.1780014038086, "loss": 0.625, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5428487062454224, "rewards/margins": 0.1803073137998581, "rewards/rejected": -0.7231560349464417, "step": 7010 }, { "epoch": 1.2095106822880772, "grad_norm": 10.088218688964844, "learning_rate": 4.050276677913948e-08, "logits/chosen": -2.539398670196533, "logits/rejected": -2.514713764190674, "logps/chosen": -104.53022766113281, "logps/rejected": -120.55018615722656, "loss": 0.6357, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5319541692733765, "rewards/margins": 0.14996173977851868, "rewards/rejected": -0.6819158792495728, "step": 7020 }, { "epoch": 1.2112336319779462, "grad_norm": 10.294520378112793, "learning_rate": 4.0355188649429677e-08, "logits/chosen": -2.5818893909454346, "logits/rejected": -2.5657174587249756, "logps/chosen": -108.9336929321289, "logps/rejected": -125.43675231933594, "loss": 0.6416, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5583351254463196, "rewards/margins": 0.1448930948972702, "rewards/rejected": -0.7032281756401062, "step": 7030 }, { "epoch": 1.2129565816678154, "grad_norm": 12.805583953857422, "learning_rate": 4.020769773849153e-08, "logits/chosen": -2.526520013809204, "logits/rejected": -2.5062756538391113, "logps/chosen": -110.2097396850586, "logps/rejected": -127.39622497558594, "loss": 0.6268, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5413297414779663, "rewards/margins": 0.16765829920768738, "rewards/rejected": -0.7089880704879761, "step": 7040 }, { "epoch": 1.2146795313576844, "grad_norm": 10.435907363891602, "learning_rate": 4.0060295380096745e-08, "logits/chosen": -2.502941608428955, "logits/rejected": -2.483719825744629, "logps/chosen": -104.99825286865234, "logps/rejected": -122.9035415649414, "loss": 0.6332, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5354247689247131, "rewards/margins": 0.15982839465141296, "rewards/rejected": -0.6952531337738037, "step": 7050 }, { "epoch": 1.2164024810475533, "grad_norm": 10.906390190124512, "learning_rate": 3.991298290721618e-08, "logits/chosen": -2.6106855869293213, "logits/rejected": -2.597623586654663, "logps/chosen": -112.033447265625, "logps/rejected": -123.20802307128906, "loss": 0.6542, "rewards/accuracies": 0.625, "rewards/chosen": -0.5882318019866943, "rewards/margins": 0.11472143977880478, "rewards/rejected": -0.7029532194137573, "step": 7060 }, { "epoch": 1.2181254307374225, "grad_norm": 11.68134880065918, "learning_rate": 3.976576165200784e-08, "logits/chosen": -2.562283992767334, "logits/rejected": -2.5388481616973877, "logps/chosen": -113.50862121582031, "logps/rejected": -120.29753112792969, "loss": 0.6744, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.5890558958053589, "rewards/margins": 0.08144049346446991, "rewards/rejected": -0.67049640417099, "step": 7070 }, { "epoch": 1.2198483804272915, "grad_norm": 8.486589431762695, "learning_rate": 3.961863294580492e-08, "logits/chosen": -2.6284613609313965, "logits/rejected": -2.600008487701416, "logps/chosen": -107.27657318115234, "logps/rejected": -123.54029846191406, "loss": 0.631, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5380843281745911, "rewards/margins": 0.1576683223247528, "rewards/rejected": -0.6957526803016663, "step": 7080 }, { "epoch": 1.2215713301171607, "grad_norm": 8.582545280456543, "learning_rate": 3.94715981191036e-08, "logits/chosen": -2.6594293117523193, "logits/rejected": -2.648655891418457, "logps/chosen": -110.38819885253906, "logps/rejected": -125.69108581542969, "loss": 0.631, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5687033534049988, "rewards/margins": 0.16610802710056305, "rewards/rejected": -0.7348113059997559, "step": 7090 }, { "epoch": 1.2232942798070296, "grad_norm": 8.429627418518066, "learning_rate": 3.932465850155117e-08, "logits/chosen": -2.528796434402466, "logits/rejected": -2.508687973022461, "logps/chosen": -110.3732681274414, "logps/rejected": -126.6390151977539, "loss": 0.6303, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5329998135566711, "rewards/margins": 0.16490605473518372, "rewards/rejected": -0.6979058980941772, "step": 7100 }, { "epoch": 1.2232942798070296, "eval_logits/chosen": -2.6505863666534424, "eval_logits/rejected": -2.6443142890930176, "eval_logps/chosen": -102.50411987304688, "eval_logps/rejected": -115.5604019165039, "eval_loss": 0.6603997349739075, "eval_rewards/accuracies": 0.6129181981086731, "eval_rewards/chosen": -0.4379221498966217, "eval_rewards/margins": 0.08588062226772308, "eval_rewards/rejected": -0.5238028168678284, "eval_runtime": 360.333, "eval_samples_per_second": 11.945, "eval_steps_per_second": 1.493, "step": 7100 }, { "epoch": 1.2250172294968986, "grad_norm": 9.6629056930542, "learning_rate": 3.9177815421933884e-08, "logits/chosen": -2.572434663772583, "logits/rejected": -2.5384533405303955, "logps/chosen": -108.16790771484375, "logps/rejected": -114.19828796386719, "loss": 0.6664, "rewards/accuracies": 0.625, "rewards/chosen": -0.5469506978988647, "rewards/margins": 0.08520477265119553, "rewards/rejected": -0.6321554183959961, "step": 7110 }, { "epoch": 1.2267401791867678, "grad_norm": 10.517024040222168, "learning_rate": 3.903107020816504e-08, "logits/chosen": -2.561241626739502, "logits/rejected": -2.52276873588562, "logps/chosen": -116.26021575927734, "logps/rejected": -118.6120834350586, "loss": 0.6523, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5427398085594177, "rewards/margins": 0.11778402328491211, "rewards/rejected": -0.6605237722396851, "step": 7120 }, { "epoch": 1.2284631288766368, "grad_norm": 9.681543350219727, "learning_rate": 3.8884424187272866e-08, "logits/chosen": -2.455671787261963, "logits/rejected": -2.4319849014282227, "logps/chosen": -106.87947082519531, "logps/rejected": -119.38846588134766, "loss": 0.6352, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5053822994232178, "rewards/margins": 0.15671773254871368, "rewards/rejected": -0.662100076675415, "step": 7130 }, { "epoch": 1.230186078566506, "grad_norm": 8.6825590133667, "learning_rate": 3.873787868538866e-08, "logits/chosen": -2.5538275241851807, "logits/rejected": -2.548265218734741, "logps/chosen": -104.10013580322266, "logps/rejected": -119.7091293334961, "loss": 0.6397, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5278245806694031, "rewards/margins": 0.14982108771800995, "rewards/rejected": -0.6776455640792847, "step": 7140 }, { "epoch": 1.231909028256375, "grad_norm": 10.221883773803711, "learning_rate": 3.8591435027734646e-08, "logits/chosen": -2.628955364227295, "logits/rejected": -2.6071934700012207, "logps/chosen": -117.13688659667969, "logps/rejected": -128.465087890625, "loss": 0.6497, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6074936985969543, "rewards/margins": 0.12625807523727417, "rewards/rejected": -0.7337517738342285, "step": 7150 }, { "epoch": 1.233631977946244, "grad_norm": 9.913213729858398, "learning_rate": 3.844509453861214e-08, "logits/chosen": -2.545468807220459, "logits/rejected": -2.511420965194702, "logps/chosen": -106.26422119140625, "logps/rejected": -124.04301452636719, "loss": 0.6263, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5260246396064758, "rewards/margins": 0.17362895607948303, "rewards/rejected": -0.6996536254882812, "step": 7160 }, { "epoch": 1.235354927636113, "grad_norm": 10.441494941711426, "learning_rate": 3.829885854138941e-08, "logits/chosen": -2.511805772781372, "logits/rejected": -2.4859747886657715, "logps/chosen": -110.22599029541016, "logps/rejected": -122.40694427490234, "loss": 0.6471, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5465968251228333, "rewards/margins": 0.13938425481319427, "rewards/rejected": -0.6859810948371887, "step": 7170 }, { "epoch": 1.237077877325982, "grad_norm": 8.994463920593262, "learning_rate": 3.815272835848987e-08, "logits/chosen": -2.636838436126709, "logits/rejected": -2.6227402687072754, "logps/chosen": -106.834228515625, "logps/rejected": -122.57475280761719, "loss": 0.6284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5153202414512634, "rewards/margins": 0.16570481657981873, "rewards/rejected": -0.6810250282287598, "step": 7180 }, { "epoch": 1.2388008270158513, "grad_norm": 8.581153869628906, "learning_rate": 3.8006705311379985e-08, "logits/chosen": -2.488649845123291, "logits/rejected": -2.4648635387420654, "logps/chosen": -105.9777603149414, "logps/rejected": -123.8721923828125, "loss": 0.6289, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5330722332000732, "rewards/margins": 0.1694321185350418, "rewards/rejected": -0.7025043368339539, "step": 7190 }, { "epoch": 1.2405237767057202, "grad_norm": 10.977418899536133, "learning_rate": 3.7860790720557445e-08, "logits/chosen": -2.5012497901916504, "logits/rejected": -2.4833712577819824, "logps/chosen": -108.4955825805664, "logps/rejected": -130.1787872314453, "loss": 0.6251, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5539822578430176, "rewards/margins": 0.18606498837471008, "rewards/rejected": -0.74004727602005, "step": 7200 }, { "epoch": 1.2405237767057202, "eval_logits/chosen": -2.6447579860687256, "eval_logits/rejected": -2.6383352279663086, "eval_logps/chosen": -103.08140563964844, "eval_logps/rejected": -116.27261352539062, "eval_loss": 0.659978449344635, "eval_rewards/accuracies": 0.6101301312446594, "eval_rewards/chosen": -0.4436950981616974, "eval_rewards/margins": 0.08722980320453644, "eval_rewards/rejected": -0.530924916267395, "eval_runtime": 359.9641, "eval_samples_per_second": 11.957, "eval_steps_per_second": 1.495, "step": 7200 }, { "epoch": 1.2422467263955892, "grad_norm": 10.23493766784668, "learning_rate": 3.77149859055391e-08, "logits/chosen": -2.6112637519836426, "logits/rejected": -2.5924153327941895, "logps/chosen": -115.63868713378906, "logps/rejected": -121.07205963134766, "loss": 0.6621, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5860998630523682, "rewards/margins": 0.09432484954595566, "rewards/rejected": -0.680424690246582, "step": 7210 }, { "epoch": 1.2439696760854584, "grad_norm": 9.584635734558105, "learning_rate": 3.756929218484914e-08, "logits/chosen": -2.4883735179901123, "logits/rejected": -2.4554367065429688, "logps/chosen": -110.43601989746094, "logps/rejected": -120.67839050292969, "loss": 0.6509, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.559060275554657, "rewards/margins": 0.13107997179031372, "rewards/rejected": -0.6901403069496155, "step": 7220 }, { "epoch": 1.2456926257753274, "grad_norm": 12.276067733764648, "learning_rate": 3.7423710876007084e-08, "logits/chosen": -2.5233511924743652, "logits/rejected": -2.5052595138549805, "logps/chosen": -111.4328384399414, "logps/rejected": -119.4492416381836, "loss": 0.6636, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5721146464347839, "rewards/margins": 0.09257705509662628, "rewards/rejected": -0.664691686630249, "step": 7230 }, { "epoch": 1.2474155754651963, "grad_norm": 10.505390167236328, "learning_rate": 3.727824329551595e-08, "logits/chosen": -2.5546631813049316, "logits/rejected": -2.543801784515381, "logps/chosen": -112.68693542480469, "logps/rejected": -124.9740982055664, "loss": 0.6382, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5524017214775085, "rewards/margins": 0.14962348341941833, "rewards/rejected": -0.7020251154899597, "step": 7240 }, { "epoch": 1.2491385251550655, "grad_norm": 9.379290580749512, "learning_rate": 3.713289075885023e-08, "logits/chosen": -2.602900743484497, "logits/rejected": -2.574934959411621, "logps/chosen": -111.77998352050781, "logps/rejected": -121.20411682128906, "loss": 0.6556, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5721232295036316, "rewards/margins": 0.11137218773365021, "rewards/rejected": -0.6834954023361206, "step": 7250 }, { "epoch": 1.2508614748449345, "grad_norm": 9.753902435302734, "learning_rate": 3.698765458044414e-08, "logits/chosen": -2.591021776199341, "logits/rejected": -2.5614984035491943, "logps/chosen": -108.41435241699219, "logps/rejected": -123.48286437988281, "loss": 0.6298, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5472859144210815, "rewards/margins": 0.16381797194480896, "rewards/rejected": -0.7111038565635681, "step": 7260 }, { "epoch": 1.2525844245348035, "grad_norm": 9.254847526550293, "learning_rate": 3.6842536073679596e-08, "logits/chosen": -2.6056957244873047, "logits/rejected": -2.577763080596924, "logps/chosen": -106.95680236816406, "logps/rejected": -119.10015869140625, "loss": 0.6407, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5264113545417786, "rewards/margins": 0.13823921978473663, "rewards/rejected": -0.6646506190299988, "step": 7270 }, { "epoch": 1.2543073742246726, "grad_norm": 9.7710542678833, "learning_rate": 3.669753655087442e-08, "logits/chosen": -2.639329433441162, "logits/rejected": -2.6067185401916504, "logps/chosen": -107.37776184082031, "logps/rejected": -121.13716888427734, "loss": 0.6253, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5080801844596863, "rewards/margins": 0.17418815195560455, "rewards/rejected": -0.6822682619094849, "step": 7280 }, { "epoch": 1.2560303239145416, "grad_norm": 9.96068286895752, "learning_rate": 3.65526573232705e-08, "logits/chosen": -2.443098545074463, "logits/rejected": -2.4309229850769043, "logps/chosen": -109.87693786621094, "logps/rejected": -120.99009704589844, "loss": 0.6564, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5508560538291931, "rewards/margins": 0.10041414201259613, "rewards/rejected": -0.6512702107429504, "step": 7290 }, { "epoch": 1.2577532736044108, "grad_norm": 10.196106910705566, "learning_rate": 3.6407899701021807e-08, "logits/chosen": -2.3989205360412598, "logits/rejected": -2.3831562995910645, "logps/chosen": -101.35286712646484, "logps/rejected": -114.70906066894531, "loss": 0.6531, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5109211206436157, "rewards/margins": 0.11609502881765366, "rewards/rejected": -0.6270160675048828, "step": 7300 }, { "epoch": 1.2577532736044108, "eval_logits/chosen": -2.642998218536377, "eval_logits/rejected": -2.636584997177124, "eval_logps/chosen": -102.09992218017578, "eval_logps/rejected": -115.19979095458984, "eval_loss": 0.6601964235305786, "eval_rewards/accuracies": 0.6124535202980042, "eval_rewards/chosen": -0.4338802695274353, "eval_rewards/margins": 0.08631633222103119, "eval_rewards/rejected": -0.5201966166496277, "eval_runtime": 359.6659, "eval_samples_per_second": 11.967, "eval_steps_per_second": 1.496, "step": 7300 }, { "epoch": 1.2594762232942798, "grad_norm": 9.084516525268555, "learning_rate": 3.6263264993182695e-08, "logits/chosen": -2.5876553058624268, "logits/rejected": -2.564537286758423, "logps/chosen": -109.34537506103516, "logps/rejected": -122.04302978515625, "loss": 0.6359, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5428635478019714, "rewards/margins": 0.15870937705039978, "rewards/rejected": -0.7015729546546936, "step": 7310 }, { "epoch": 1.2611991729841487, "grad_norm": 8.735158920288086, "learning_rate": 3.6118754507695946e-08, "logits/chosen": -2.5970218181610107, "logits/rejected": -2.574091672897339, "logps/chosen": -106.69927978515625, "logps/rejected": -115.67286682128906, "loss": 0.6509, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5234571695327759, "rewards/margins": 0.12171218544244766, "rewards/rejected": -0.6451693773269653, "step": 7320 }, { "epoch": 1.262922122674018, "grad_norm": 9.100055694580078, "learning_rate": 3.597436955138102e-08, "logits/chosen": -2.496898651123047, "logits/rejected": -2.4709010124206543, "logps/chosen": -103.43009948730469, "logps/rejected": -122.0030288696289, "loss": 0.6194, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5036752223968506, "rewards/margins": 0.20063956081867218, "rewards/rejected": -0.7043147683143616, "step": 7330 }, { "epoch": 1.264645072363887, "grad_norm": 9.007047653198242, "learning_rate": 3.583011142992218e-08, "logits/chosen": -2.5679678916931152, "logits/rejected": -2.543445348739624, "logps/chosen": -109.4923095703125, "logps/rejected": -120.74739074707031, "loss": 0.6527, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5411123633384705, "rewards/margins": 0.12720635533332825, "rewards/rejected": -0.6683186888694763, "step": 7340 }, { "epoch": 1.266368022053756, "grad_norm": 9.416090965270996, "learning_rate": 3.568598144785675e-08, "logits/chosen": -2.6188342571258545, "logits/rejected": -2.5921006202697754, "logps/chosen": -108.1654052734375, "logps/rejected": -119.23826599121094, "loss": 0.633, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5124603509902954, "rewards/margins": 0.16223213076591492, "rewards/rejected": -0.6746925115585327, "step": 7350 }, { "epoch": 1.268090971743625, "grad_norm": 11.118308067321777, "learning_rate": 3.5541980908563216e-08, "logits/chosen": -2.5278282165527344, "logits/rejected": -2.509023666381836, "logps/chosen": -108.90681457519531, "logps/rejected": -120.905517578125, "loss": 0.6411, "rewards/accuracies": 0.625, "rewards/chosen": -0.546089231967926, "rewards/margins": 0.14111468195915222, "rewards/rejected": -0.6872037649154663, "step": 7360 }, { "epoch": 1.269813921433494, "grad_norm": 9.836128234863281, "learning_rate": 3.539811111424959e-08, "logits/chosen": -2.440730571746826, "logits/rejected": -2.4208149909973145, "logps/chosen": -104.86732482910156, "logps/rejected": -113.34352111816406, "loss": 0.6599, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5206080675125122, "rewards/margins": 0.10041671991348267, "rewards/rejected": -0.6210247278213501, "step": 7370 }, { "epoch": 1.2715368711233632, "grad_norm": 9.010052680969238, "learning_rate": 3.525437336594145e-08, "logits/chosen": -2.530202865600586, "logits/rejected": -2.5104517936706543, "logps/chosen": -105.98480224609375, "logps/rejected": -123.57877349853516, "loss": 0.627, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5281702280044556, "rewards/margins": 0.17074742913246155, "rewards/rejected": -0.6989176869392395, "step": 7380 }, { "epoch": 1.2732598208132322, "grad_norm": 10.061474800109863, "learning_rate": 3.511076896347036e-08, "logits/chosen": -2.501349687576294, "logits/rejected": -2.4825596809387207, "logps/chosen": -104.70169830322266, "logps/rejected": -120.3403549194336, "loss": 0.6387, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5248795747756958, "rewards/margins": 0.1483328640460968, "rewards/rejected": -0.6732124090194702, "step": 7390 }, { "epoch": 1.2749827705031014, "grad_norm": 8.739763259887695, "learning_rate": 3.4967299205461974e-08, "logits/chosen": -2.5027923583984375, "logits/rejected": -2.4664530754089355, "logps/chosen": -111.92269134521484, "logps/rejected": -119.95565032958984, "loss": 0.6456, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5647163391113281, "rewards/margins": 0.12913253903388977, "rewards/rejected": -0.6938489079475403, "step": 7400 }, { "epoch": 1.2749827705031014, "eval_logits/chosen": -2.6409223079681396, "eval_logits/rejected": -2.634524345397949, "eval_logps/chosen": -101.84141540527344, "eval_logps/rejected": -114.98131561279297, "eval_loss": 0.6600046753883362, "eval_rewards/accuracies": 0.6124535202980042, "eval_rewards/chosen": -0.4312951862812042, "eval_rewards/margins": 0.08671677857637405, "eval_rewards/rejected": -0.5180119276046753, "eval_runtime": 360.4764, "eval_samples_per_second": 11.94, "eval_steps_per_second": 1.492, "step": 7400 }, { "epoch": 1.2767057201929704, "grad_norm": 10.62580394744873, "learning_rate": 3.482396538932438e-08, "logits/chosen": -2.479602336883545, "logits/rejected": -2.4505069255828857, "logps/chosen": -105.51661682128906, "logps/rejected": -118.50335693359375, "loss": 0.6429, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5176024436950684, "rewards/margins": 0.137213796377182, "rewards/rejected": -0.6548162698745728, "step": 7410 }, { "epoch": 1.2784286698828393, "grad_norm": 9.140203475952148, "learning_rate": 3.4680768811236266e-08, "logits/chosen": -2.4199485778808594, "logits/rejected": -2.3975508213043213, "logps/chosen": -105.53422546386719, "logps/rejected": -115.3234634399414, "loss": 0.6455, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4936627447605133, "rewards/margins": 0.1288955807685852, "rewards/rejected": -0.6225583553314209, "step": 7420 }, { "epoch": 1.2801516195727085, "grad_norm": 11.094929695129395, "learning_rate": 3.4537710766135366e-08, "logits/chosen": -2.470757246017456, "logits/rejected": -2.44372296333313, "logps/chosen": -109.74659729003906, "logps/rejected": -123.33604431152344, "loss": 0.6375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5634235739707947, "rewards/margins": 0.15116463601589203, "rewards/rejected": -0.7145882248878479, "step": 7430 }, { "epoch": 1.2818745692625775, "grad_norm": 9.100417137145996, "learning_rate": 3.439479254770655e-08, "logits/chosen": -2.5642802715301514, "logits/rejected": -2.536123752593994, "logps/chosen": -114.7548828125, "logps/rejected": -126.5836181640625, "loss": 0.6416, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5875005722045898, "rewards/margins": 0.14469286799430847, "rewards/rejected": -0.7321933507919312, "step": 7440 }, { "epoch": 1.2835975189524467, "grad_norm": 9.80208969116211, "learning_rate": 3.425201544837033e-08, "logits/chosen": -2.6381442546844482, "logits/rejected": -2.6124885082244873, "logps/chosen": -109.50288391113281, "logps/rejected": -119.41923522949219, "loss": 0.6478, "rewards/accuracies": 0.6875, "rewards/chosen": -0.535770833492279, "rewards/margins": 0.13235479593276978, "rewards/rejected": -0.6681256294250488, "step": 7450 }, { "epoch": 1.2853204686423156, "grad_norm": 9.576980590820312, "learning_rate": 3.410938075927096e-08, "logits/chosen": -2.4544360637664795, "logits/rejected": -2.457080841064453, "logps/chosen": -101.24821472167969, "logps/rejected": -119.45255279541016, "loss": 0.6353, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5108596682548523, "rewards/margins": 0.15377506613731384, "rewards/rejected": -0.6646348237991333, "step": 7460 }, { "epoch": 1.2870434183321846, "grad_norm": 10.5313720703125, "learning_rate": 3.396688977026494e-08, "logits/chosen": -2.5422723293304443, "logits/rejected": -2.5268795490264893, "logps/chosen": -106.96131896972656, "logps/rejected": -115.691650390625, "loss": 0.6525, "rewards/accuracies": 0.625, "rewards/chosen": -0.5230668783187866, "rewards/margins": 0.12071572244167328, "rewards/rejected": -0.6437825560569763, "step": 7470 }, { "epoch": 1.2887663680220538, "grad_norm": 9.148719787597656, "learning_rate": 3.382454376990922e-08, "logits/chosen": -2.500944137573242, "logits/rejected": -2.475217580795288, "logps/chosen": -105.0721206665039, "logps/rejected": -112.06779479980469, "loss": 0.6535, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5043980479240417, "rewards/margins": 0.12430262565612793, "rewards/rejected": -0.6287007331848145, "step": 7480 }, { "epoch": 1.2904893177119228, "grad_norm": 9.421043395996094, "learning_rate": 3.36823440454497e-08, "logits/chosen": -2.4685792922973633, "logits/rejected": -2.4556145668029785, "logps/chosen": -106.70912170410156, "logps/rejected": -123.2634048461914, "loss": 0.629, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5414863228797913, "rewards/margins": 0.15967078506946564, "rewards/rejected": -0.7011570334434509, "step": 7490 }, { "epoch": 1.292212267401792, "grad_norm": 8.091127395629883, "learning_rate": 3.3540291882809394e-08, "logits/chosen": -2.4892683029174805, "logits/rejected": -2.465965509414673, "logps/chosen": -104.45188903808594, "logps/rejected": -118.73978424072266, "loss": 0.6455, "rewards/accuracies": 0.625, "rewards/chosen": -0.5297764539718628, "rewards/margins": 0.1378394067287445, "rewards/rejected": -0.6676157712936401, "step": 7500 }, { "epoch": 1.292212267401792, "eval_logits/chosen": -2.635746955871582, "eval_logits/rejected": -2.629236936569214, "eval_logps/chosen": -101.78617095947266, "eval_logps/rejected": -114.98066711425781, "eval_loss": 0.6597474813461304, "eval_rewards/accuracies": 0.6147769689559937, "eval_rewards/chosen": -0.4307427406311035, "eval_rewards/margins": 0.0872626081109047, "eval_rewards/rejected": -0.51800537109375, "eval_runtime": 359.6591, "eval_samples_per_second": 11.967, "eval_steps_per_second": 1.496, "step": 7500 }, { "epoch": 1.293935217091661, "grad_norm": 10.538162231445312, "learning_rate": 3.339838856657694e-08, "logits/chosen": -2.5177297592163086, "logits/rejected": -2.4895987510681152, "logps/chosen": -113.8686752319336, "logps/rejected": -116.67778015136719, "loss": 0.6629, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5660760998725891, "rewards/margins": 0.08849167078733444, "rewards/rejected": -0.6545677185058594, "step": 7510 }, { "epoch": 1.29565816678153, "grad_norm": 10.612116813659668, "learning_rate": 3.3256635379995e-08, "logits/chosen": -2.4990150928497314, "logits/rejected": -2.479405641555786, "logps/chosen": -103.64764404296875, "logps/rejected": -122.71256256103516, "loss": 0.6233, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5071987509727478, "rewards/margins": 0.18135200440883636, "rewards/rejected": -0.6885508298873901, "step": 7520 }, { "epoch": 1.297381116471399, "grad_norm": 9.325216293334961, "learning_rate": 3.311503360494849e-08, "logits/chosen": -2.437972068786621, "logits/rejected": -2.434144973754883, "logps/chosen": -103.4576187133789, "logps/rejected": -120.95352935791016, "loss": 0.6315, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5124028921127319, "rewards/margins": 0.16277670860290527, "rewards/rejected": -0.6751796007156372, "step": 7530 }, { "epoch": 1.299104066161268, "grad_norm": 8.740460395812988, "learning_rate": 3.297358452195324e-08, "logits/chosen": -2.5994045734405518, "logits/rejected": -2.592708110809326, "logps/chosen": -99.10334014892578, "logps/rejected": -118.64372253417969, "loss": 0.6199, "rewards/accuracies": 0.6875, "rewards/chosen": -0.47562798857688904, "rewards/margins": 0.18405228853225708, "rewards/rejected": -0.6596802473068237, "step": 7540 }, { "epoch": 1.3008270158511372, "grad_norm": 8.941034317016602, "learning_rate": 3.283228941014414e-08, "logits/chosen": -2.51711106300354, "logits/rejected": -2.498652935028076, "logps/chosen": -102.238525390625, "logps/rejected": -120.69661712646484, "loss": 0.6221, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5044776797294617, "rewards/margins": 0.18435628712177277, "rewards/rejected": -0.688834011554718, "step": 7550 }, { "epoch": 1.3025499655410062, "grad_norm": 9.605469703674316, "learning_rate": 3.2691149547263794e-08, "logits/chosen": -2.6023619174957275, "logits/rejected": -2.582151412963867, "logps/chosen": -99.77006530761719, "logps/rejected": -122.7721939086914, "loss": 0.6113, "rewards/accuracies": 0.75, "rewards/chosen": -0.4719509482383728, "rewards/margins": 0.20214197039604187, "rewards/rejected": -0.6740928888320923, "step": 7560 }, { "epoch": 1.3042729152308752, "grad_norm": 10.741072654724121, "learning_rate": 3.255016620965082e-08, "logits/chosen": -2.588242292404175, "logits/rejected": -2.5711989402770996, "logps/chosen": -111.654052734375, "logps/rejected": -124.45565032958984, "loss": 0.6266, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5485700368881226, "rewards/margins": 0.17345201969146729, "rewards/rejected": -0.7220219969749451, "step": 7570 }, { "epoch": 1.3059958649207444, "grad_norm": 10.163570404052734, "learning_rate": 3.240934067222845e-08, "logits/chosen": -2.539912223815918, "logits/rejected": -2.5225236415863037, "logps/chosen": -103.22077941894531, "logps/rejected": -120.7096939086914, "loss": 0.6187, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.47207051515579224, "rewards/margins": 0.1884404718875885, "rewards/rejected": -0.6605108976364136, "step": 7580 }, { "epoch": 1.3077188146106133, "grad_norm": 9.670416831970215, "learning_rate": 3.226867420849279e-08, "logits/chosen": -2.5681509971618652, "logits/rejected": -2.526566505432129, "logps/chosen": -115.42330169677734, "logps/rejected": -125.2770004272461, "loss": 0.6329, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5710044503211975, "rewards/margins": 0.1676451563835144, "rewards/rejected": -0.7386496067047119, "step": 7590 }, { "epoch": 1.3094417643004825, "grad_norm": 11.190610885620117, "learning_rate": 3.2128168090501575e-08, "logits/chosen": -2.5398783683776855, "logits/rejected": -2.513364791870117, "logps/chosen": -111.20823669433594, "logps/rejected": -114.00135803222656, "loss": 0.6762, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.5867103934288025, "rewards/margins": 0.06733911484479904, "rewards/rejected": -0.6540495157241821, "step": 7600 }, { "epoch": 1.3094417643004825, "eval_logits/chosen": -2.62813401222229, "eval_logits/rejected": -2.621640682220459, "eval_logps/chosen": -102.6287612915039, "eval_logps/rejected": -115.96487426757812, "eval_loss": 0.6593355536460876, "eval_rewards/accuracies": 0.6117565035820007, "eval_rewards/chosen": -0.4391685724258423, "eval_rewards/margins": 0.08867882192134857, "eval_rewards/rejected": -0.5278474688529968, "eval_runtime": 359.5884, "eval_samples_per_second": 11.969, "eval_steps_per_second": 1.496, "step": 7600 }, { "epoch": 1.3111647139903515, "grad_norm": 7.892940998077393, "learning_rate": 3.1987823588862395e-08, "logits/chosen": -2.494873285293579, "logits/rejected": -2.4665284156799316, "logps/chosen": -105.79533386230469, "logps/rejected": -118.2354965209961, "loss": 0.6389, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5288399457931519, "rewards/margins": 0.1512880027294159, "rewards/rejected": -0.6801279783248901, "step": 7610 }, { "epoch": 1.3128876636802205, "grad_norm": 9.36888599395752, "learning_rate": 3.18476419727214e-08, "logits/chosen": -2.5487782955169678, "logits/rejected": -2.52319073677063, "logps/chosen": -107.84880065917969, "logps/rejected": -116.66322326660156, "loss": 0.6477, "rewards/accuracies": 0.625, "rewards/chosen": -0.5033544301986694, "rewards/margins": 0.1253475844860077, "rewards/rejected": -0.6287019848823547, "step": 7620 }, { "epoch": 1.3146106133700897, "grad_norm": 11.694605827331543, "learning_rate": 3.1707624509751754e-08, "logits/chosen": -2.480271339416504, "logits/rejected": -2.4887309074401855, "logps/chosen": -99.38700866699219, "logps/rejected": -120.13069152832031, "loss": 0.6284, "rewards/accuracies": 0.71875, "rewards/chosen": -0.495540052652359, "rewards/margins": 0.16540366411209106, "rewards/rejected": -0.6609436869621277, "step": 7630 }, { "epoch": 1.3163335630599586, "grad_norm": 13.094701766967773, "learning_rate": 3.156777246614215e-08, "logits/chosen": -2.44416880607605, "logits/rejected": -2.4228973388671875, "logps/chosen": -111.28560638427734, "logps/rejected": -125.09645080566406, "loss": 0.6179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5336799025535583, "rewards/margins": 0.19319351017475128, "rewards/rejected": -0.7268733978271484, "step": 7640 }, { "epoch": 1.3180565127498278, "grad_norm": 8.995244026184082, "learning_rate": 3.1428087106585365e-08, "logits/chosen": -2.565865993499756, "logits/rejected": -2.5419039726257324, "logps/chosen": -116.0489273071289, "logps/rejected": -128.86428833007812, "loss": 0.6364, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5765261650085449, "rewards/margins": 0.15430589020252228, "rewards/rejected": -0.730832040309906, "step": 7650 }, { "epoch": 1.3197794624396968, "grad_norm": 10.541084289550781, "learning_rate": 3.12885696942669e-08, "logits/chosen": -2.5416741371154785, "logits/rejected": -2.5064024925231934, "logps/chosen": -111.7835922241211, "logps/rejected": -125.22023010253906, "loss": 0.635, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5475324392318726, "rewards/margins": 0.1569490134716034, "rewards/rejected": -0.7044814825057983, "step": 7660 }, { "epoch": 1.3215024121295658, "grad_norm": 12.941953659057617, "learning_rate": 3.114922149085341e-08, "logits/chosen": -2.492443799972534, "logits/rejected": -2.467200994491577, "logps/chosen": -110.19084167480469, "logps/rejected": -119.7401123046875, "loss": 0.6406, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.526465117931366, "rewards/margins": 0.14155793190002441, "rewards/rejected": -0.6680231690406799, "step": 7670 }, { "epoch": 1.323225361819435, "grad_norm": 8.414935111999512, "learning_rate": 3.101004375648146e-08, "logits/chosen": -2.5556654930114746, "logits/rejected": -2.5384323596954346, "logps/chosen": -103.7856674194336, "logps/rejected": -119.1416244506836, "loss": 0.6285, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5314931869506836, "rewards/margins": 0.1628451645374298, "rewards/rejected": -0.6943383812904358, "step": 7680 }, { "epoch": 1.324948311509304, "grad_norm": 8.739034652709961, "learning_rate": 3.087103774974602e-08, "logits/chosen": -2.64532732963562, "logits/rejected": -2.6009573936462402, "logps/chosen": -101.80657958984375, "logps/rejected": -114.46602630615234, "loss": 0.6212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4682212471961975, "rewards/margins": 0.18588753044605255, "rewards/rejected": -0.6541087031364441, "step": 7690 }, { "epoch": 1.3266712611991731, "grad_norm": 8.63172435760498, "learning_rate": 3.07322047276891e-08, "logits/chosen": -2.512674570083618, "logits/rejected": -2.48405385017395, "logps/chosen": -105.63853454589844, "logps/rejected": -120.13700866699219, "loss": 0.6365, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5192811489105225, "rewards/margins": 0.14506670832633972, "rewards/rejected": -0.6643478274345398, "step": 7700 }, { "epoch": 1.3266712611991731, "eval_logits/chosen": -2.6237244606018066, "eval_logits/rejected": -2.617231845855713, "eval_logps/chosen": -102.73433685302734, "eval_logps/rejected": -116.12883758544922, "eval_loss": 0.659196138381958, "eval_rewards/accuracies": 0.6157063245773315, "eval_rewards/chosen": -0.44022446870803833, "eval_rewards/margins": 0.08926267176866531, "eval_rewards/rejected": -0.529487133026123, "eval_runtime": 359.8583, "eval_samples_per_second": 11.96, "eval_steps_per_second": 1.495, "step": 7700 }, { "epoch": 1.328394210889042, "grad_norm": 9.414764404296875, "learning_rate": 3.0593545945788426e-08, "logits/chosen": -2.5537688732147217, "logits/rejected": -2.5392956733703613, "logps/chosen": -111.46722412109375, "logps/rejected": -122.5143814086914, "loss": 0.6532, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5989468693733215, "rewards/margins": 0.11693338304758072, "rewards/rejected": -0.7158802151679993, "step": 7710 }, { "epoch": 1.330117160578911, "grad_norm": 10.093575477600098, "learning_rate": 3.045506265794599e-08, "logits/chosen": -2.5286622047424316, "logits/rejected": -2.496185064315796, "logps/chosen": -113.056884765625, "logps/rejected": -122.25138092041016, "loss": 0.6378, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5691434144973755, "rewards/margins": 0.14525368809700012, "rewards/rejected": -0.714397132396698, "step": 7720 }, { "epoch": 1.33184011026878, "grad_norm": 9.999787330627441, "learning_rate": 3.0316756116476885e-08, "logits/chosen": -2.489351749420166, "logits/rejected": -2.4682085514068604, "logps/chosen": -105.78633880615234, "logps/rejected": -120.046875, "loss": 0.6301, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.518639862537384, "rewards/margins": 0.15913304686546326, "rewards/rejected": -0.6777728796005249, "step": 7730 }, { "epoch": 1.3335630599586492, "grad_norm": 9.254657745361328, "learning_rate": 3.017862757209777e-08, "logits/chosen": -2.481689453125, "logits/rejected": -2.4662909507751465, "logps/chosen": -104.68952941894531, "logps/rejected": -126.27779388427734, "loss": 0.6176, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5298460721969604, "rewards/margins": 0.19512736797332764, "rewards/rejected": -0.7249733805656433, "step": 7740 }, { "epoch": 1.3352860096485184, "grad_norm": 9.561819076538086, "learning_rate": 3.004067827391575e-08, "logits/chosen": -2.4999566078186035, "logits/rejected": -2.4698214530944824, "logps/chosen": -110.5082778930664, "logps/rejected": -127.4419174194336, "loss": 0.6253, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5656547546386719, "rewards/margins": 0.18680861592292786, "rewards/rejected": -0.7524634599685669, "step": 7750 }, { "epoch": 1.3370089593383874, "grad_norm": 8.614842414855957, "learning_rate": 2.990290946941691e-08, "logits/chosen": -2.4035215377807617, "logits/rejected": -2.3857004642486572, "logps/chosen": -103.11820220947266, "logps/rejected": -124.1723861694336, "loss": 0.6166, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.526114821434021, "rewards/margins": 0.19871024787425995, "rewards/rejected": -0.7248250246047974, "step": 7760 }, { "epoch": 1.3387319090282563, "grad_norm": 9.305093765258789, "learning_rate": 2.9765322404455194e-08, "logits/chosen": -2.4598212242126465, "logits/rejected": -2.4356131553649902, "logps/chosen": -104.67939758300781, "logps/rejected": -120.8833236694336, "loss": 0.6307, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5256319642066956, "rewards/margins": 0.15807506442070007, "rewards/rejected": -0.683707058429718, "step": 7770 }, { "epoch": 1.3404548587181253, "grad_norm": 10.675363540649414, "learning_rate": 2.9627918323241004e-08, "logits/chosen": -2.6021933555603027, "logits/rejected": -2.58845853805542, "logps/chosen": -109.0901107788086, "logps/rejected": -123.86561584472656, "loss": 0.6522, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5755112767219543, "rewards/margins": 0.12428200244903564, "rewards/rejected": -0.69979327917099, "step": 7780 }, { "epoch": 1.3421778084079945, "grad_norm": 11.858165740966797, "learning_rate": 2.9490698468330034e-08, "logits/chosen": -2.536799192428589, "logits/rejected": -2.523071765899658, "logps/chosen": -106.40971374511719, "logps/rejected": -124.10882568359375, "loss": 0.6297, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5483931303024292, "rewards/margins": 0.17372551560401917, "rewards/rejected": -0.7221185564994812, "step": 7790 }, { "epoch": 1.3439007580978635, "grad_norm": 9.725826263427734, "learning_rate": 2.9353664080611968e-08, "logits/chosen": -2.563347339630127, "logits/rejected": -2.557152032852173, "logps/chosen": -110.21385192871094, "logps/rejected": -129.93624877929688, "loss": 0.6211, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.562333881855011, "rewards/margins": 0.18971523642539978, "rewards/rejected": -0.7520490884780884, "step": 7800 }, { "epoch": 1.3439007580978635, "eval_logits/chosen": -2.6180360317230225, "eval_logits/rejected": -2.6114888191223145, "eval_logps/chosen": -103.54806518554688, "eval_logps/rejected": -117.0740737915039, "eval_loss": 0.658841073513031, "eval_rewards/accuracies": 0.6194238066673279, "eval_rewards/chosen": -0.4483616352081299, "eval_rewards/margins": 0.0905778631567955, "eval_rewards/rejected": -0.5389395952224731, "eval_runtime": 359.6595, "eval_samples_per_second": 11.967, "eval_steps_per_second": 1.496, "step": 7800 }, { "epoch": 1.3456237077877327, "grad_norm": 10.2208890914917, "learning_rate": 2.9216816399299372e-08, "logits/chosen": -2.583174228668213, "logits/rejected": -2.555354595184326, "logps/chosen": -111.4613265991211, "logps/rejected": -122.44720458984375, "loss": 0.6357, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5244373083114624, "rewards/margins": 0.15247417986392975, "rewards/rejected": -0.6769115328788757, "step": 7810 }, { "epoch": 1.3473466574776016, "grad_norm": 7.568568706512451, "learning_rate": 2.908015666191633e-08, "logits/chosen": -2.5538055896759033, "logits/rejected": -2.5238800048828125, "logps/chosen": -105.6512680053711, "logps/rejected": -123.7880630493164, "loss": 0.6232, "rewards/accuracies": 0.6875, "rewards/chosen": -0.49285778403282166, "rewards/margins": 0.19189631938934326, "rewards/rejected": -0.6847540736198425, "step": 7820 }, { "epoch": 1.3490696071674706, "grad_norm": 10.060046195983887, "learning_rate": 2.894368610428739e-08, "logits/chosen": -2.4695115089416504, "logits/rejected": -2.458768367767334, "logps/chosen": -105.97291564941406, "logps/rejected": -125.56675720214844, "loss": 0.6339, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5617285966873169, "rewards/margins": 0.15965981781482697, "rewards/rejected": -0.7213884592056274, "step": 7830 }, { "epoch": 1.3507925568573398, "grad_norm": 8.617898941040039, "learning_rate": 2.8807405960526297e-08, "logits/chosen": -2.4485971927642822, "logits/rejected": -2.427280902862549, "logps/chosen": -108.01090240478516, "logps/rejected": -126.32137298583984, "loss": 0.6278, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5444341897964478, "rewards/margins": 0.1764529049396515, "rewards/rejected": -0.7208870649337769, "step": 7840 }, { "epoch": 1.3525155065472088, "grad_norm": 10.258645057678223, "learning_rate": 2.8671317463024904e-08, "logits/chosen": -2.4429070949554443, "logits/rejected": -2.4116177558898926, "logps/chosen": -110.6179428100586, "logps/rejected": -117.56571197509766, "loss": 0.6424, "rewards/accuracies": 0.625, "rewards/chosen": -0.5504086017608643, "rewards/margins": 0.14148317277431488, "rewards/rejected": -0.6918917894363403, "step": 7850 }, { "epoch": 1.354238456237078, "grad_norm": 8.191166877746582, "learning_rate": 2.8535421842441948e-08, "logits/chosen": -2.4752354621887207, "logits/rejected": -2.4408156871795654, "logps/chosen": -107.2601089477539, "logps/rejected": -127.4552993774414, "loss": 0.6149, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5436367392539978, "rewards/margins": 0.2043880671262741, "rewards/rejected": -0.7480248212814331, "step": 7860 }, { "epoch": 1.355961405926947, "grad_norm": 9.708309173583984, "learning_rate": 2.8399720327692013e-08, "logits/chosen": -2.4959757328033447, "logits/rejected": -2.4798731803894043, "logps/chosen": -113.02349853515625, "logps/rejected": -128.7800750732422, "loss": 0.6297, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5734953880310059, "rewards/margins": 0.1706721931695938, "rewards/rejected": -0.7441675662994385, "step": 7870 }, { "epoch": 1.3576843556168159, "grad_norm": 11.513443946838379, "learning_rate": 2.826421414593433e-08, "logits/chosen": -2.5580294132232666, "logits/rejected": -2.5429983139038086, "logps/chosen": -108.66941833496094, "logps/rejected": -122.9479751586914, "loss": 0.6411, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5555582642555237, "rewards/margins": 0.1487930566072464, "rewards/rejected": -0.7043513059616089, "step": 7880 }, { "epoch": 1.359407305306685, "grad_norm": 10.203879356384277, "learning_rate": 2.812890452256176e-08, "logits/chosen": -2.5020322799682617, "logits/rejected": -2.482840061187744, "logps/chosen": -108.50390625, "logps/rejected": -126.86137390136719, "loss": 0.6284, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5412786602973938, "rewards/margins": 0.1748257875442505, "rewards/rejected": -0.7161044478416443, "step": 7890 }, { "epoch": 1.361130254996554, "grad_norm": 10.851054191589355, "learning_rate": 2.7993792681189583e-08, "logits/chosen": -2.4921271800994873, "logits/rejected": -2.488858699798584, "logps/chosen": -109.33931732177734, "logps/rejected": -126.16691589355469, "loss": 0.641, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5716311931610107, "rewards/margins": 0.15354785323143005, "rewards/rejected": -0.7251790761947632, "step": 7900 }, { "epoch": 1.361130254996554, "eval_logits/chosen": -2.6142539978027344, "eval_logits/rejected": -2.6076812744140625, "eval_logps/chosen": -104.2408676147461, "eval_logps/rejected": -117.97350311279297, "eval_loss": 0.6580982804298401, "eval_rewards/accuracies": 0.6217471957206726, "eval_rewards/chosen": -0.45528969168663025, "eval_rewards/margins": 0.0926441103219986, "eval_rewards/rejected": -0.54793381690979, "eval_runtime": 359.2482, "eval_samples_per_second": 11.981, "eval_steps_per_second": 1.498, "step": 7900 }, { "epoch": 1.3628532046864232, "grad_norm": 10.77269172668457, "learning_rate": 2.7858879843644666e-08, "logits/chosen": -2.495147705078125, "logits/rejected": -2.472882032394409, "logps/chosen": -114.080078125, "logps/rejected": -127.13459777832031, "loss": 0.637, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.572566568851471, "rewards/margins": 0.16227777302265167, "rewards/rejected": -0.7348443269729614, "step": 7910 }, { "epoch": 1.3645761543762922, "grad_norm": 9.462078094482422, "learning_rate": 2.7724167229954133e-08, "logits/chosen": -2.4954214096069336, "logits/rejected": -2.4742679595947266, "logps/chosen": -109.8337631225586, "logps/rejected": -122.7774658203125, "loss": 0.6309, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5501305460929871, "rewards/margins": 0.16303303837776184, "rewards/rejected": -0.7131635546684265, "step": 7920 }, { "epoch": 1.3662991040661612, "grad_norm": 12.405163764953613, "learning_rate": 2.758965605833453e-08, "logits/chosen": -2.545825481414795, "logits/rejected": -2.5186660289764404, "logps/chosen": -116.6957778930664, "logps/rejected": -126.533203125, "loss": 0.6346, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5836119651794434, "rewards/margins": 0.1577313393354416, "rewards/rejected": -0.7413433194160461, "step": 7930 }, { "epoch": 1.3680220537560304, "grad_norm": 11.84246826171875, "learning_rate": 2.745534754518075e-08, "logits/chosen": -2.450265407562256, "logits/rejected": -2.4336650371551514, "logps/chosen": -114.76383209228516, "logps/rejected": -131.10342407226562, "loss": 0.6262, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5620352029800415, "rewards/margins": 0.1967960149049759, "rewards/rejected": -0.7588313221931458, "step": 7940 }, { "epoch": 1.3697450034458993, "grad_norm": 10.144691467285156, "learning_rate": 2.732124290505501e-08, "logits/chosen": -2.4120466709136963, "logits/rejected": -2.4035420417785645, "logps/chosen": -106.06107330322266, "logps/rejected": -125.48957824707031, "loss": 0.6304, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5455502271652222, "rewards/margins": 0.17666089534759521, "rewards/rejected": -0.7222111821174622, "step": 7950 }, { "epoch": 1.3714679531357685, "grad_norm": 9.811049461364746, "learning_rate": 2.7187343350675906e-08, "logits/chosen": -2.429908514022827, "logits/rejected": -2.4045462608337402, "logps/chosen": -106.56965637207031, "logps/rejected": -118.9483871459961, "loss": 0.6675, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5609061121940613, "rewards/margins": 0.10372792184352875, "rewards/rejected": -0.6646339297294617, "step": 7960 }, { "epoch": 1.3731909028256375, "grad_norm": 8.188979148864746, "learning_rate": 2.705365009290741e-08, "logits/chosen": -2.5196266174316406, "logits/rejected": -2.4908502101898193, "logps/chosen": -115.40313720703125, "logps/rejected": -124.7549057006836, "loss": 0.6544, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6000576615333557, "rewards/margins": 0.1267736852169037, "rewards/rejected": -0.7268313765525818, "step": 7970 }, { "epoch": 1.3749138525155065, "grad_norm": 12.60647201538086, "learning_rate": 2.6920164340747976e-08, "logits/chosen": -2.567558765411377, "logits/rejected": -2.553401470184326, "logps/chosen": -111.7811508178711, "logps/rejected": -127.6240234375, "loss": 0.6324, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5490156412124634, "rewards/margins": 0.16616667807102203, "rewards/rejected": -0.715182363986969, "step": 7980 }, { "epoch": 1.3766368022053757, "grad_norm": 9.503989219665527, "learning_rate": 2.678688730131946e-08, "logits/chosen": -2.581275701522827, "logits/rejected": -2.568692445755005, "logps/chosen": -105.19342041015625, "logps/rejected": -123.72818756103516, "loss": 0.6307, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5452620387077332, "rewards/margins": 0.17543041706085205, "rewards/rejected": -0.7206924557685852, "step": 7990 }, { "epoch": 1.3783597518952446, "grad_norm": 10.40451717376709, "learning_rate": 2.665382017985649e-08, "logits/chosen": -2.4500977993011475, "logits/rejected": -2.439554214477539, "logps/chosen": -107.6703109741211, "logps/rejected": -127.6657485961914, "loss": 0.6228, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5315279364585876, "rewards/margins": 0.18270538747310638, "rewards/rejected": -0.7142332792282104, "step": 8000 }, { "epoch": 1.3783597518952446, "eval_logits/chosen": -2.610853910446167, "eval_logits/rejected": -2.604253053665161, "eval_logps/chosen": -104.54552459716797, "eval_logps/rejected": -118.3795166015625, "eval_loss": 0.6577586531639099, "eval_rewards/accuracies": 0.6215148568153381, "eval_rewards/chosen": -0.4583362340927124, "eval_rewards/margins": 0.0936574935913086, "eval_rewards/rejected": -0.5519937872886658, "eval_runtime": 359.5438, "eval_samples_per_second": 11.971, "eval_steps_per_second": 1.496, "step": 8000 }, { "epoch": 1.3800827015851138, "grad_norm": 9.779748916625977, "learning_rate": 2.6520964179695206e-08, "logits/chosen": -2.483043909072876, "logits/rejected": -2.461374521255493, "logps/chosen": -112.08184814453125, "logps/rejected": -120.54051208496094, "loss": 0.6427, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5492201447486877, "rewards/margins": 0.1319514662027359, "rewards/rejected": -0.6811715960502625, "step": 8010 }, { "epoch": 1.3818056512749828, "grad_norm": 10.46135139465332, "learning_rate": 2.638832050226273e-08, "logits/chosen": -2.4984774589538574, "logits/rejected": -2.4609427452087402, "logps/chosen": -120.8825454711914, "logps/rejected": -130.7781219482422, "loss": 0.6356, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6064472198486328, "rewards/margins": 0.153905987739563, "rewards/rejected": -0.7603532075881958, "step": 8020 }, { "epoch": 1.3835286009648518, "grad_norm": 9.138134002685547, "learning_rate": 2.6255890347065978e-08, "logits/chosen": -2.489896297454834, "logits/rejected": -2.475051164627075, "logps/chosen": -104.41996002197266, "logps/rejected": -124.8290786743164, "loss": 0.6254, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5327662229537964, "rewards/margins": 0.18135295808315277, "rewards/rejected": -0.7141191959381104, "step": 8030 }, { "epoch": 1.385251550654721, "grad_norm": 9.862136840820312, "learning_rate": 2.6123674911681077e-08, "logits/chosen": -2.454495668411255, "logits/rejected": -2.4251887798309326, "logps/chosen": -113.9755630493164, "logps/rejected": -127.5917739868164, "loss": 0.6385, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5954891443252563, "rewards/margins": 0.16014710068702698, "rewards/rejected": -0.7556362748146057, "step": 8040 }, { "epoch": 1.38697450034459, "grad_norm": 8.39928913116455, "learning_rate": 2.5991675391742373e-08, "logits/chosen": -2.5200531482696533, "logits/rejected": -2.5020241737365723, "logps/chosen": -109.97434997558594, "logps/rejected": -128.35328674316406, "loss": 0.629, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5486682653427124, "rewards/margins": 0.1764025092124939, "rewards/rejected": -0.7250707745552063, "step": 8050 }, { "epoch": 1.388697450034459, "grad_norm": 9.907453536987305, "learning_rate": 2.5859892980931707e-08, "logits/chosen": -2.5607526302337646, "logits/rejected": -2.5263822078704834, "logps/chosen": -115.3295669555664, "logps/rejected": -122.81480407714844, "loss": 0.6529, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5999096632003784, "rewards/margins": 0.12736575305461884, "rewards/rejected": -0.7272753119468689, "step": 8060 }, { "epoch": 1.390420399724328, "grad_norm": 8.719491004943848, "learning_rate": 2.5728328870967553e-08, "logits/chosen": -2.472240924835205, "logits/rejected": -2.443647623062134, "logps/chosen": -111.67752838134766, "logps/rejected": -123.70146179199219, "loss": 0.6365, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5821923017501831, "rewards/margins": 0.15517497062683105, "rewards/rejected": -0.7373672723770142, "step": 8070 }, { "epoch": 1.392143349414197, "grad_norm": 9.55966567993164, "learning_rate": 2.5596984251594288e-08, "logits/chosen": -2.458172559738159, "logits/rejected": -2.45215106010437, "logps/chosen": -105.4540786743164, "logps/rejected": -128.3647003173828, "loss": 0.6257, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5246763229370117, "rewards/margins": 0.179347425699234, "rewards/rejected": -0.7040236592292786, "step": 8080 }, { "epoch": 1.3938662991040662, "grad_norm": 8.97030258178711, "learning_rate": 2.546586031057142e-08, "logits/chosen": -2.441591262817383, "logits/rejected": -2.415339946746826, "logps/chosen": -109.4200439453125, "logps/rejected": -126.6836166381836, "loss": 0.6242, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5630167722702026, "rewards/margins": 0.18928496539592743, "rewards/rejected": -0.752301812171936, "step": 8090 }, { "epoch": 1.3955892487939352, "grad_norm": 9.099763870239258, "learning_rate": 2.5334958233662874e-08, "logits/chosen": -2.4939823150634766, "logits/rejected": -2.486513137817383, "logps/chosen": -112.15140533447266, "logps/rejected": -130.70358276367188, "loss": 0.641, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5806952714920044, "rewards/margins": 0.1454145610332489, "rewards/rejected": -0.7261097431182861, "step": 8100 }, { "epoch": 1.3955892487939352, "eval_logits/chosen": -2.6063179969787598, "eval_logits/rejected": -2.599682092666626, "eval_logps/chosen": -105.29100799560547, "eval_logps/rejected": -119.1444320678711, "eval_loss": 0.6578975915908813, "eval_rewards/accuracies": 0.6177973747253418, "eval_rewards/chosen": -0.46579110622406006, "eval_rewards/margins": 0.09385194629430771, "eval_rewards/rejected": -0.559643030166626, "eval_runtime": 359.7237, "eval_samples_per_second": 11.965, "eval_steps_per_second": 1.496, "step": 8100 }, { "epoch": 1.3973121984838044, "grad_norm": 10.305115699768066, "learning_rate": 2.5204279204626135e-08, "logits/chosen": -2.5142409801483154, "logits/rejected": -2.4806110858917236, "logps/chosen": -113.26153564453125, "logps/rejected": -122.00248718261719, "loss": 0.6423, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5719997882843018, "rewards/margins": 0.13758215308189392, "rewards/rejected": -0.7095819711685181, "step": 8110 }, { "epoch": 1.3990351481736734, "grad_norm": 11.399801254272461, "learning_rate": 2.5073824405201815e-08, "logits/chosen": -2.4770634174346924, "logits/rejected": -2.4584784507751465, "logps/chosen": -108.218994140625, "logps/rejected": -122.0374984741211, "loss": 0.6382, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5411659479141235, "rewards/margins": 0.15037959814071655, "rewards/rejected": -0.6915455460548401, "step": 8120 }, { "epoch": 1.4007580978635423, "grad_norm": 9.684117317199707, "learning_rate": 2.494359501510265e-08, "logits/chosen": -2.4873130321502686, "logits/rejected": -2.4719839096069336, "logps/chosen": -108.14006042480469, "logps/rejected": -119.9603042602539, "loss": 0.6353, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5331994891166687, "rewards/margins": 0.16038167476654053, "rewards/rejected": -0.6935811638832092, "step": 8130 }, { "epoch": 1.4024810475534115, "grad_norm": 10.679859161376953, "learning_rate": 2.4813592212003055e-08, "logits/chosen": -2.4541687965393066, "logits/rejected": -2.4306445121765137, "logps/chosen": -111.33210754394531, "logps/rejected": -127.0435791015625, "loss": 0.6391, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5860517621040344, "rewards/margins": 0.15630805492401123, "rewards/rejected": -0.7423598170280457, "step": 8140 }, { "epoch": 1.4042039972432805, "grad_norm": 10.034843444824219, "learning_rate": 2.4683817171528393e-08, "logits/chosen": -2.5279853343963623, "logits/rejected": -2.508397340774536, "logps/chosen": -109.01942443847656, "logps/rejected": -127.464111328125, "loss": 0.6338, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5687164068222046, "rewards/margins": 0.17518633604049683, "rewards/rejected": -0.7439028024673462, "step": 8150 }, { "epoch": 1.4059269469331497, "grad_norm": 8.829487800598145, "learning_rate": 2.4554271067244347e-08, "logits/chosen": -2.4655117988586426, "logits/rejected": -2.4463486671447754, "logps/chosen": -113.00874328613281, "logps/rejected": -130.19166564941406, "loss": 0.6101, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5699566006660461, "rewards/margins": 0.2090790569782257, "rewards/rejected": -0.7790356874465942, "step": 8160 }, { "epoch": 1.4076498966230186, "grad_norm": 9.293244361877441, "learning_rate": 2.4424955070646314e-08, "logits/chosen": -2.533050537109375, "logits/rejected": -2.508591890335083, "logps/chosen": -114.20674896240234, "logps/rejected": -123.04630279541016, "loss": 0.6294, "rewards/accuracies": 0.625, "rewards/chosen": -0.5449591875076294, "rewards/margins": 0.1659817099571228, "rewards/rejected": -0.7109408378601074, "step": 8170 }, { "epoch": 1.4093728463128876, "grad_norm": 9.863910675048828, "learning_rate": 2.4295870351148807e-08, "logits/chosen": -2.516082763671875, "logits/rejected": -2.4976694583892822, "logps/chosen": -117.4303970336914, "logps/rejected": -127.5738754272461, "loss": 0.6541, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6113814115524292, "rewards/margins": 0.11424535512924194, "rewards/rejected": -0.7256268262863159, "step": 8180 }, { "epoch": 1.4110957960027566, "grad_norm": 10.611021041870117, "learning_rate": 2.41670180760749e-08, "logits/chosen": -2.48005747795105, "logits/rejected": -2.4579241275787354, "logps/chosen": -118.31016540527344, "logps/rejected": -127.42778015136719, "loss": 0.6571, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6033154726028442, "rewards/margins": 0.13233919441699982, "rewards/rejected": -0.7356546521186829, "step": 8190 }, { "epoch": 1.4128187456926258, "grad_norm": 11.68910026550293, "learning_rate": 2.4038399410645588e-08, "logits/chosen": -2.50583553314209, "logits/rejected": -2.475226640701294, "logps/chosen": -116.03187561035156, "logps/rejected": -126.24421691894531, "loss": 0.6504, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6103672981262207, "rewards/margins": 0.12656204402446747, "rewards/rejected": -0.7369293570518494, "step": 8200 }, { "epoch": 1.4128187456926258, "eval_logits/chosen": -2.6040306091308594, "eval_logits/rejected": -2.597447633743286, "eval_logps/chosen": -105.78633880615234, "eval_logps/rejected": -119.84130859375, "eval_loss": 0.6571367979049683, "eval_rewards/accuracies": 0.6212825179100037, "eval_rewards/chosen": -0.47074440121650696, "eval_rewards/margins": 0.09586748480796814, "eval_rewards/rejected": -0.5666118860244751, "eval_runtime": 359.4352, "eval_samples_per_second": 11.974, "eval_steps_per_second": 1.497, "step": 8200 }, { "epoch": 1.414541695382495, "grad_norm": 11.93120288848877, "learning_rate": 2.3910015517969434e-08, "logits/chosen": -2.3909761905670166, "logits/rejected": -2.373305082321167, "logps/chosen": -114.6285629272461, "logps/rejected": -131.95509338378906, "loss": 0.6334, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5952364802360535, "rewards/margins": 0.16276808083057404, "rewards/rejected": -0.7580045461654663, "step": 8210 }, { "epoch": 1.416264645072364, "grad_norm": 10.773359298706055, "learning_rate": 2.3781867559031794e-08, "logits/chosen": -2.5130276679992676, "logits/rejected": -2.4877219200134277, "logps/chosen": -110.43150329589844, "logps/rejected": -126.22157287597656, "loss": 0.6344, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5733839869499207, "rewards/margins": 0.15835942327976227, "rewards/rejected": -0.7317434549331665, "step": 8220 }, { "epoch": 1.417987594762233, "grad_norm": 9.052794456481934, "learning_rate": 2.3653956692684602e-08, "logits/chosen": -2.477832794189453, "logits/rejected": -2.462078332901001, "logps/chosen": -115.74124908447266, "logps/rejected": -124.70304870605469, "loss": 0.6514, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6049267053604126, "rewards/margins": 0.1287112683057785, "rewards/rejected": -0.7336378693580627, "step": 8230 }, { "epoch": 1.4197105444521019, "grad_norm": 11.35895824432373, "learning_rate": 2.352628407563561e-08, "logits/chosen": -2.576815128326416, "logits/rejected": -2.551685094833374, "logps/chosen": -116.01173400878906, "logps/rejected": -126.32490539550781, "loss": 0.6397, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5983807444572449, "rewards/margins": 0.14644809067249298, "rewards/rejected": -0.7448288202285767, "step": 8240 }, { "epoch": 1.421433494141971, "grad_norm": 9.576996803283691, "learning_rate": 2.339885086243816e-08, "logits/chosen": -2.509434223175049, "logits/rejected": -2.4891815185546875, "logps/chosen": -113.9859390258789, "logps/rejected": -128.5716094970703, "loss": 0.6249, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5633655786514282, "rewards/margins": 0.17911618947982788, "rewards/rejected": -0.7424817681312561, "step": 8250 }, { "epoch": 1.42315644383184, "grad_norm": 9.92259407043457, "learning_rate": 2.3271658205480586e-08, "logits/chosen": -2.587524890899658, "logits/rejected": -2.5685718059539795, "logps/chosen": -113.36087799072266, "logps/rejected": -133.5902099609375, "loss": 0.6148, "rewards/accuracies": 0.75, "rewards/chosen": -0.5694628953933716, "rewards/margins": 0.21361835300922394, "rewards/rejected": -0.7830812335014343, "step": 8260 }, { "epoch": 1.4248793935217092, "grad_norm": 11.390279769897461, "learning_rate": 2.3144707254975898e-08, "logits/chosen": -2.5706088542938232, "logits/rejected": -2.5607664585113525, "logps/chosen": -112.4377670288086, "logps/rejected": -129.7074432373047, "loss": 0.6412, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5904213190078735, "rewards/margins": 0.15297070145606995, "rewards/rejected": -0.7433920502662659, "step": 8270 }, { "epoch": 1.4266023432115782, "grad_norm": 9.951886177062988, "learning_rate": 2.3017999158951305e-08, "logits/chosen": -2.4798500537872314, "logits/rejected": -2.4695355892181396, "logps/chosen": -115.2122573852539, "logps/rejected": -127.40425872802734, "loss": 0.6564, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6283519864082336, "rewards/margins": 0.11694036424160004, "rewards/rejected": -0.7452923059463501, "step": 8280 }, { "epoch": 1.4283252929014472, "grad_norm": 9.370098114013672, "learning_rate": 2.2891535063237886e-08, "logits/chosen": -2.4599788188934326, "logits/rejected": -2.414714813232422, "logps/chosen": -116.16448974609375, "logps/rejected": -124.69068908691406, "loss": 0.6398, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5622825622558594, "rewards/margins": 0.14268946647644043, "rewards/rejected": -0.704971969127655, "step": 8290 }, { "epoch": 1.4300482425913164, "grad_norm": 8.352498054504395, "learning_rate": 2.2765316111460193e-08, "logits/chosen": -2.627147674560547, "logits/rejected": -2.605398178100586, "logps/chosen": -116.4769058227539, "logps/rejected": -127.98457336425781, "loss": 0.6472, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5889750719070435, "rewards/margins": 0.14557085931301117, "rewards/rejected": -0.7345459461212158, "step": 8300 }, { "epoch": 1.4300482425913164, "eval_logits/chosen": -2.6018471717834473, "eval_logits/rejected": -2.5952625274658203, "eval_logps/chosen": -105.32196044921875, "eval_logps/rejected": -119.30452728271484, "eval_loss": 0.6573455929756165, "eval_rewards/accuracies": 0.6217471957206726, "eval_rewards/chosen": -0.46610066294670105, "eval_rewards/margins": 0.0951433777809143, "eval_rewards/rejected": -0.5612440705299377, "eval_runtime": 359.5035, "eval_samples_per_second": 11.972, "eval_steps_per_second": 1.497, "step": 8300 }, { "epoch": 1.4317711922811853, "grad_norm": 12.541621208190918, "learning_rate": 2.2639343445025914e-08, "logits/chosen": -2.4146697521209717, "logits/rejected": -2.3866145610809326, "logps/chosen": -108.785888671875, "logps/rejected": -118.2060317993164, "loss": 0.6407, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5251675844192505, "rewards/margins": 0.1453622281551361, "rewards/rejected": -0.6705297231674194, "step": 8310 }, { "epoch": 1.4334941419710545, "grad_norm": 9.450784683227539, "learning_rate": 2.2513618203115585e-08, "logits/chosen": -2.483109951019287, "logits/rejected": -2.468421220779419, "logps/chosen": -104.41475677490234, "logps/rejected": -123.78426361083984, "loss": 0.6265, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5190132260322571, "rewards/margins": 0.17828358709812164, "rewards/rejected": -0.6972967982292175, "step": 8320 }, { "epoch": 1.4352170916609235, "grad_norm": 10.334905624389648, "learning_rate": 2.2388141522672265e-08, "logits/chosen": -2.497607469558716, "logits/rejected": -2.4882454872131348, "logps/chosen": -105.68269348144531, "logps/rejected": -125.55326843261719, "loss": 0.6422, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5706783533096313, "rewards/margins": 0.14622056484222412, "rewards/rejected": -0.7168989181518555, "step": 8330 }, { "epoch": 1.4369400413507925, "grad_norm": 9.96259593963623, "learning_rate": 2.22629145383912e-08, "logits/chosen": -2.5489068031311035, "logits/rejected": -2.5338613986968994, "logps/chosen": -111.05059814453125, "logps/rejected": -123.3394546508789, "loss": 0.6394, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.566070556640625, "rewards/margins": 0.15070785582065582, "rewards/rejected": -0.7167783975601196, "step": 8340 }, { "epoch": 1.4386629910406616, "grad_norm": 10.026483535766602, "learning_rate": 2.213793838270966e-08, "logits/chosen": -2.3980913162231445, "logits/rejected": -2.3814735412597656, "logps/chosen": -115.02793884277344, "logps/rejected": -126.07624816894531, "loss": 0.6536, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6185691356658936, "rewards/margins": 0.11624781787395477, "rewards/rejected": -0.7348170280456543, "step": 8350 }, { "epoch": 1.4403859407305306, "grad_norm": 11.445932388305664, "learning_rate": 2.2013214185796653e-08, "logits/chosen": -2.3885562419891357, "logits/rejected": -2.355684757232666, "logps/chosen": -111.50667572021484, "logps/rejected": -127.6391372680664, "loss": 0.6232, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5695258975028992, "rewards/margins": 0.18785393238067627, "rewards/rejected": -0.7573798894882202, "step": 8360 }, { "epoch": 1.4421088904203998, "grad_norm": 10.501673698425293, "learning_rate": 2.1888743075542692e-08, "logits/chosen": -2.4620909690856934, "logits/rejected": -2.44533109664917, "logps/chosen": -113.21112060546875, "logps/rejected": -130.18067932128906, "loss": 0.6461, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6215483546257019, "rewards/margins": 0.14412672817707062, "rewards/rejected": -0.7656751871109009, "step": 8370 }, { "epoch": 1.4438318401102688, "grad_norm": 8.78179931640625, "learning_rate": 2.1764526177549618e-08, "logits/chosen": -2.5779061317443848, "logits/rejected": -2.5591421127319336, "logps/chosen": -104.83152770996094, "logps/rejected": -125.90281677246094, "loss": 0.6241, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5359030961990356, "rewards/margins": 0.1816587895154953, "rewards/rejected": -0.7175619602203369, "step": 8380 }, { "epoch": 1.4455547898001377, "grad_norm": 8.149552345275879, "learning_rate": 2.1640564615120394e-08, "logits/chosen": -2.5387609004974365, "logits/rejected": -2.508159637451172, "logps/chosen": -111.774658203125, "logps/rejected": -135.379638671875, "loss": 0.5986, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5507842302322388, "rewards/margins": 0.241925448179245, "rewards/rejected": -0.7927096486091614, "step": 8390 }, { "epoch": 1.447277739490007, "grad_norm": 10.88570499420166, "learning_rate": 2.151685950924898e-08, "logits/chosen": -2.5128719806671143, "logits/rejected": -2.4928791522979736, "logps/chosen": -104.6343765258789, "logps/rejected": -121.88565826416016, "loss": 0.6298, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5432831645011902, "rewards/margins": 0.1677609086036682, "rewards/rejected": -0.7110441327095032, "step": 8400 }, { "epoch": 1.447277739490007, "eval_logits/chosen": -2.599398374557495, "eval_logits/rejected": -2.5927748680114746, "eval_logps/chosen": -104.80558776855469, "eval_logps/rejected": -118.77681732177734, "eval_loss": 0.6573162078857422, "eval_rewards/accuracies": 0.6205855011940002, "eval_rewards/chosen": -0.460936963558197, "eval_rewards/margins": 0.09503000974655151, "eval_rewards/rejected": -0.5559669733047485, "eval_runtime": 359.7437, "eval_samples_per_second": 11.964, "eval_steps_per_second": 1.496, "step": 8400 }, { "epoch": 1.449000689179876, "grad_norm": 8.112527847290039, "learning_rate": 2.1393411978610172e-08, "logits/chosen": -2.4637961387634277, "logits/rejected": -2.4320006370544434, "logps/chosen": -113.9928207397461, "logps/rejected": -131.36863708496094, "loss": 0.6049, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5633238554000854, "rewards/margins": 0.23221902549266815, "rewards/rejected": -0.7955428957939148, "step": 8410 }, { "epoch": 1.450723638869745, "grad_norm": 10.700355529785156, "learning_rate": 2.1270223139549508e-08, "logits/chosen": -2.3602561950683594, "logits/rejected": -2.3470911979675293, "logps/chosen": -106.328369140625, "logps/rejected": -125.5106430053711, "loss": 0.6249, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5467656254768372, "rewards/margins": 0.19570481777191162, "rewards/rejected": -0.7424705624580383, "step": 8420 }, { "epoch": 1.452446588559614, "grad_norm": 11.980216979980469, "learning_rate": 2.1147294106073104e-08, "logits/chosen": -2.4795050621032715, "logits/rejected": -2.4644033908843994, "logps/chosen": -109.48161315917969, "logps/rejected": -128.14706420898438, "loss": 0.638, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5599022507667542, "rewards/margins": 0.15501144528388977, "rewards/rejected": -0.7149137258529663, "step": 8430 }, { "epoch": 1.454169538249483, "grad_norm": 10.321578025817871, "learning_rate": 2.102462598983773e-08, "logits/chosen": -2.5045294761657715, "logits/rejected": -2.489931583404541, "logps/chosen": -116.15422058105469, "logps/rejected": -124.9060287475586, "loss": 0.6552, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6136160492897034, "rewards/margins": 0.12014230340719223, "rewards/rejected": -0.7337583303451538, "step": 8440 }, { "epoch": 1.4558924879393522, "grad_norm": 10.292564392089844, "learning_rate": 2.090221990014055e-08, "logits/chosen": -2.392467498779297, "logits/rejected": -2.3858132362365723, "logps/chosen": -105.68475341796875, "logps/rejected": -129.52902221679688, "loss": 0.6235, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5767289996147156, "rewards/margins": 0.1895083636045456, "rewards/rejected": -0.7662373781204224, "step": 8450 }, { "epoch": 1.4576154376292212, "grad_norm": 10.444774627685547, "learning_rate": 2.078007694390932e-08, "logits/chosen": -2.5114336013793945, "logits/rejected": -2.4861104488372803, "logps/chosen": -110.15858459472656, "logps/rejected": -128.76162719726562, "loss": 0.6275, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5735193490982056, "rewards/margins": 0.1767922192811966, "rewards/rejected": -0.750311553478241, "step": 8460 }, { "epoch": 1.4593383873190904, "grad_norm": 10.15573787689209, "learning_rate": 2.0658198225692143e-08, "logits/chosen": -2.4851930141448975, "logits/rejected": -2.4597885608673096, "logps/chosen": -116.81849670410156, "logps/rejected": -128.00364685058594, "loss": 0.6459, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6075204014778137, "rewards/margins": 0.14105847477912903, "rewards/rejected": -0.7485788464546204, "step": 8470 }, { "epoch": 1.4610613370089593, "grad_norm": 10.597168922424316, "learning_rate": 2.0536584847647675e-08, "logits/chosen": -2.478397846221924, "logits/rejected": -2.4498822689056396, "logps/chosen": -112.1146011352539, "logps/rejected": -126.52950286865234, "loss": 0.6381, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5907739996910095, "rewards/margins": 0.1586400866508484, "rewards/rejected": -0.7494140863418579, "step": 8480 }, { "epoch": 1.4627842866988283, "grad_norm": 7.842881202697754, "learning_rate": 2.041523790953506e-08, "logits/chosen": -2.5317792892456055, "logits/rejected": -2.5239269733428955, "logps/chosen": -111.8382339477539, "logps/rejected": -130.12106323242188, "loss": 0.6354, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5709823369979858, "rewards/margins": 0.15821108222007751, "rewards/rejected": -0.7291934490203857, "step": 8490 }, { "epoch": 1.4645072363886975, "grad_norm": 9.383299827575684, "learning_rate": 2.0294158508704e-08, "logits/chosen": -2.409794569015503, "logits/rejected": -2.395017147064209, "logps/chosen": -109.6671142578125, "logps/rejected": -127.86033630371094, "loss": 0.6207, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5536033511161804, "rewards/margins": 0.19054511189460754, "rewards/rejected": -0.7441484928131104, "step": 8500 }, { "epoch": 1.4645072363886975, "eval_logits/chosen": -2.595156669616699, "eval_logits/rejected": -2.5885396003723145, "eval_logps/chosen": -104.50138092041016, "eval_logps/rejected": -118.48872375488281, "eval_loss": 0.6572505235671997, "eval_rewards/accuracies": 0.6180297136306763, "eval_rewards/chosen": -0.45789483189582825, "eval_rewards/margins": 0.09519128501415253, "eval_rewards/rejected": -0.5530860424041748, "eval_runtime": 359.686, "eval_samples_per_second": 11.966, "eval_steps_per_second": 1.496, "step": 8500 }, { "epoch": 1.4662301860785665, "grad_norm": 11.489233016967773, "learning_rate": 2.017334774008484e-08, "logits/chosen": -2.4692511558532715, "logits/rejected": -2.4383511543273926, "logps/chosen": -114.93684387207031, "logps/rejected": -128.41346740722656, "loss": 0.6178, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5708756446838379, "rewards/margins": 0.19799299538135529, "rewards/rejected": -0.7688685655593872, "step": 8510 }, { "epoch": 1.4679531357684357, "grad_norm": 10.60742473602295, "learning_rate": 2.0052806696178658e-08, "logits/chosen": -2.4626574516296387, "logits/rejected": -2.4323716163635254, "logps/chosen": -109.2204360961914, "logps/rejected": -121.94303894042969, "loss": 0.6338, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5565052032470703, "rewards/margins": 0.1759699583053589, "rewards/rejected": -0.7324752807617188, "step": 8520 }, { "epoch": 1.4696760854583046, "grad_norm": 9.371819496154785, "learning_rate": 1.99325364670474e-08, "logits/chosen": -2.4444479942321777, "logits/rejected": -2.427464246749878, "logps/chosen": -113.06219482421875, "logps/rejected": -122.6248550415039, "loss": 0.6539, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5917520523071289, "rewards/margins": 0.12022414058446884, "rewards/rejected": -0.7119762301445007, "step": 8530 }, { "epoch": 1.4713990351481736, "grad_norm": 9.851543426513672, "learning_rate": 1.9812538140304008e-08, "logits/chosen": -2.568233013153076, "logits/rejected": -2.5535032749176025, "logps/chosen": -110.19102478027344, "logps/rejected": -123.43714904785156, "loss": 0.6447, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.554968535900116, "rewards/margins": 0.13506591320037842, "rewards/rejected": -0.6900344491004944, "step": 8540 }, { "epoch": 1.4731219848380428, "grad_norm": 9.308712005615234, "learning_rate": 1.9692812801102615e-08, "logits/chosen": -2.4927818775177, "logits/rejected": -2.48159122467041, "logps/chosen": -111.54307556152344, "logps/rejected": -132.26345825195312, "loss": 0.6221, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5770515203475952, "rewards/margins": 0.1922859102487564, "rewards/rejected": -0.7693374752998352, "step": 8550 }, { "epoch": 1.4748449345279118, "grad_norm": 9.304411888122559, "learning_rate": 1.9573361532128635e-08, "logits/chosen": -2.578611373901367, "logits/rejected": -2.5568089485168457, "logps/chosen": -111.9488525390625, "logps/rejected": -129.10482788085938, "loss": 0.6248, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5774158239364624, "rewards/margins": 0.19028854370117188, "rewards/rejected": -0.7677043080329895, "step": 8560 }, { "epoch": 1.476567884217781, "grad_norm": 10.689298629760742, "learning_rate": 1.9454185413589108e-08, "logits/chosen": -2.439042568206787, "logits/rejected": -2.4161269664764404, "logps/chosen": -112.94194030761719, "logps/rejected": -128.4320068359375, "loss": 0.6274, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.585848867893219, "rewards/margins": 0.17812153697013855, "rewards/rejected": -0.7639704346656799, "step": 8570 }, { "epoch": 1.47829083390765, "grad_norm": 10.238391876220703, "learning_rate": 1.9335285523202867e-08, "logits/chosen": -2.5246376991271973, "logits/rejected": -2.5036513805389404, "logps/chosen": -106.84709167480469, "logps/rejected": -129.79830932617188, "loss": 0.6038, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5271083116531372, "rewards/margins": 0.22541937232017517, "rewards/rejected": -0.7525277137756348, "step": 8580 }, { "epoch": 1.480013783597519, "grad_norm": 8.995641708374023, "learning_rate": 1.9216662936190753e-08, "logits/chosen": -2.5343661308288574, "logits/rejected": -2.506255626678467, "logps/chosen": -107.98847961425781, "logps/rejected": -123.33349609375, "loss": 0.617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5099393129348755, "rewards/margins": 0.18957418203353882, "rewards/rejected": -0.6995135545730591, "step": 8590 }, { "epoch": 1.481736733287388, "grad_norm": 10.315347671508789, "learning_rate": 1.909831872526597e-08, "logits/chosen": -2.4555325508117676, "logits/rejected": -2.4294888973236084, "logps/chosen": -112.75224304199219, "logps/rejected": -121.38525390625, "loss": 0.6661, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5931397676467896, "rewards/margins": 0.10323586314916611, "rewards/rejected": -0.6963757276535034, "step": 8600 }, { "epoch": 1.481736733287388, "eval_logits/chosen": -2.591291904449463, "eval_logits/rejected": -2.5845961570739746, "eval_logps/chosen": -105.10528564453125, "eval_logps/rejected": -119.16321563720703, "eval_loss": 0.6571276187896729, "eval_rewards/accuracies": 0.6203531622886658, "eval_rewards/chosen": -0.46393388509750366, "eval_rewards/margins": 0.09589700400829315, "eval_rewards/rejected": -0.559830904006958, "eval_runtime": 359.5964, "eval_samples_per_second": 11.969, "eval_steps_per_second": 1.496, "step": 8600 }, { "epoch": 1.483459682977257, "grad_norm": 9.687759399414062, "learning_rate": 1.898025396062433e-08, "logits/chosen": -2.4798989295959473, "logits/rejected": -2.459146499633789, "logps/chosen": -111.55943298339844, "logps/rejected": -124.44791412353516, "loss": 0.6374, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5741453766822815, "rewards/margins": 0.1647716462612152, "rewards/rejected": -0.7389170527458191, "step": 8610 }, { "epoch": 1.4851826326671262, "grad_norm": 9.408075332641602, "learning_rate": 1.886246970993458e-08, "logits/chosen": -2.4093403816223145, "logits/rejected": -2.394554853439331, "logps/chosen": -114.43864440917969, "logps/rejected": -121.6338882446289, "loss": 0.6598, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5966447591781616, "rewards/margins": 0.10718707740306854, "rewards/rejected": -0.7038318514823914, "step": 8620 }, { "epoch": 1.4869055823569952, "grad_norm": 10.718470573425293, "learning_rate": 1.8744967038328796e-08, "logits/chosen": -2.3900675773620605, "logits/rejected": -2.3785276412963867, "logps/chosen": -103.84339904785156, "logps/rejected": -122.72688293457031, "loss": 0.6201, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5213834643363953, "rewards/margins": 0.1853039264678955, "rewards/rejected": -0.7066873908042908, "step": 8630 }, { "epoch": 1.4886285320468642, "grad_norm": 9.722565650939941, "learning_rate": 1.8627747008392626e-08, "logits/chosen": -2.521705150604248, "logits/rejected": -2.4955756664276123, "logps/chosen": -114.07643127441406, "logps/rejected": -121.75555419921875, "loss": 0.6665, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5833483934402466, "rewards/margins": 0.10640629380941391, "rewards/rejected": -0.6897546648979187, "step": 8640 }, { "epoch": 1.4903514817367332, "grad_norm": 9.940096855163574, "learning_rate": 1.8510810680155898e-08, "logits/chosen": -2.43583607673645, "logits/rejected": -2.4159131050109863, "logps/chosen": -109.96278381347656, "logps/rejected": -126.08406829833984, "loss": 0.6303, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5500949621200562, "rewards/margins": 0.17001654207706451, "rewards/rejected": -0.7201114892959595, "step": 8650 }, { "epoch": 1.4920744314266023, "grad_norm": 9.738779067993164, "learning_rate": 1.8394159111082756e-08, "logits/chosen": -2.447446346282959, "logits/rejected": -2.4335532188415527, "logps/chosen": -110.02061462402344, "logps/rejected": -126.77510833740234, "loss": 0.629, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5665866732597351, "rewards/margins": 0.18393734097480774, "rewards/rejected": -0.75052410364151, "step": 8660 }, { "epoch": 1.4937973811164715, "grad_norm": 8.617897987365723, "learning_rate": 1.8277793356062403e-08, "logits/chosen": -2.5401453971862793, "logits/rejected": -2.523151397705078, "logps/chosen": -111.16178131103516, "logps/rejected": -129.0643768310547, "loss": 0.633, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5916589498519897, "rewards/margins": 0.17111726105213165, "rewards/rejected": -0.762776255607605, "step": 8670 }, { "epoch": 1.4955203308063405, "grad_norm": 11.4242525100708, "learning_rate": 1.8161714467399248e-08, "logits/chosen": -2.4823286533355713, "logits/rejected": -2.4764370918273926, "logps/chosen": -108.46478271484375, "logps/rejected": -122.68070983886719, "loss": 0.6517, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5778648257255554, "rewards/margins": 0.121099554002285, "rewards/rejected": -0.6989644765853882, "step": 8680 }, { "epoch": 1.4972432804962095, "grad_norm": 10.568251609802246, "learning_rate": 1.8045923494803683e-08, "logits/chosen": -2.504668951034546, "logits/rejected": -2.47914457321167, "logps/chosen": -115.25125885009766, "logps/rejected": -117.3531723022461, "loss": 0.6705, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5971495509147644, "rewards/margins": 0.093296580016613, "rewards/rejected": -0.6904462575912476, "step": 8690 }, { "epoch": 1.4989662301860784, "grad_norm": 9.535359382629395, "learning_rate": 1.793042148538234e-08, "logits/chosen": -2.50766921043396, "logits/rejected": -2.476576089859009, "logps/chosen": -115.72029113769531, "logps/rejected": -124.52447509765625, "loss": 0.6475, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5922280550003052, "rewards/margins": 0.12659205496311188, "rewards/rejected": -0.7188200950622559, "step": 8700 }, { "epoch": 1.4989662301860784, "eval_logits/chosen": -2.5911717414855957, "eval_logits/rejected": -2.584584951400757, "eval_logps/chosen": -104.41332244873047, "eval_logps/rejected": -118.42513275146484, "eval_loss": 0.6571927666664124, "eval_rewards/accuracies": 0.6189591288566589, "eval_rewards/chosen": -0.4570142924785614, "eval_rewards/margins": 0.09543582797050476, "eval_rewards/rejected": -0.5524501204490662, "eval_runtime": 359.4233, "eval_samples_per_second": 11.975, "eval_steps_per_second": 1.497, "step": 8700 }, { "epoch": 1.5006891798759476, "grad_norm": 13.408405303955078, "learning_rate": 1.781520948362881e-08, "logits/chosen": -2.4691367149353027, "logits/rejected": -2.4566128253936768, "logps/chosen": -111.83473205566406, "logps/rejected": -123.60860443115234, "loss": 0.6608, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.5899911522865295, "rewards/margins": 0.1146332398056984, "rewards/rejected": -0.7046244144439697, "step": 8710 }, { "epoch": 1.5024121295658168, "grad_norm": 9.210076332092285, "learning_rate": 1.7700288531414077e-08, "logits/chosen": -2.479508876800537, "logits/rejected": -2.454876184463501, "logps/chosen": -111.58013916015625, "logps/rejected": -125.40617370605469, "loss": 0.6349, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5523253679275513, "rewards/margins": 0.1571783721446991, "rewards/rejected": -0.7095038294792175, "step": 8720 }, { "epoch": 1.5041350792556858, "grad_norm": 10.0499267578125, "learning_rate": 1.7585659667977177e-08, "logits/chosen": -2.494037389755249, "logits/rejected": -2.4638280868530273, "logps/chosen": -107.62284088134766, "logps/rejected": -125.51090240478516, "loss": 0.6281, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5535154342651367, "rewards/margins": 0.1831100732088089, "rewards/rejected": -0.7366255521774292, "step": 8730 }, { "epoch": 1.5058580289455548, "grad_norm": 10.449970245361328, "learning_rate": 1.747132392991574e-08, "logits/chosen": -2.4582109451293945, "logits/rejected": -2.4484829902648926, "logps/chosen": -110.2053451538086, "logps/rejected": -129.15306091308594, "loss": 0.6306, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5880126357078552, "rewards/margins": 0.17351998388767242, "rewards/rejected": -0.7615326046943665, "step": 8740 }, { "epoch": 1.5075809786354237, "grad_norm": 10.32069206237793, "learning_rate": 1.735728235117665e-08, "logits/chosen": -2.501255512237549, "logits/rejected": -2.4790732860565186, "logps/chosen": -108.33027648925781, "logps/rejected": -128.27000427246094, "loss": 0.6158, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.54632169008255, "rewards/margins": 0.20436473190784454, "rewards/rejected": -0.7506864666938782, "step": 8750 }, { "epoch": 1.509303928325293, "grad_norm": 8.917093276977539, "learning_rate": 1.7243535963046702e-08, "logits/chosen": -2.444776773452759, "logits/rejected": -2.4357781410217285, "logps/chosen": -104.23927307128906, "logps/rejected": -127.4343032836914, "loss": 0.6182, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5274238586425781, "rewards/margins": 0.19519765675067902, "rewards/rejected": -0.7226213812828064, "step": 8760 }, { "epoch": 1.5110268780151621, "grad_norm": 9.523360252380371, "learning_rate": 1.7130085794143213e-08, "logits/chosen": -2.4868338108062744, "logits/rejected": -2.4598746299743652, "logps/chosen": -113.57242584228516, "logps/rejected": -126.35479736328125, "loss": 0.633, "rewards/accuracies": 0.625, "rewards/chosen": -0.5849705338478088, "rewards/margins": 0.18097633123397827, "rewards/rejected": -0.7659467458724976, "step": 8770 }, { "epoch": 1.512749827705031, "grad_norm": 9.518341064453125, "learning_rate": 1.7016932870404804e-08, "logits/chosen": -2.508802652359009, "logits/rejected": -2.5059142112731934, "logps/chosen": -106.00608825683594, "logps/rejected": -129.78359985351562, "loss": 0.6202, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5691240429878235, "rewards/margins": 0.19421690702438354, "rewards/rejected": -0.763340950012207, "step": 8780 }, { "epoch": 1.5144727773949, "grad_norm": 12.33554744720459, "learning_rate": 1.6904078215082085e-08, "logits/chosen": -2.5131776332855225, "logits/rejected": -2.4882612228393555, "logps/chosen": -113.7440414428711, "logps/rejected": -125.62518310546875, "loss": 0.6415, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5882024765014648, "rewards/margins": 0.1477518081665039, "rewards/rejected": -0.7359542846679688, "step": 8790 }, { "epoch": 1.516195727084769, "grad_norm": 11.471351623535156, "learning_rate": 1.6791522848728385e-08, "logits/chosen": -2.4327633380889893, "logits/rejected": -2.402374029159546, "logps/chosen": -109.92652893066406, "logps/rejected": -120.5476303100586, "loss": 0.6476, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5492741465568542, "rewards/margins": 0.13125640153884888, "rewards/rejected": -0.6805306077003479, "step": 8800 }, { "epoch": 1.516195727084769, "eval_logits/chosen": -2.5883116722106934, "eval_logits/rejected": -2.5816092491149902, "eval_logps/chosen": -104.7545394897461, "eval_logps/rejected": -118.84388732910156, "eval_loss": 0.6569488644599915, "eval_rewards/accuracies": 0.6194238066673279, "eval_rewards/chosen": -0.4604264795780182, "eval_rewards/margins": 0.09621115028858185, "eval_rewards/rejected": -0.556637704372406, "eval_runtime": 359.5269, "eval_samples_per_second": 11.971, "eval_steps_per_second": 1.496, "step": 8800 }, { "epoch": 1.5179186767746382, "grad_norm": 11.274436950683594, "learning_rate": 1.667926778919056e-08, "logits/chosen": -2.546412706375122, "logits/rejected": -2.524261951446533, "logps/chosen": -108.3786849975586, "logps/rejected": -128.4949188232422, "loss": 0.6263, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.57418292760849, "rewards/margins": 0.18470506370067596, "rewards/rejected": -0.7588880658149719, "step": 8810 }, { "epoch": 1.5196416264645074, "grad_norm": 9.02099895477295, "learning_rate": 1.6567314051599745e-08, "logits/chosen": -2.5184104442596436, "logits/rejected": -2.498175859451294, "logps/chosen": -101.60962677001953, "logps/rejected": -120.52022552490234, "loss": 0.616, "rewards/accuracies": 0.6875, "rewards/chosen": -0.511854350566864, "rewards/margins": 0.19552382826805115, "rewards/rejected": -0.7073782682418823, "step": 8820 }, { "epoch": 1.5213645761543764, "grad_norm": 9.249839782714844, "learning_rate": 1.6455662648362217e-08, "logits/chosen": -2.3751707077026367, "logits/rejected": -2.353773832321167, "logps/chosen": -110.77812194824219, "logps/rejected": -122.40968322753906, "loss": 0.6505, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5618213415145874, "rewards/margins": 0.1348516345024109, "rewards/rejected": -0.6966729164123535, "step": 8830 }, { "epoch": 1.5230875258442453, "grad_norm": 11.191672325134277, "learning_rate": 1.6344314589150214e-08, "logits/chosen": -2.403146982192993, "logits/rejected": -2.3920133113861084, "logps/chosen": -103.38592529296875, "logps/rejected": -124.9670639038086, "loss": 0.6167, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5103756189346313, "rewards/margins": 0.2013304978609085, "rewards/rejected": -0.7117059826850891, "step": 8840 }, { "epoch": 1.5248104755341143, "grad_norm": 9.034074783325195, "learning_rate": 1.6233270880892802e-08, "logits/chosen": -2.566189765930176, "logits/rejected": -2.548550844192505, "logps/chosen": -104.69144439697266, "logps/rejected": -126.1294937133789, "loss": 0.6212, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5244103670120239, "rewards/margins": 0.19395431876182556, "rewards/rejected": -0.7183647155761719, "step": 8850 }, { "epoch": 1.5265334252239835, "grad_norm": 10.647709846496582, "learning_rate": 1.612253252776681e-08, "logits/chosen": -2.4782826900482178, "logits/rejected": -2.4569811820983887, "logps/chosen": -109.85008239746094, "logps/rejected": -128.80184936523438, "loss": 0.6183, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5666898488998413, "rewards/margins": 0.1927156150341034, "rewards/rejected": -0.7594054341316223, "step": 8860 }, { "epoch": 1.5282563749138525, "grad_norm": 11.228182792663574, "learning_rate": 1.601210053118763e-08, "logits/chosen": -2.6204450130462646, "logits/rejected": -2.5886178016662598, "logps/chosen": -112.09134674072266, "logps/rejected": -129.68994140625, "loss": 0.6196, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5532141327857971, "rewards/margins": 0.19342902302742004, "rewards/rejected": -0.7466431856155396, "step": 8870 }, { "epoch": 1.5299793246037217, "grad_norm": 10.30317211151123, "learning_rate": 1.5901975889800383e-08, "logits/chosen": -2.4214556217193604, "logits/rejected": -2.3968639373779297, "logps/chosen": -112.53263092041016, "logps/rejected": -126.5382080078125, "loss": 0.6389, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5908144116401672, "rewards/margins": 0.15907804667949677, "rewards/rejected": -0.7498924136161804, "step": 8880 }, { "epoch": 1.5317022742935906, "grad_norm": 11.484233856201172, "learning_rate": 1.5792159599470616e-08, "logits/chosen": -2.458347797393799, "logits/rejected": -2.4344797134399414, "logps/chosen": -112.7353286743164, "logps/rejected": -126.00569915771484, "loss": 0.6296, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.582409679889679, "rewards/margins": 0.16995909810066223, "rewards/rejected": -0.7523688077926636, "step": 8890 }, { "epoch": 1.5334252239834596, "grad_norm": 8.930644035339355, "learning_rate": 1.5682652653275564e-08, "logits/chosen": -2.5040814876556396, "logits/rejected": -2.475045680999756, "logps/chosen": -112.71919250488281, "logps/rejected": -127.18507385253906, "loss": 0.6336, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5783306360244751, "rewards/margins": 0.1635587066411972, "rewards/rejected": -0.7418893575668335, "step": 8900 }, { "epoch": 1.5334252239834596, "eval_logits/chosen": -2.5838677883148193, "eval_logits/rejected": -2.5771963596343994, "eval_logps/chosen": -105.63292694091797, "eval_logps/rejected": -119.80806732177734, "eval_loss": 0.656794548034668, "eval_rewards/accuracies": 0.6189591288566589, "eval_rewards/chosen": -0.4692104160785675, "eval_rewards/margins": 0.09706905484199524, "eval_rewards/rejected": -0.5662794709205627, "eval_runtime": 359.4324, "eval_samples_per_second": 11.974, "eval_steps_per_second": 1.497, "step": 8900 }, { "epoch": 1.5351481736733288, "grad_norm": 11.859126091003418, "learning_rate": 1.5573456041494926e-08, "logits/chosen": -2.4731361865997314, "logits/rejected": -2.4600205421447754, "logps/chosen": -116.18925476074219, "logps/rejected": -133.89511108398438, "loss": 0.6203, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6051359176635742, "rewards/margins": 0.20322290062904358, "rewards/rejected": -0.8083587884902954, "step": 8910 }, { "epoch": 1.5368711233631978, "grad_norm": 8.969971656799316, "learning_rate": 1.5464570751602078e-08, "logits/chosen": -2.5627894401550293, "logits/rejected": -2.5319936275482178, "logps/chosen": -115.80232238769531, "logps/rejected": -133.4971160888672, "loss": 0.6122, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5875670313835144, "rewards/margins": 0.22629371285438538, "rewards/rejected": -0.8138607740402222, "step": 8920 }, { "epoch": 1.538594073053067, "grad_norm": 9.40312385559082, "learning_rate": 1.5355997768255086e-08, "logits/chosen": -2.506324291229248, "logits/rejected": -2.476980447769165, "logps/chosen": -113.90055847167969, "logps/rejected": -125.7453842163086, "loss": 0.6358, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5921135544776917, "rewards/margins": 0.15963508188724518, "rewards/rejected": -0.7517485618591309, "step": 8930 }, { "epoch": 1.540317022742936, "grad_norm": 9.786426544189453, "learning_rate": 1.5247738073287803e-08, "logits/chosen": -2.488471508026123, "logits/rejected": -2.470641851425171, "logps/chosen": -111.13584899902344, "logps/rejected": -128.58689880371094, "loss": 0.6293, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5550108551979065, "rewards/margins": 0.17295490205287933, "rewards/rejected": -0.7279657125473022, "step": 8940 }, { "epoch": 1.5420399724328049, "grad_norm": 11.296224594116211, "learning_rate": 1.5139792645700976e-08, "logits/chosen": -2.3937854766845703, "logits/rejected": -2.3945984840393066, "logps/chosen": -107.13525390625, "logps/rejected": -126.40647888183594, "loss": 0.6402, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5303353071212769, "rewards/margins": 0.16198968887329102, "rewards/rejected": -0.6923248767852783, "step": 8950 }, { "epoch": 1.5437629221226739, "grad_norm": 10.965607643127441, "learning_rate": 1.503216246165343e-08, "logits/chosen": -2.5009124279022217, "logits/rejected": -2.4750828742980957, "logps/chosen": -118.30435943603516, "logps/rejected": -136.55319213867188, "loss": 0.6181, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6390171051025391, "rewards/margins": 0.20923519134521484, "rewards/rejected": -0.8482524156570435, "step": 8960 }, { "epoch": 1.545485871812543, "grad_norm": 11.08918285369873, "learning_rate": 1.4924848494453214e-08, "logits/chosen": -2.4640207290649414, "logits/rejected": -2.4379353523254395, "logps/chosen": -112.51423645019531, "logps/rejected": -126.09162902832031, "loss": 0.633, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5923029184341431, "rewards/margins": 0.15762947499752045, "rewards/rejected": -0.7499323487281799, "step": 8970 }, { "epoch": 1.5472088215024122, "grad_norm": 12.172833442687988, "learning_rate": 1.4817851714548745e-08, "logits/chosen": -2.5477683544158936, "logits/rejected": -2.520660400390625, "logps/chosen": -118.75528717041016, "logps/rejected": -125.38212585449219, "loss": 0.6745, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6427457332611084, "rewards/margins": 0.08407661318778992, "rewards/rejected": -0.7268223166465759, "step": 8980 }, { "epoch": 1.5489317711922812, "grad_norm": 9.766542434692383, "learning_rate": 1.4711173089520218e-08, "logits/chosen": -2.451903820037842, "logits/rejected": -2.434842586517334, "logps/chosen": -109.77362060546875, "logps/rejected": -125.96296691894531, "loss": 0.6268, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5482394099235535, "rewards/margins": 0.1711864173412323, "rewards/rejected": -0.7194257974624634, "step": 8990 }, { "epoch": 1.5506547208821502, "grad_norm": 11.284889221191406, "learning_rate": 1.4604813584070597e-08, "logits/chosen": -2.4958648681640625, "logits/rejected": -2.4693679809570312, "logps/chosen": -116.52508544921875, "logps/rejected": -130.39151000976562, "loss": 0.6282, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5911062955856323, "rewards/margins": 0.18699118494987488, "rewards/rejected": -0.7780975103378296, "step": 9000 }, { "epoch": 1.5506547208821502, "eval_logits/chosen": -2.5820982456207275, "eval_logits/rejected": -2.57542085647583, "eval_logps/chosen": -105.7962417602539, "eval_logps/rejected": -120.07608032226562, "eval_loss": 0.6563650369644165, "eval_rewards/accuracies": 0.6187267899513245, "eval_rewards/chosen": -0.4708433747291565, "eval_rewards/margins": 0.09811615198850632, "eval_rewards/rejected": -0.5689595341682434, "eval_runtime": 359.9722, "eval_samples_per_second": 11.956, "eval_steps_per_second": 1.495, "step": 9000 }, { "epoch": 1.5523776705720191, "grad_norm": 9.287176132202148, "learning_rate": 1.4498774160017102e-08, "logits/chosen": -2.4706435203552246, "logits/rejected": -2.447434902191162, "logps/chosen": -112.8412094116211, "logps/rejected": -124.1606216430664, "loss": 0.6432, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5720191597938538, "rewards/margins": 0.14104202389717102, "rewards/rejected": -0.7130612134933472, "step": 9010 }, { "epoch": 1.5541006202618883, "grad_norm": 11.95606803894043, "learning_rate": 1.4393055776282397e-08, "logits/chosen": -2.4061226844787598, "logits/rejected": -2.3893802165985107, "logps/chosen": -110.66056823730469, "logps/rejected": -129.71044921875, "loss": 0.6103, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5673651695251465, "rewards/margins": 0.22053882479667664, "rewards/rejected": -0.7879040837287903, "step": 9020 }, { "epoch": 1.5558235699517575, "grad_norm": 11.436843872070312, "learning_rate": 1.428765938888598e-08, "logits/chosen": -2.422454357147217, "logits/rejected": -2.4265215396881104, "logps/chosen": -107.36305236816406, "logps/rejected": -126.97566223144531, "loss": 0.6313, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5621106028556824, "rewards/margins": 0.1694938689470291, "rewards/rejected": -0.7316044569015503, "step": 9030 }, { "epoch": 1.5575465196416265, "grad_norm": 12.741418838500977, "learning_rate": 1.4182585950935488e-08, "logits/chosen": -2.4318161010742188, "logits/rejected": -2.412322521209717, "logps/chosen": -113.08719635009766, "logps/rejected": -127.19560241699219, "loss": 0.6486, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6191827654838562, "rewards/margins": 0.13276712596416473, "rewards/rejected": -0.7519499063491821, "step": 9040 }, { "epoch": 1.5592694693314955, "grad_norm": 10.499228477478027, "learning_rate": 1.4077836412618122e-08, "logits/chosen": -2.487452983856201, "logits/rejected": -2.470038890838623, "logps/chosen": -114.92085266113281, "logps/rejected": -127.76414489746094, "loss": 0.6475, "rewards/accuracies": 0.625, "rewards/chosen": -0.6116332411766052, "rewards/margins": 0.14158475399017334, "rewards/rejected": -0.7532179355621338, "step": 9050 }, { "epoch": 1.5609924190213644, "grad_norm": 10.750195503234863, "learning_rate": 1.3973411721192008e-08, "logits/chosen": -2.482776165008545, "logits/rejected": -2.4582302570343018, "logps/chosen": -115.44197082519531, "logps/rejected": -129.9430694580078, "loss": 0.6374, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6072896122932434, "rewards/margins": 0.16011087596416473, "rewards/rejected": -0.7674003839492798, "step": 9060 }, { "epoch": 1.5627153687112336, "grad_norm": 10.500717163085938, "learning_rate": 1.3869312820977696e-08, "logits/chosen": -2.44934344291687, "logits/rejected": -2.431421995162964, "logps/chosen": -110.8654556274414, "logps/rejected": -133.5511016845703, "loss": 0.6098, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5629990100860596, "rewards/margins": 0.22451594471931458, "rewards/rejected": -0.7875149250030518, "step": 9070 }, { "epoch": 1.5644383184011028, "grad_norm": 10.117134094238281, "learning_rate": 1.3765540653349505e-08, "logits/chosen": -2.453556776046753, "logits/rejected": -2.447422742843628, "logps/chosen": -106.34598541259766, "logps/rejected": -127.54154968261719, "loss": 0.6224, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5436528325080872, "rewards/margins": 0.17753125727176666, "rewards/rejected": -0.7211841344833374, "step": 9080 }, { "epoch": 1.5661612680909718, "grad_norm": 11.284063339233398, "learning_rate": 1.3662096156727204e-08, "logits/chosen": -2.428724765777588, "logits/rejected": -2.3964526653289795, "logps/chosen": -113.4808349609375, "logps/rejected": -122.68611907958984, "loss": 0.641, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5821224451065063, "rewards/margins": 0.1405847817659378, "rewards/rejected": -0.722707211971283, "step": 9090 }, { "epoch": 1.5678842177808407, "grad_norm": 10.751830101013184, "learning_rate": 1.3558980266567277e-08, "logits/chosen": -2.490169048309326, "logits/rejected": -2.4657349586486816, "logps/chosen": -116.18363952636719, "logps/rejected": -126.50498962402344, "loss": 0.646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5938640236854553, "rewards/margins": 0.13608434796333313, "rewards/rejected": -0.7299484014511108, "step": 9100 }, { "epoch": 1.5678842177808407, "eval_logits/chosen": -2.579942226409912, "eval_logits/rejected": -2.5732128620147705, "eval_logps/chosen": -105.95286560058594, "eval_logps/rejected": -120.22126770019531, "eval_loss": 0.6564629077911377, "eval_rewards/accuracies": 0.6187267899513245, "eval_rewards/chosen": -0.4724096655845642, "eval_rewards/margins": 0.09800174087285995, "eval_rewards/rejected": -0.5704114437103271, "eval_runtime": 359.7651, "eval_samples_per_second": 11.963, "eval_steps_per_second": 1.495, "step": 9100 }, { "epoch": 1.5696071674707097, "grad_norm": 9.482916831970215, "learning_rate": 1.345619391535472e-08, "logits/chosen": -2.490828037261963, "logits/rejected": -2.4676239490509033, "logps/chosen": -112.53236389160156, "logps/rejected": -126.99684143066406, "loss": 0.6303, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5728387236595154, "rewards/margins": 0.17117749154567719, "rewards/rejected": -0.744016170501709, "step": 9110 }, { "epoch": 1.571330117160579, "grad_norm": 12.508149147033691, "learning_rate": 1.3353738032594358e-08, "logits/chosen": -2.5123047828674316, "logits/rejected": -2.4805686473846436, "logps/chosen": -116.52827453613281, "logps/rejected": -128.24295043945312, "loss": 0.631, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5625268816947937, "rewards/margins": 0.16890086233615875, "rewards/rejected": -0.731427788734436, "step": 9120 }, { "epoch": 1.573053066850448, "grad_norm": 9.84267807006836, "learning_rate": 1.3251613544802692e-08, "logits/chosen": -2.431236505508423, "logits/rejected": -2.408712863922119, "logps/chosen": -112.70121765136719, "logps/rejected": -128.7128143310547, "loss": 0.6296, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.576823353767395, "rewards/margins": 0.16929545998573303, "rewards/rejected": -0.7461186647415161, "step": 9130 }, { "epoch": 1.574776016540317, "grad_norm": 9.542338371276855, "learning_rate": 1.3149821375499282e-08, "logits/chosen": -2.502328872680664, "logits/rejected": -2.4545814990997314, "logps/chosen": -112.37251281738281, "logps/rejected": -126.47017669677734, "loss": 0.6136, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5509093999862671, "rewards/margins": 0.22205904126167297, "rewards/rejected": -0.7729684710502625, "step": 9140 }, { "epoch": 1.576498966230186, "grad_norm": 10.919657707214355, "learning_rate": 1.3048362445198563e-08, "logits/chosen": -2.4875550270080566, "logits/rejected": -2.4600603580474854, "logps/chosen": -119.6942367553711, "logps/rejected": -136.88499450683594, "loss": 0.6158, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6136825680732727, "rewards/margins": 0.21039915084838867, "rewards/rejected": -0.8240815997123718, "step": 9150 }, { "epoch": 1.578221915920055, "grad_norm": 8.170670509338379, "learning_rate": 1.2947237671401463e-08, "logits/chosen": -2.4972095489501953, "logits/rejected": -2.4924275875091553, "logps/chosen": -103.85929107666016, "logps/rejected": -127.87440490722656, "loss": 0.6157, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5470938086509705, "rewards/margins": 0.20080165565013885, "rewards/rejected": -0.7478954195976257, "step": 9160 }, { "epoch": 1.5799448656099242, "grad_norm": 9.501344680786133, "learning_rate": 1.2846447968587087e-08, "logits/chosen": -2.5076279640197754, "logits/rejected": -2.477036237716675, "logps/chosen": -117.05989074707031, "logps/rejected": -129.93243408203125, "loss": 0.629, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6221837997436523, "rewards/margins": 0.18241745233535767, "rewards/rejected": -0.8046013116836548, "step": 9170 }, { "epoch": 1.5816678152997934, "grad_norm": 10.158044815063477, "learning_rate": 1.274599424820449e-08, "logits/chosen": -2.485940933227539, "logits/rejected": -2.467524766921997, "logps/chosen": -115.52877044677734, "logps/rejected": -127.1000747680664, "loss": 0.6454, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6079785823822021, "rewards/margins": 0.14137926697731018, "rewards/rejected": -0.7493578195571899, "step": 9180 }, { "epoch": 1.5833907649896624, "grad_norm": 10.683402061462402, "learning_rate": 1.2645877418664391e-08, "logits/chosen": -2.587747097015381, "logits/rejected": -2.582094669342041, "logps/chosen": -116.20283508300781, "logps/rejected": -136.19113159179688, "loss": 0.6294, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6204559206962585, "rewards/margins": 0.17218129336833954, "rewards/rejected": -0.7926372289657593, "step": 9190 }, { "epoch": 1.5851137146795313, "grad_norm": 11.139490127563477, "learning_rate": 1.2546098385331006e-08, "logits/chosen": -2.488598108291626, "logits/rejected": -2.467776298522949, "logps/chosen": -114.04997253417969, "logps/rejected": -132.52149963378906, "loss": 0.6225, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.611980140209198, "rewards/margins": 0.1971258819103241, "rewards/rejected": -0.8091060519218445, "step": 9200 }, { "epoch": 1.5851137146795313, "eval_logits/chosen": -2.5781242847442627, "eval_logits/rejected": -2.5714316368103027, "eval_logps/chosen": -106.330322265625, "eval_logps/rejected": -120.6733169555664, "eval_loss": 0.6562556624412537, "eval_rewards/accuracies": 0.6189591288566589, "eval_rewards/chosen": -0.47618430852890015, "eval_rewards/margins": 0.09874764084815979, "eval_rewards/rejected": -0.5749318599700928, "eval_runtime": 359.8588, "eval_samples_per_second": 11.96, "eval_steps_per_second": 1.495, "step": 9200 }, { "epoch": 1.5868366643694003, "grad_norm": 11.177249908447266, "learning_rate": 1.2446658050513792e-08, "logits/chosen": -2.3836617469787598, "logits/rejected": -2.359231948852539, "logps/chosen": -117.04586029052734, "logps/rejected": -128.28465270996094, "loss": 0.6418, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6119889616966248, "rewards/margins": 0.15295186638832092, "rewards/rejected": -0.7649407982826233, "step": 9210 }, { "epoch": 1.5885596140592695, "grad_norm": 14.950928688049316, "learning_rate": 1.2347557313459355e-08, "logits/chosen": -2.4695205688476562, "logits/rejected": -2.447995662689209, "logps/chosen": -107.27723693847656, "logps/rejected": -122.84378814697266, "loss": 0.636, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5726373791694641, "rewards/margins": 0.1603108048439026, "rewards/rejected": -0.7329481840133667, "step": 9220 }, { "epoch": 1.5902825637491387, "grad_norm": 9.63260269165039, "learning_rate": 1.2248797070343308e-08, "logits/chosen": -2.396231174468994, "logits/rejected": -2.3813791275024414, "logps/chosen": -105.32853698730469, "logps/rejected": -126.0599136352539, "loss": 0.6272, "rewards/accuracies": 0.65625, "rewards/chosen": -0.560218334197998, "rewards/margins": 0.171835258603096, "rewards/rejected": -0.7320536375045776, "step": 9230 }, { "epoch": 1.5920055134390076, "grad_norm": 10.700993537902832, "learning_rate": 1.2150378214262118e-08, "logits/chosen": -2.4350173473358154, "logits/rejected": -2.4298243522644043, "logps/chosen": -114.66731262207031, "logps/rejected": -133.24984741210938, "loss": 0.6242, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5851711630821228, "rewards/margins": 0.20138947665691376, "rewards/rejected": -0.7865606546401978, "step": 9240 }, { "epoch": 1.5937284631288766, "grad_norm": 9.717818260192871, "learning_rate": 1.2052301635225087e-08, "logits/chosen": -2.4408679008483887, "logits/rejected": -2.417024612426758, "logps/chosen": -109.55340576171875, "logps/rejected": -125.67057800292969, "loss": 0.6361, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5476741790771484, "rewards/margins": 0.16295097768306732, "rewards/rejected": -0.710625171661377, "step": 9250 }, { "epoch": 1.5954514128187456, "grad_norm": 10.486372947692871, "learning_rate": 1.1954568220146272e-08, "logits/chosen": -2.4232518672943115, "logits/rejected": -2.4059088230133057, "logps/chosen": -109.2516860961914, "logps/rejected": -126.35398864746094, "loss": 0.6336, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6118893623352051, "rewards/margins": 0.16174429655075073, "rewards/rejected": -0.773633599281311, "step": 9260 }, { "epoch": 1.5971743625086148, "grad_norm": 8.578853607177734, "learning_rate": 1.1857178852836468e-08, "logits/chosen": -2.5483627319335938, "logits/rejected": -2.517956018447876, "logps/chosen": -115.4470443725586, "logps/rejected": -127.4085464477539, "loss": 0.6402, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6055377721786499, "rewards/margins": 0.1504029929637909, "rewards/rejected": -0.7559407353401184, "step": 9270 }, { "epoch": 1.598897312198484, "grad_norm": 10.61180305480957, "learning_rate": 1.1760134413995222e-08, "logits/chosen": -2.4837899208068848, "logits/rejected": -2.4677019119262695, "logps/chosen": -109.03898620605469, "logps/rejected": -128.00949096679688, "loss": 0.626, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5546872019767761, "rewards/margins": 0.18747368454933167, "rewards/rejected": -0.742160975933075, "step": 9280 }, { "epoch": 1.600620261888353, "grad_norm": 9.662049293518066, "learning_rate": 1.1663435781202868e-08, "logits/chosen": -2.380941867828369, "logits/rejected": -2.360813617706299, "logps/chosen": -109.13102722167969, "logps/rejected": -119.53753662109375, "loss": 0.6433, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5447606444358826, "rewards/margins": 0.14195260405540466, "rewards/rejected": -0.6867132186889648, "step": 9290 }, { "epoch": 1.602343211578222, "grad_norm": 12.733745574951172, "learning_rate": 1.15670838289126e-08, "logits/chosen": -2.4522814750671387, "logits/rejected": -2.4458775520324707, "logps/chosen": -116.55436706542969, "logps/rejected": -138.33456420898438, "loss": 0.6223, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6287103891372681, "rewards/margins": 0.195957213640213, "rewards/rejected": -0.8246676325798035, "step": 9300 }, { "epoch": 1.602343211578222, "eval_logits/chosen": -2.575881004333496, "eval_logits/rejected": -2.56917405128479, "eval_logps/chosen": -106.3382797241211, "eval_logps/rejected": -120.71073150634766, "eval_loss": 0.6561717391014099, "eval_rewards/accuracies": 0.6180297136306763, "eval_rewards/chosen": -0.4762639105319977, "eval_rewards/margins": 0.09904211759567261, "eval_rewards/rejected": -0.5753059983253479, "eval_runtime": 359.8227, "eval_samples_per_second": 11.961, "eval_steps_per_second": 1.495, "step": 9300 }, { "epoch": 1.6040661612680909, "grad_norm": 13.567085266113281, "learning_rate": 1.1471079428442499e-08, "logits/chosen": -2.473120927810669, "logits/rejected": -2.4475350379943848, "logps/chosen": -115.34141540527344, "logps/rejected": -127.76911926269531, "loss": 0.6353, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6159577965736389, "rewards/margins": 0.1579708755016327, "rewards/rejected": -0.7739286422729492, "step": 9310 }, { "epoch": 1.60578911095796, "grad_norm": 9.380661964416504, "learning_rate": 1.1375423447967814e-08, "logits/chosen": -2.528146266937256, "logits/rejected": -2.5089707374572754, "logps/chosen": -114.348876953125, "logps/rejected": -136.70558166503906, "loss": 0.6247, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6351672410964966, "rewards/margins": 0.187762051820755, "rewards/rejected": -0.8229292631149292, "step": 9320 }, { "epoch": 1.607512060647829, "grad_norm": 12.360684394836426, "learning_rate": 1.1280116752512875e-08, "logits/chosen": -2.489619016647339, "logits/rejected": -2.464404582977295, "logps/chosen": -118.02415466308594, "logps/rejected": -136.76522827148438, "loss": 0.6226, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6144608855247498, "rewards/margins": 0.20259025692939758, "rewards/rejected": -0.8170512318611145, "step": 9330 }, { "epoch": 1.6092350103376982, "grad_norm": 8.799599647521973, "learning_rate": 1.1185160203943528e-08, "logits/chosen": -2.4144415855407715, "logits/rejected": -2.4025747776031494, "logps/chosen": -114.1600112915039, "logps/rejected": -132.9754180908203, "loss": 0.6244, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6235083341598511, "rewards/margins": 0.1903582364320755, "rewards/rejected": -0.8138664960861206, "step": 9340 }, { "epoch": 1.6109579600275672, "grad_norm": 34.12635040283203, "learning_rate": 1.1090554660959117e-08, "logits/chosen": -2.479396104812622, "logits/rejected": -2.450967311859131, "logps/chosen": -109.45731353759766, "logps/rejected": -124.7352294921875, "loss": 0.6427, "rewards/accuracies": 0.71875, "rewards/chosen": -0.569072425365448, "rewards/margins": 0.15909281373023987, "rewards/rejected": -0.7281652688980103, "step": 9350 }, { "epoch": 1.6126809097174362, "grad_norm": 10.003735542297363, "learning_rate": 1.0996300979084855e-08, "logits/chosen": -2.536195755004883, "logits/rejected": -2.5000789165496826, "logps/chosen": -115.14347839355469, "logps/rejected": -134.09971618652344, "loss": 0.6073, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5846112966537476, "rewards/margins": 0.2306203842163086, "rewards/rejected": -0.8152316808700562, "step": 9360 }, { "epoch": 1.6144038594073054, "grad_norm": 13.890110969543457, "learning_rate": 1.0902400010664053e-08, "logits/chosen": -2.5462453365325928, "logits/rejected": -2.5133166313171387, "logps/chosen": -111.9314956665039, "logps/rejected": -126.63999938964844, "loss": 0.6309, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5809223055839539, "rewards/margins": 0.17146947979927063, "rewards/rejected": -0.7523918151855469, "step": 9370 }, { "epoch": 1.6161268090971743, "grad_norm": 11.433756828308105, "learning_rate": 1.0808852604850399e-08, "logits/chosen": -2.5420687198638916, "logits/rejected": -2.526165723800659, "logps/chosen": -109.38885498046875, "logps/rejected": -125.33390045166016, "loss": 0.6335, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5498009920120239, "rewards/margins": 0.16597667336463928, "rewards/rejected": -0.7157777547836304, "step": 9380 }, { "epoch": 1.6178497587870435, "grad_norm": 10.514060974121094, "learning_rate": 1.0715659607600275e-08, "logits/chosen": -2.346153736114502, "logits/rejected": -2.331106424331665, "logps/chosen": -112.1683349609375, "logps/rejected": -132.49806213378906, "loss": 0.6226, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5906833410263062, "rewards/margins": 0.1861725151538849, "rewards/rejected": -0.7768558263778687, "step": 9390 }, { "epoch": 1.6195727084769125, "grad_norm": 8.785523414611816, "learning_rate": 1.0622821861665148e-08, "logits/chosen": -2.4343514442443848, "logits/rejected": -2.41630482673645, "logps/chosen": -111.20463562011719, "logps/rejected": -128.87466430664062, "loss": 0.6288, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5713671445846558, "rewards/margins": 0.17466680705547333, "rewards/rejected": -0.7460339665412903, "step": 9400 }, { "epoch": 1.6195727084769125, "eval_logits/chosen": -2.5731117725372314, "eval_logits/rejected": -2.5664474964141846, "eval_logps/chosen": -106.89212036132812, "eval_logps/rejected": -121.37098693847656, "eval_loss": 0.6558800935745239, "eval_rewards/accuracies": 0.6201208233833313, "eval_rewards/chosen": -0.48180222511291504, "eval_rewards/margins": 0.10010644048452377, "eval_rewards/rejected": -0.581908643245697, "eval_runtime": 359.1559, "eval_samples_per_second": 11.984, "eval_steps_per_second": 1.498, "step": 9400 }, { "epoch": 1.6212956581667815, "grad_norm": 11.417981147766113, "learning_rate": 1.0530340206583904e-08, "logits/chosen": -2.4483437538146973, "logits/rejected": -2.4333364963531494, "logps/chosen": -113.51603698730469, "logps/rejected": -127.11344909667969, "loss": 0.6412, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5964230298995972, "rewards/margins": 0.15341778099536896, "rewards/rejected": -0.7498408555984497, "step": 9410 }, { "epoch": 1.6230186078566504, "grad_norm": 14.008613586425781, "learning_rate": 1.0438215478675232e-08, "logits/chosen": -2.475764513015747, "logits/rejected": -2.458162307739258, "logps/chosen": -119.61146545410156, "logps/rejected": -127.99200439453125, "loss": 0.6564, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6457251310348511, "rewards/margins": 0.11228646337985992, "rewards/rejected": -0.7580116391181946, "step": 9420 }, { "epoch": 1.6247415575465196, "grad_norm": 9.169174194335938, "learning_rate": 1.0346448511030198e-08, "logits/chosen": -2.4436872005462646, "logits/rejected": -2.430039167404175, "logps/chosen": -114.7473373413086, "logps/rejected": -130.70941162109375, "loss": 0.6372, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6231524348258972, "rewards/margins": 0.19460538029670715, "rewards/rejected": -0.8177579045295715, "step": 9430 }, { "epoch": 1.6264645072363888, "grad_norm": 10.711912155151367, "learning_rate": 1.0255040133504512e-08, "logits/chosen": -2.432291269302368, "logits/rejected": -2.425797939300537, "logps/chosen": -110.25630187988281, "logps/rejected": -127.0741195678711, "loss": 0.6458, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6062155961990356, "rewards/margins": 0.13991376757621765, "rewards/rejected": -0.7461293339729309, "step": 9440 }, { "epoch": 1.6281874569262578, "grad_norm": 11.031365394592285, "learning_rate": 1.0163991172711184e-08, "logits/chosen": -2.5792200565338135, "logits/rejected": -2.5714049339294434, "logps/chosen": -119.13470458984375, "logps/rejected": -127.2987289428711, "loss": 0.6619, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6574575304985046, "rewards/margins": 0.10914991796016693, "rewards/rejected": -0.7666074633598328, "step": 9450 }, { "epoch": 1.6299104066161267, "grad_norm": 9.260765075683594, "learning_rate": 1.0073302452012977e-08, "logits/chosen": -2.437150001525879, "logits/rejected": -2.4315149784088135, "logps/chosen": -106.49686431884766, "logps/rejected": -139.8819580078125, "loss": 0.5715, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5424596667289734, "rewards/margins": 0.31610342860221863, "rewards/rejected": -0.8585631251335144, "step": 9460 }, { "epoch": 1.6316333563059957, "grad_norm": 10.981246948242188, "learning_rate": 9.98297479151498e-09, "logits/chosen": -2.312954902648926, "logits/rejected": -2.284334659576416, "logps/chosen": -113.22406005859375, "logps/rejected": -122.59544372558594, "loss": 0.6581, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6057893633842468, "rewards/margins": 0.11547912657260895, "rewards/rejected": -0.7212685346603394, "step": 9470 }, { "epoch": 1.633356305995865, "grad_norm": 10.705116271972656, "learning_rate": 9.89300900805718e-09, "logits/chosen": -2.441366672515869, "logits/rejected": -2.409208297729492, "logps/chosen": -115.3991470336914, "logps/rejected": -131.6602020263672, "loss": 0.628, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6140663027763367, "rewards/margins": 0.1759718358516693, "rewards/rejected": -0.7900381088256836, "step": 9480 }, { "epoch": 1.635079255685734, "grad_norm": 12.51076602935791, "learning_rate": 9.80340591520708e-09, "logits/chosen": -2.4258036613464355, "logits/rejected": -2.40817928314209, "logps/chosen": -114.81951904296875, "logps/rejected": -129.5521240234375, "loss": 0.636, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6070705652236938, "rewards/margins": 0.16568514704704285, "rewards/rejected": -0.7727557420730591, "step": 9490 }, { "epoch": 1.636802205375603, "grad_norm": 10.093734741210938, "learning_rate": 9.714166323252348e-09, "logits/chosen": -2.4748165607452393, "logits/rejected": -2.4608230590820312, "logps/chosen": -110.67747497558594, "logps/rejected": -130.24691772460938, "loss": 0.6223, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5703105926513672, "rewards/margins": 0.18143048882484436, "rewards/rejected": -0.7517410516738892, "step": 9500 }, { "epoch": 1.636802205375603, "eval_logits/chosen": -2.571709632873535, "eval_logits/rejected": -2.564974308013916, "eval_logps/chosen": -106.93737030029297, "eval_logps/rejected": -121.4600830078125, "eval_loss": 0.6557111144065857, "eval_rewards/accuracies": 0.6175650358200073, "eval_rewards/chosen": -0.4822547733783722, "eval_rewards/margins": 0.10054484009742737, "eval_rewards/rejected": -0.5827996134757996, "eval_runtime": 359.7295, "eval_samples_per_second": 11.965, "eval_steps_per_second": 1.496, "step": 9500 }, { "epoch": 1.638525155065472, "grad_norm": 10.927206039428711, "learning_rate": 9.625291039193495e-09, "logits/chosen": -2.389148712158203, "logits/rejected": -2.3725249767303467, "logps/chosen": -108.08433532714844, "logps/rejected": -132.2962188720703, "loss": 0.6147, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5720130205154419, "rewards/margins": 0.21152958273887634, "rewards/rejected": -0.7835426330566406, "step": 9510 }, { "epoch": 1.640248104755341, "grad_norm": 8.823415756225586, "learning_rate": 9.536780866736544e-09, "logits/chosen": -2.5889158248901367, "logits/rejected": -2.5745842456817627, "logps/chosen": -111.9592056274414, "logps/rejected": -139.13119506835938, "loss": 0.612, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5962141156196594, "rewards/margins": 0.21659378707408905, "rewards/rejected": -0.8128078579902649, "step": 9520 }, { "epoch": 1.6419710544452102, "grad_norm": 13.857977867126465, "learning_rate": 9.44863660628582e-09, "logits/chosen": -2.494685649871826, "logits/rejected": -2.4717154502868652, "logps/chosen": -112.91267395019531, "logps/rejected": -134.03042602539062, "loss": 0.6056, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6001669764518738, "rewards/margins": 0.2312859743833542, "rewards/rejected": -0.8314529657363892, "step": 9530 }, { "epoch": 1.6436940041350794, "grad_norm": 9.767173767089844, "learning_rate": 9.360859054936621e-09, "logits/chosen": -2.4332919120788574, "logits/rejected": -2.4117319583892822, "logps/chosen": -113.34354400634766, "logps/rejected": -131.25839233398438, "loss": 0.6263, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5922496318817139, "rewards/margins": 0.1862720549106598, "rewards/rejected": -0.7785216569900513, "step": 9540 }, { "epoch": 1.6454169538249483, "grad_norm": 10.432958602905273, "learning_rate": 9.273449006468148e-09, "logits/chosen": -2.4807279109954834, "logits/rejected": -2.4802868366241455, "logps/chosen": -109.90055847167969, "logps/rejected": -132.210693359375, "loss": 0.6246, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5667723417282104, "rewards/margins": 0.18738147616386414, "rewards/rejected": -0.754153847694397, "step": 9550 }, { "epoch": 1.6471399035148173, "grad_norm": 13.059929847717285, "learning_rate": 9.186407251336153e-09, "logits/chosen": -2.539731025695801, "logits/rejected": -2.5303072929382324, "logps/chosen": -112.74991607666016, "logps/rejected": -132.66310119628906, "loss": 0.628, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5912712812423706, "rewards/margins": 0.20478937029838562, "rewards/rejected": -0.7960606813430786, "step": 9560 }, { "epoch": 1.6488628532046863, "grad_norm": 10.162718772888184, "learning_rate": 9.099734576665975e-09, "logits/chosen": -2.4429385662078857, "logits/rejected": -2.4294652938842773, "logps/chosen": -110.95243072509766, "logps/rejected": -128.51576232910156, "loss": 0.6345, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6070041060447693, "rewards/margins": 0.17155200242996216, "rewards/rejected": -0.7785560488700867, "step": 9570 }, { "epoch": 1.6505858028945555, "grad_norm": 10.574372291564941, "learning_rate": 9.013431766245255e-09, "logits/chosen": -2.5211551189422607, "logits/rejected": -2.503304958343506, "logps/chosen": -115.49881744384766, "logps/rejected": -136.0047607421875, "loss": 0.6301, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6101093888282776, "rewards/margins": 0.18049034476280212, "rewards/rejected": -0.7905997037887573, "step": 9580 }, { "epoch": 1.6523087525844247, "grad_norm": 9.696565628051758, "learning_rate": 8.927499600516958e-09, "logits/chosen": -2.594769239425659, "logits/rejected": -2.577392101287842, "logps/chosen": -116.54072570800781, "logps/rejected": -130.11834716796875, "loss": 0.6505, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6044396758079529, "rewards/margins": 0.13363295793533325, "rewards/rejected": -0.7380726933479309, "step": 9590 }, { "epoch": 1.6540317022742936, "grad_norm": 10.850507736206055, "learning_rate": 8.841938856572278e-09, "logits/chosen": -2.429518699645996, "logits/rejected": -2.399512529373169, "logps/chosen": -116.4022445678711, "logps/rejected": -130.88446044921875, "loss": 0.6363, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6134180426597595, "rewards/margins": 0.17180520296096802, "rewards/rejected": -0.7852233052253723, "step": 9600 }, { "epoch": 1.6540317022742936, "eval_logits/chosen": -2.5682740211486816, "eval_logits/rejected": -2.5615086555480957, "eval_logps/chosen": -107.62434387207031, "eval_logps/rejected": -122.20415496826172, "eval_loss": 0.6556257009506226, "eval_rewards/accuracies": 0.6196561455726624, "eval_rewards/chosen": -0.4891245663166046, "eval_rewards/margins": 0.10111591964960098, "eval_rewards/rejected": -0.590240478515625, "eval_runtime": 359.5427, "eval_samples_per_second": 11.971, "eval_steps_per_second": 1.496, "step": 9600 }, { "epoch": 1.6557546519641626, "grad_norm": 11.242480278015137, "learning_rate": 8.756750308143613e-09, "logits/chosen": -2.4543559551239014, "logits/rejected": -2.4368574619293213, "logps/chosen": -111.0262680053711, "logps/rejected": -129.50485229492188, "loss": 0.6185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5471795797348022, "rewards/margins": 0.19828134775161743, "rewards/rejected": -0.7454609274864197, "step": 9610 }, { "epoch": 1.6574776016540316, "grad_norm": 10.200096130371094, "learning_rate": 8.671934725597574e-09, "logits/chosen": -2.5679984092712402, "logits/rejected": -2.5582830905914307, "logps/chosen": -116.09355163574219, "logps/rejected": -129.18531799316406, "loss": 0.6516, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6150714755058289, "rewards/margins": 0.12277498096227646, "rewards/rejected": -0.7378464937210083, "step": 9620 }, { "epoch": 1.6592005513439008, "grad_norm": 14.261276245117188, "learning_rate": 8.587492875927965e-09, "logits/chosen": -2.480210781097412, "logits/rejected": -2.454237461090088, "logps/chosen": -119.82755279541016, "logps/rejected": -134.42041015625, "loss": 0.6343, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6436928510665894, "rewards/margins": 0.1701437532901764, "rewards/rejected": -0.8138366937637329, "step": 9630 }, { "epoch": 1.66092350103377, "grad_norm": 10.625503540039062, "learning_rate": 8.503425522748997e-09, "logits/chosen": -2.5318641662597656, "logits/rejected": -2.5037484169006348, "logps/chosen": -117.7755355834961, "logps/rejected": -127.9942398071289, "loss": 0.6371, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6177867650985718, "rewards/margins": 0.1568756103515625, "rewards/rejected": -0.7746623754501343, "step": 9640 }, { "epoch": 1.662646450723639, "grad_norm": 10.061976432800293, "learning_rate": 8.419733426288155e-09, "logits/chosen": -2.419862747192383, "logits/rejected": -2.39957332611084, "logps/chosen": -118.35591888427734, "logps/rejected": -129.8536834716797, "loss": 0.652, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6287807822227478, "rewards/margins": 0.12983281910419464, "rewards/rejected": -0.758613646030426, "step": 9650 }, { "epoch": 1.664369400413508, "grad_norm": 9.312915802001953, "learning_rate": 8.336417343379565e-09, "logits/chosen": -2.436915636062622, "logits/rejected": -2.430359125137329, "logps/chosen": -110.80461120605469, "logps/rejected": -132.95384216308594, "loss": 0.6305, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6022011041641235, "rewards/margins": 0.19084128737449646, "rewards/rejected": -0.7930424809455872, "step": 9660 }, { "epoch": 1.6660923501033769, "grad_norm": 11.565176963806152, "learning_rate": 8.253478027456945e-09, "logits/chosen": -2.5210142135620117, "logits/rejected": -2.495251178741455, "logps/chosen": -113.76652526855469, "logps/rejected": -131.93850708007812, "loss": 0.6148, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5764992833137512, "rewards/margins": 0.2094554603099823, "rewards/rejected": -0.7859547138214111, "step": 9670 }, { "epoch": 1.667815299793246, "grad_norm": 9.607332229614258, "learning_rate": 8.170916228546925e-09, "logits/chosen": -2.476175546646118, "logits/rejected": -2.4628398418426514, "logps/chosen": -113.82244873046875, "logps/rejected": -122.46971130371094, "loss": 0.6612, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5935350060462952, "rewards/margins": 0.10920468717813492, "rewards/rejected": -0.7027397155761719, "step": 9680 }, { "epoch": 1.6695382494831152, "grad_norm": 12.794886589050293, "learning_rate": 8.088732693262213e-09, "logits/chosen": -2.3811025619506836, "logits/rejected": -2.364696741104126, "logps/chosen": -117.9845962524414, "logps/rejected": -130.56321716308594, "loss": 0.6451, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6215648651123047, "rewards/margins": 0.13795089721679688, "rewards/rejected": -0.7595157027244568, "step": 9690 }, { "epoch": 1.6712611991729842, "grad_norm": 11.55554485321045, "learning_rate": 8.006928164794841e-09, "logits/chosen": -2.440767288208008, "logits/rejected": -2.4117016792297363, "logps/chosen": -112.40250396728516, "logps/rejected": -129.28512573242188, "loss": 0.6355, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5969595909118652, "rewards/margins": 0.1731598973274231, "rewards/rejected": -0.7701194882392883, "step": 9700 }, { "epoch": 1.6712611991729842, "eval_logits/chosen": -2.567650556564331, "eval_logits/rejected": -2.560908555984497, "eval_logps/chosen": -107.51300048828125, "eval_logps/rejected": -122.10337829589844, "eval_loss": 0.6555582880973816, "eval_rewards/accuracies": 0.6210501790046692, "eval_rewards/chosen": -0.48801106214523315, "eval_rewards/margins": 0.10122145712375641, "eval_rewards/rejected": -0.5892325639724731, "eval_runtime": 359.6285, "eval_samples_per_second": 11.968, "eval_steps_per_second": 1.496, "step": 9700 }, { "epoch": 1.6729841488628532, "grad_norm": 12.837274551391602, "learning_rate": 7.925503382909459e-09, "logits/chosen": -2.3130369186401367, "logits/rejected": -2.299406051635742, "logps/chosen": -115.46980285644531, "logps/rejected": -129.88174438476562, "loss": 0.644, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6184121370315552, "rewards/margins": 0.16119442880153656, "rewards/rejected": -0.7796066999435425, "step": 9710 }, { "epoch": 1.6747070985527222, "grad_norm": 11.885695457458496, "learning_rate": 7.844459083936644e-09, "logits/chosen": -2.4749066829681396, "logits/rejected": -2.461320400238037, "logps/chosen": -114.23624420166016, "logps/rejected": -131.29092407226562, "loss": 0.6259, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6087769865989685, "rewards/margins": 0.1906432807445526, "rewards/rejected": -0.7994202971458435, "step": 9720 }, { "epoch": 1.6764300482425913, "grad_norm": 9.85313606262207, "learning_rate": 7.763796000766231e-09, "logits/chosen": -2.4554290771484375, "logits/rejected": -2.4421284198760986, "logps/chosen": -112.19117736816406, "logps/rejected": -133.25039672851562, "loss": 0.6204, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5845453143119812, "rewards/margins": 0.20357480645179749, "rewards/rejected": -0.7881200909614563, "step": 9730 }, { "epoch": 1.6781529979324605, "grad_norm": 10.972027778625488, "learning_rate": 7.683514862840701e-09, "logits/chosen": -2.4811558723449707, "logits/rejected": -2.463585138320923, "logps/chosen": -116.2174072265625, "logps/rejected": -129.1941680908203, "loss": 0.6451, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6174328327178955, "rewards/margins": 0.14185911417007446, "rewards/rejected": -0.7592920064926147, "step": 9740 }, { "epoch": 1.6798759476223295, "grad_norm": 9.66303539276123, "learning_rate": 7.603616396148533e-09, "logits/chosen": -2.4816958904266357, "logits/rejected": -2.4432339668273926, "logps/chosen": -117.9178695678711, "logps/rejected": -127.9693603515625, "loss": 0.6382, "rewards/accuracies": 0.625, "rewards/chosen": -0.627899706363678, "rewards/margins": 0.15825578570365906, "rewards/rejected": -0.7861555814743042, "step": 9750 }, { "epoch": 1.6815988973121985, "grad_norm": 9.837355613708496, "learning_rate": 7.524101323217763e-09, "logits/chosen": -2.4979424476623535, "logits/rejected": -2.4742271900177, "logps/chosen": -114.5103759765625, "logps/rejected": -128.76182556152344, "loss": 0.6449, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6132702231407166, "rewards/margins": 0.1615351140499115, "rewards/rejected": -0.7748053073883057, "step": 9760 }, { "epoch": 1.6833218470020674, "grad_norm": 10.263019561767578, "learning_rate": 7.4449703631092596e-09, "logits/chosen": -2.4073855876922607, "logits/rejected": -2.388444423675537, "logps/chosen": -114.07319641113281, "logps/rejected": -130.03860473632812, "loss": 0.6368, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5996518135070801, "rewards/margins": 0.16121190786361694, "rewards/rejected": -0.7608636617660522, "step": 9770 }, { "epoch": 1.6850447966919366, "grad_norm": 8.965226173400879, "learning_rate": 7.366224231410451e-09, "logits/chosen": -2.4908275604248047, "logits/rejected": -2.44807767868042, "logps/chosen": -120.42997741699219, "logps/rejected": -132.4361114501953, "loss": 0.6172, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5954740643501282, "rewards/margins": 0.21281616389751434, "rewards/rejected": -0.8082901835441589, "step": 9780 }, { "epoch": 1.6867677463818056, "grad_norm": 9.79748249053955, "learning_rate": 7.28786364022862e-09, "logits/chosen": -2.462313175201416, "logits/rejected": -2.4611024856567383, "logps/chosen": -112.70414733886719, "logps/rejected": -141.95468139648438, "loss": 0.6079, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.592925488948822, "rewards/margins": 0.25078535079956055, "rewards/rejected": -0.8437107801437378, "step": 9790 }, { "epoch": 1.6884906960716748, "grad_norm": 11.095004081726074, "learning_rate": 7.209889298184646e-09, "logits/chosen": -2.490950107574463, "logits/rejected": -2.457878351211548, "logps/chosen": -117.1009750366211, "logps/rejected": -133.694580078125, "loss": 0.6247, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6029871702194214, "rewards/margins": 0.19695623219013214, "rewards/rejected": -0.7999434471130371, "step": 9800 }, { "epoch": 1.6884906960716748, "eval_logits/chosen": -2.5670363903045654, "eval_logits/rejected": -2.5602853298187256, "eval_logps/chosen": -107.6542739868164, "eval_logps/rejected": -122.27554321289062, "eval_loss": 0.6554702520370483, "eval_rewards/accuracies": 0.6201208233833313, "eval_rewards/chosen": -0.4894237220287323, "eval_rewards/margins": 0.10153036564588547, "eval_rewards/rejected": -0.5909541249275208, "eval_runtime": 359.471, "eval_samples_per_second": 11.973, "eval_steps_per_second": 1.497, "step": 9800 }, { "epoch": 1.6902136457615438, "grad_norm": 10.662399291992188, "learning_rate": 7.132301910406502e-09, "logits/chosen": -2.425443649291992, "logits/rejected": -2.3938422203063965, "logps/chosen": -115.65562438964844, "logps/rejected": -134.46087646484375, "loss": 0.6255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6023647785186768, "rewards/margins": 0.2033398449420929, "rewards/rejected": -0.8057045936584473, "step": 9810 }, { "epoch": 1.6919365954514127, "grad_norm": 10.450873374938965, "learning_rate": 7.05510217852292e-09, "logits/chosen": -2.4725582599639893, "logits/rejected": -2.448711395263672, "logps/chosen": -112.17340087890625, "logps/rejected": -134.11328125, "loss": 0.6117, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5805162191390991, "rewards/margins": 0.220969557762146, "rewards/rejected": -0.8014858365058899, "step": 9820 }, { "epoch": 1.693659545141282, "grad_norm": 9.498969078063965, "learning_rate": 6.978290800657022e-09, "logits/chosen": -2.504911422729492, "logits/rejected": -2.498277425765991, "logps/chosen": -119.9706039428711, "logps/rejected": -136.05508422851562, "loss": 0.6471, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6198269128799438, "rewards/margins": 0.1672651469707489, "rewards/rejected": -0.7870920896530151, "step": 9830 }, { "epoch": 1.6953824948311509, "grad_norm": 8.868417739868164, "learning_rate": 6.901868471419981e-09, "logits/chosen": -2.44360613822937, "logits/rejected": -2.415085792541504, "logps/chosen": -119.94268798828125, "logps/rejected": -135.39608764648438, "loss": 0.6387, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6406481266021729, "rewards/margins": 0.17294269800186157, "rewards/rejected": -0.8135908842086792, "step": 9840 }, { "epoch": 1.69710544452102, "grad_norm": 10.25020694732666, "learning_rate": 6.825835881904846e-09, "logits/chosen": -2.458670139312744, "logits/rejected": -2.4489316940307617, "logps/chosen": -110.32945251464844, "logps/rejected": -124.38945007324219, "loss": 0.6304, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5572781562805176, "rewards/margins": 0.16997310519218445, "rewards/rejected": -0.7272512316703796, "step": 9850 }, { "epoch": 1.698828394210889, "grad_norm": 11.723773956298828, "learning_rate": 6.750193719680142e-09, "logits/chosen": -2.527556896209717, "logits/rejected": -2.5174648761749268, "logps/chosen": -122.32655334472656, "logps/rejected": -132.7868194580078, "loss": 0.656, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6600021123886108, "rewards/margins": 0.12902231514453888, "rewards/rejected": -0.7890244722366333, "step": 9860 }, { "epoch": 1.700551343900758, "grad_norm": 10.979412078857422, "learning_rate": 6.674942668783806e-09, "logits/chosen": -2.4933903217315674, "logits/rejected": -2.469564914703369, "logps/chosen": -112.51314544677734, "logps/rejected": -127.08055114746094, "loss": 0.6332, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5978198051452637, "rewards/margins": 0.1641312837600708, "rewards/rejected": -0.7619511485099792, "step": 9870 }, { "epoch": 1.7022742935906272, "grad_norm": 9.7615385055542, "learning_rate": 6.60008340971685e-09, "logits/chosen": -2.404886484146118, "logits/rejected": -2.3997762203216553, "logps/chosen": -111.34349060058594, "logps/rejected": -132.93601989746094, "loss": 0.6291, "rewards/accuracies": 0.65625, "rewards/chosen": -0.592779278755188, "rewards/margins": 0.1820637732744217, "rewards/rejected": -0.7748430967330933, "step": 9880 }, { "epoch": 1.7039972432804962, "grad_norm": 9.9425630569458, "learning_rate": 6.525616619437335e-09, "logits/chosen": -2.3720321655273438, "logits/rejected": -2.364936590194702, "logps/chosen": -113.4536361694336, "logps/rejected": -134.3558349609375, "loss": 0.6028, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5703598260879517, "rewards/margins": 0.23440685868263245, "rewards/rejected": -0.8047667741775513, "step": 9890 }, { "epoch": 1.7057201929703654, "grad_norm": 12.73387336730957, "learning_rate": 6.451542971354173e-09, "logits/chosen": -2.4607715606689453, "logits/rejected": -2.4324750900268555, "logps/chosen": -105.95125579833984, "logps/rejected": -132.67672729492188, "loss": 0.5826, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5424472689628601, "rewards/margins": 0.286365807056427, "rewards/rejected": -0.8288131952285767, "step": 9900 }, { "epoch": 1.7057201929703654, "eval_logits/chosen": -2.5658624172210693, "eval_logits/rejected": -2.559070348739624, "eval_logps/chosen": -107.81818389892578, "eval_logps/rejected": -122.47151184082031, "eval_loss": 0.6553860306739807, "eval_rewards/accuracies": 0.6205855011940002, "eval_rewards/chosen": -0.49106287956237793, "eval_rewards/margins": 0.1018509715795517, "eval_rewards/rejected": -0.5929138660430908, "eval_runtime": 359.8195, "eval_samples_per_second": 11.962, "eval_steps_per_second": 1.495, "step": 9900 }, { "epoch": 1.7074431426602343, "grad_norm": 8.75168514251709, "learning_rate": 6.377863135321066e-09, "logits/chosen": -2.5228099822998047, "logits/rejected": -2.481369733810425, "logps/chosen": -117.33937072753906, "logps/rejected": -129.14263916015625, "loss": 0.6249, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.597710907459259, "rewards/margins": 0.1872120201587677, "rewards/rejected": -0.7849228382110596, "step": 9910 }, { "epoch": 1.7091660923501033, "grad_norm": 10.73083209991455, "learning_rate": 6.304577777630454e-09, "logits/chosen": -2.523171901702881, "logits/rejected": -2.4971327781677246, "logps/chosen": -112.56683349609375, "logps/rejected": -128.8625030517578, "loss": 0.6182, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5987794399261475, "rewards/margins": 0.19654551148414612, "rewards/rejected": -0.7953248620033264, "step": 9920 }, { "epoch": 1.7108890420399723, "grad_norm": 13.223040580749512, "learning_rate": 6.231687561007459e-09, "logits/chosen": -2.4568839073181152, "logits/rejected": -2.452805280685425, "logps/chosen": -109.905517578125, "logps/rejected": -133.2655029296875, "loss": 0.6177, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5799885392189026, "rewards/margins": 0.2040647566318512, "rewards/rejected": -0.7840532660484314, "step": 9930 }, { "epoch": 1.7126119917298415, "grad_norm": 8.866291046142578, "learning_rate": 6.1591931446039306e-09, "logits/chosen": -2.533574104309082, "logits/rejected": -2.52690052986145, "logps/chosen": -114.55931091308594, "logps/rejected": -135.74856567382812, "loss": 0.6405, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5949760675430298, "rewards/margins": 0.15510234236717224, "rewards/rejected": -0.7500783205032349, "step": 9940 }, { "epoch": 1.7143349414197107, "grad_norm": 12.860427856445312, "learning_rate": 6.087095183992452e-09, "logits/chosen": -2.4990382194519043, "logits/rejected": -2.46634840965271, "logps/chosen": -114.99503326416016, "logps/rejected": -131.1891632080078, "loss": 0.6247, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6103070974349976, "rewards/margins": 0.1882888674736023, "rewards/rejected": -0.7985959649085999, "step": 9950 }, { "epoch": 1.7160578911095796, "grad_norm": 11.31727123260498, "learning_rate": 6.015394331160439e-09, "logits/chosen": -2.5300450325012207, "logits/rejected": -2.514608144760132, "logps/chosen": -118.2430419921875, "logps/rejected": -130.07740783691406, "loss": 0.6632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.648059606552124, "rewards/margins": 0.10662396997213364, "rewards/rejected": -0.7546836137771606, "step": 9960 }, { "epoch": 1.7177808407994486, "grad_norm": 11.905298233032227, "learning_rate": 5.944091234504228e-09, "logits/chosen": -2.4777989387512207, "logits/rejected": -2.4606761932373047, "logps/chosen": -110.2918472290039, "logps/rejected": -128.54547119140625, "loss": 0.6299, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5893170237541199, "rewards/margins": 0.17843811213970184, "rewards/rejected": -0.7677551507949829, "step": 9970 }, { "epoch": 1.7195037904893176, "grad_norm": 11.689369201660156, "learning_rate": 5.8731865388231825e-09, "logits/chosen": -2.425647020339966, "logits/rejected": -2.413154125213623, "logps/chosen": -113.85538482666016, "logps/rejected": -134.4022216796875, "loss": 0.6234, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6105948090553284, "rewards/margins": 0.2016739547252655, "rewards/rejected": -0.8122687339782715, "step": 9980 }, { "epoch": 1.7212267401791868, "grad_norm": 12.944132804870605, "learning_rate": 5.802680885313971e-09, "logits/chosen": -2.508399486541748, "logits/rejected": -2.4662134647369385, "logps/chosen": -116.12748718261719, "logps/rejected": -125.13218688964844, "loss": 0.6538, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.62394779920578, "rewards/margins": 0.12821142375469208, "rewards/rejected": -0.7521592378616333, "step": 9990 }, { "epoch": 1.722949689869056, "grad_norm": 10.039288520812988, "learning_rate": 5.732574911564603e-09, "logits/chosen": -2.509500026702881, "logits/rejected": -2.491403818130493, "logps/chosen": -115.86087799072266, "logps/rejected": -136.30348205566406, "loss": 0.6181, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6011232137680054, "rewards/margins": 0.1922348439693451, "rewards/rejected": -0.7933580279350281, "step": 10000 }, { "epoch": 1.722949689869056, "eval_logits/chosen": -2.564741849899292, "eval_logits/rejected": -2.5579206943511963, "eval_logps/chosen": -107.93734741210938, "eval_logps/rejected": -122.6296157836914, "eval_loss": 0.6552858352661133, "eval_rewards/accuracies": 0.6203531622886658, "eval_rewards/chosen": -0.49225449562072754, "eval_rewards/margins": 0.1022404208779335, "eval_rewards/rejected": -0.5944948792457581, "eval_runtime": 359.7482, "eval_samples_per_second": 11.964, "eval_steps_per_second": 1.495, "step": 10000 }, { "epoch": 1.724672639558925, "grad_norm": 10.848592758178711, "learning_rate": 5.662869251548835e-09, "logits/chosen": -2.494889736175537, "logits/rejected": -2.4740841388702393, "logps/chosen": -110.259521484375, "logps/rejected": -134.33322143554688, "loss": 0.6036, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.578801691532135, "rewards/margins": 0.23675695061683655, "rewards/rejected": -0.8155585527420044, "step": 10010 }, { "epoch": 1.7263955892487939, "grad_norm": 11.91601848602295, "learning_rate": 5.5935645356202935e-09, "logits/chosen": -2.436588764190674, "logits/rejected": -2.4070498943328857, "logps/chosen": -116.73612976074219, "logps/rejected": -128.99179077148438, "loss": 0.6409, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6306353211402893, "rewards/margins": 0.14948755502700806, "rewards/rejected": -0.7801228761672974, "step": 10020 }, { "epoch": 1.7281185389386629, "grad_norm": 14.203158378601074, "learning_rate": 5.524661390506863e-09, "logits/chosen": -2.4671003818511963, "logits/rejected": -2.4386401176452637, "logps/chosen": -117.65501403808594, "logps/rejected": -125.408203125, "loss": 0.6486, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6026381254196167, "rewards/margins": 0.13013479113578796, "rewards/rejected": -0.732772946357727, "step": 10030 }, { "epoch": 1.729841488628532, "grad_norm": 8.88199234008789, "learning_rate": 5.456160439305007e-09, "logits/chosen": -2.599710464477539, "logits/rejected": -2.5841920375823975, "logps/chosen": -116.64871978759766, "logps/rejected": -134.38421630859375, "loss": 0.6312, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6180770993232727, "rewards/margins": 0.18140612542629242, "rewards/rejected": -0.7994831800460815, "step": 10040 }, { "epoch": 1.7315644383184012, "grad_norm": 14.16472053527832, "learning_rate": 5.388062301474072e-09, "logits/chosen": -2.4518356323242188, "logits/rejected": -2.449625015258789, "logps/chosen": -120.41426086425781, "logps/rejected": -131.92730712890625, "loss": 0.6666, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.6644247770309448, "rewards/margins": 0.09248615801334381, "rewards/rejected": -0.756911039352417, "step": 10050 }, { "epoch": 1.7332873880082702, "grad_norm": 11.537849426269531, "learning_rate": 5.320367592830799e-09, "logits/chosen": -2.4141101837158203, "logits/rejected": -2.4072766304016113, "logps/chosen": -108.069091796875, "logps/rejected": -130.06570434570312, "loss": 0.6295, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5933195948600769, "rewards/margins": 0.1834740787744522, "rewards/rejected": -0.7767936587333679, "step": 10060 }, { "epoch": 1.7350103376981392, "grad_norm": 10.482232093811035, "learning_rate": 5.253076925543609e-09, "logits/chosen": -2.386352062225342, "logits/rejected": -2.375824451446533, "logps/chosen": -112.65206146240234, "logps/rejected": -134.59097290039062, "loss": 0.6205, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6019715070724487, "rewards/margins": 0.2191394567489624, "rewards/rejected": -0.8211109042167664, "step": 10070 }, { "epoch": 1.7367332873880081, "grad_norm": 13.752614974975586, "learning_rate": 5.18619090812723e-09, "logits/chosen": -2.5031204223632812, "logits/rejected": -2.4887642860412598, "logps/chosen": -120.53104400634766, "logps/rejected": -133.3077392578125, "loss": 0.6471, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6399309039115906, "rewards/margins": 0.14228789508342743, "rewards/rejected": -0.7822188138961792, "step": 10080 }, { "epoch": 1.7384562370778773, "grad_norm": 9.386380195617676, "learning_rate": 5.1197101454370285e-09, "logits/chosen": -2.5299363136291504, "logits/rejected": -2.5103542804718018, "logps/chosen": -113.98204040527344, "logps/rejected": -136.5449981689453, "loss": 0.6228, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5960878729820251, "rewards/margins": 0.19220301508903503, "rewards/rejected": -0.7882908582687378, "step": 10090 }, { "epoch": 1.7401791867677465, "grad_norm": 12.28760051727295, "learning_rate": 5.0536352386636945e-09, "logits/chosen": -2.4863905906677246, "logits/rejected": -2.477647304534912, "logps/chosen": -114.24980163574219, "logps/rejected": -131.3005828857422, "loss": 0.6365, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5926389098167419, "rewards/margins": 0.1753116101026535, "rewards/rejected": -0.7679504752159119, "step": 10100 }, { "epoch": 1.7401791867677465, "eval_logits/chosen": -2.563511848449707, "eval_logits/rejected": -2.556731700897217, "eval_logps/chosen": -107.8778305053711, "eval_logps/rejected": -122.56346130371094, "eval_loss": 0.655293345451355, "eval_rewards/accuracies": 0.6201208233833313, "eval_rewards/chosen": -0.49165940284729004, "eval_rewards/margins": 0.10217391699552536, "eval_rewards/rejected": -0.5938332676887512, "eval_runtime": 359.4587, "eval_samples_per_second": 11.974, "eval_steps_per_second": 1.497, "step": 10100 }, { "epoch": 1.7419021364576155, "grad_norm": 12.70230484008789, "learning_rate": 4.9879667853276795e-09, "logits/chosen": -2.529376268386841, "logits/rejected": -2.533937454223633, "logps/chosen": -113.8519287109375, "logps/rejected": -137.6707000732422, "loss": 0.6241, "rewards/accuracies": 0.6875, "rewards/chosen": -0.632787823677063, "rewards/margins": 0.19746533036231995, "rewards/rejected": -0.830253005027771, "step": 10110 }, { "epoch": 1.7436250861474845, "grad_norm": 12.27637767791748, "learning_rate": 4.9227053792738615e-09, "logits/chosen": -2.4879720211029053, "logits/rejected": -2.4670259952545166, "logps/chosen": -114.29881286621094, "logps/rejected": -131.1550750732422, "loss": 0.6197, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6057515740394592, "rewards/margins": 0.19973024725914001, "rewards/rejected": -0.8054817914962769, "step": 10120 }, { "epoch": 1.7453480358373534, "grad_norm": 10.527959823608398, "learning_rate": 4.857851610666164e-09, "logits/chosen": -2.513655424118042, "logits/rejected": -2.4933128356933594, "logps/chosen": -109.53932189941406, "logps/rejected": -130.92755126953125, "loss": 0.6176, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5619297027587891, "rewards/margins": 0.2108466923236847, "rewards/rejected": -0.7727764844894409, "step": 10130 }, { "epoch": 1.7470709855272226, "grad_norm": 11.256921768188477, "learning_rate": 4.793406065982214e-09, "logits/chosen": -2.4904751777648926, "logits/rejected": -2.4507718086242676, "logps/chosen": -120.92155456542969, "logps/rejected": -133.06362915039062, "loss": 0.6289, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6049574613571167, "rewards/margins": 0.18600977957248688, "rewards/rejected": -0.7909671664237976, "step": 10140 }, { "epoch": 1.7487939352170918, "grad_norm": 10.61954116821289, "learning_rate": 4.729369328008032e-09, "logits/chosen": -2.437351703643799, "logits/rejected": -2.4185705184936523, "logps/chosen": -114.20558166503906, "logps/rejected": -133.32203674316406, "loss": 0.6249, "rewards/accuracies": 0.625, "rewards/chosen": -0.5883208513259888, "rewards/margins": 0.19086460769176483, "rewards/rejected": -0.77918541431427, "step": 10150 }, { "epoch": 1.7505168849069608, "grad_norm": 9.216764450073242, "learning_rate": 4.665741975832765e-09, "logits/chosen": -2.462484836578369, "logits/rejected": -2.446476697921753, "logps/chosen": -111.08219146728516, "logps/rejected": -131.91702270507812, "loss": 0.615, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.590103268623352, "rewards/margins": 0.20701715350151062, "rewards/rejected": -0.7971204519271851, "step": 10160 }, { "epoch": 1.7522398345968297, "grad_norm": 8.60057544708252, "learning_rate": 4.602524584843464e-09, "logits/chosen": -2.4618074893951416, "logits/rejected": -2.4513816833496094, "logps/chosen": -113.1440658569336, "logps/rejected": -131.1116485595703, "loss": 0.625, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5823419690132141, "rewards/margins": 0.18472710251808167, "rewards/rejected": -0.7670690417289734, "step": 10170 }, { "epoch": 1.7539627842866987, "grad_norm": 10.92617130279541, "learning_rate": 4.539717726719872e-09, "logits/chosen": -2.402416944503784, "logits/rejected": -2.375006914138794, "logps/chosen": -112.00247955322266, "logps/rejected": -135.15814208984375, "loss": 0.6077, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5956050157546997, "rewards/margins": 0.22287814319133759, "rewards/rejected": -0.8184831738471985, "step": 10180 }, { "epoch": 1.755685733976568, "grad_norm": 13.182476997375488, "learning_rate": 4.4773219694292155e-09, "logits/chosen": -2.4350714683532715, "logits/rejected": -2.4115071296691895, "logps/chosen": -106.44425201416016, "logps/rejected": -129.00108337402344, "loss": 0.6103, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5548610091209412, "rewards/margins": 0.22107870876789093, "rewards/rejected": -0.7759397625923157, "step": 10190 }, { "epoch": 1.757408683666437, "grad_norm": 10.107029914855957, "learning_rate": 4.415337877221164e-09, "logits/chosen": -2.5051677227020264, "logits/rejected": -2.4862966537475586, "logps/chosen": -112.60433197021484, "logps/rejected": -129.8455047607422, "loss": 0.6269, "rewards/accuracies": 0.65625, "rewards/chosen": -0.601836085319519, "rewards/margins": 0.1873604953289032, "rewards/rejected": -0.7891966104507446, "step": 10200 }, { "epoch": 1.757408683666437, "eval_logits/chosen": -2.5624215602874756, "eval_logits/rejected": -2.555640935897827, "eval_logps/chosen": -108.23209381103516, "eval_logps/rejected": -122.9497299194336, "eval_loss": 0.6552406549453735, "eval_rewards/accuracies": 0.6208178400993347, "eval_rewards/chosen": -0.4952020049095154, "eval_rewards/margins": 0.10249407589435577, "eval_rewards/rejected": -0.59769606590271, "eval_runtime": 359.3223, "eval_samples_per_second": 11.978, "eval_steps_per_second": 1.497, "step": 10200 }, { "epoch": 1.759131633356306, "grad_norm": 11.625716209411621, "learning_rate": 4.353766010622606e-09, "logits/chosen": -2.4938254356384277, "logits/rejected": -2.489194631576538, "logps/chosen": -119.16197204589844, "logps/rejected": -144.89639282226562, "loss": 0.6132, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6514904499053955, "rewards/margins": 0.22621452808380127, "rewards/rejected": -0.8777049779891968, "step": 10210 }, { "epoch": 1.760854583046175, "grad_norm": 11.112654685974121, "learning_rate": 4.2926069264327066e-09, "logits/chosen": -2.441725254058838, "logits/rejected": -2.435220241546631, "logps/chosen": -106.16336822509766, "logps/rejected": -126.24835205078125, "loss": 0.6301, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.577778697013855, "rewards/margins": 0.17347396910190582, "rewards/rejected": -0.7512526512145996, "step": 10220 }, { "epoch": 1.762577532736044, "grad_norm": 9.004151344299316, "learning_rate": 4.231861177717733e-09, "logits/chosen": -2.3886735439300537, "logits/rejected": -2.3622939586639404, "logps/chosen": -112.79914855957031, "logps/rejected": -125.49736022949219, "loss": 0.6467, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5935821533203125, "rewards/margins": 0.14280185103416443, "rewards/rejected": -0.7363839745521545, "step": 10230 }, { "epoch": 1.7643004824259132, "grad_norm": 10.80211353302002, "learning_rate": 4.17152931380621e-09, "logits/chosen": -2.434027910232544, "logits/rejected": -2.4118659496307373, "logps/chosen": -112.94795989990234, "logps/rejected": -137.31430053710938, "loss": 0.6096, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6149092316627502, "rewards/margins": 0.2345750778913498, "rewards/rejected": -0.8494843244552612, "step": 10240 }, { "epoch": 1.7660234321157822, "grad_norm": 11.60608959197998, "learning_rate": 4.111611880283794e-09, "logits/chosen": -2.487051010131836, "logits/rejected": -2.4494423866271973, "logps/chosen": -118.3154067993164, "logps/rejected": -131.3991241455078, "loss": 0.6198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.615398645401001, "rewards/margins": 0.20179829001426697, "rewards/rejected": -0.8171968460083008, "step": 10250 }, { "epoch": 1.7677463818056514, "grad_norm": 10.059064865112305, "learning_rate": 4.05210941898847e-09, "logits/chosen": -2.446291446685791, "logits/rejected": -2.448866605758667, "logps/chosen": -110.99190521240234, "logps/rejected": -132.66427612304688, "loss": 0.6233, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5772714614868164, "rewards/margins": 0.18064741790294647, "rewards/rejected": -0.7579189538955688, "step": 10260 }, { "epoch": 1.7694693314955203, "grad_norm": 10.58187198638916, "learning_rate": 3.993022468005575e-09, "logits/chosen": -2.4398512840270996, "logits/rejected": -2.4271864891052246, "logps/chosen": -119.28215026855469, "logps/rejected": -141.2032012939453, "loss": 0.6172, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6385139226913452, "rewards/margins": 0.21705254912376404, "rewards/rejected": -0.8555665016174316, "step": 10270 }, { "epoch": 1.7711922811853893, "grad_norm": 12.797431945800781, "learning_rate": 3.934351561662935e-09, "logits/chosen": -2.4263205528259277, "logits/rejected": -2.4106249809265137, "logps/chosen": -120.3524169921875, "logps/rejected": -128.5592803955078, "loss": 0.6645, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.6602329015731812, "rewards/margins": 0.09498941153287888, "rewards/rejected": -0.7552222609519958, "step": 10280 }, { "epoch": 1.7729152308752585, "grad_norm": 9.863990783691406, "learning_rate": 3.876097230526109e-09, "logits/chosen": -2.398904800415039, "logits/rejected": -2.3914239406585693, "logps/chosen": -106.53401947021484, "logps/rejected": -135.8355255126953, "loss": 0.6003, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5509772896766663, "rewards/margins": 0.2525397539138794, "rewards/rejected": -0.8035169839859009, "step": 10290 }, { "epoch": 1.7746381805651275, "grad_norm": 11.00568675994873, "learning_rate": 3.818260001393464e-09, "logits/chosen": -2.462790012359619, "logits/rejected": -2.4483377933502197, "logps/chosen": -118.48091125488281, "logps/rejected": -130.38723754882812, "loss": 0.6573, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6383308172225952, "rewards/margins": 0.12067344039678574, "rewards/rejected": -0.7590042948722839, "step": 10300 }, { "epoch": 1.7746381805651275, "eval_logits/chosen": -2.5610098838806152, "eval_logits/rejected": -2.5541930198669434, "eval_logps/chosen": -108.33472442626953, "eval_logps/rejected": -123.06450653076172, "eval_loss": 0.6552524566650391, "eval_rewards/accuracies": 0.6201208233833313, "eval_rewards/chosen": -0.49622830748558044, "eval_rewards/margins": 0.10261543095111847, "eval_rewards/rejected": -0.5988436937332153, "eval_runtime": 359.658, "eval_samples_per_second": 11.967, "eval_steps_per_second": 1.496, "step": 10300 }, { "epoch": 1.7763611302549966, "grad_norm": 10.546584129333496, "learning_rate": 3.760840397291548e-09, "logits/chosen": -2.39471697807312, "logits/rejected": -2.3739752769470215, "logps/chosen": -119.74263763427734, "logps/rejected": -134.30592346191406, "loss": 0.6227, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6347993016242981, "rewards/margins": 0.19372475147247314, "rewards/rejected": -0.8285239934921265, "step": 10310 }, { "epoch": 1.7780840799448656, "grad_norm": 10.581847190856934, "learning_rate": 3.7038389374702382e-09, "logits/chosen": -2.456726312637329, "logits/rejected": -2.427189350128174, "logps/chosen": -121.62674713134766, "logps/rejected": -133.1729736328125, "loss": 0.649, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6486510634422302, "rewards/margins": 0.14786870777606964, "rewards/rejected": -0.7965198755264282, "step": 10320 }, { "epoch": 1.7798070296347346, "grad_norm": 10.441367149353027, "learning_rate": 3.6472561373981305e-09, "logits/chosen": -2.4716782569885254, "logits/rejected": -2.4550414085388184, "logps/chosen": -115.976806640625, "logps/rejected": -135.39369201660156, "loss": 0.6275, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6383245587348938, "rewards/margins": 0.17045047879219055, "rewards/rejected": -0.8087749481201172, "step": 10330 }, { "epoch": 1.7815299793246038, "grad_norm": 10.83610725402832, "learning_rate": 3.5910925087578535e-09, "logits/chosen": -2.4928503036499023, "logits/rejected": -2.47050142288208, "logps/chosen": -115.1012191772461, "logps/rejected": -135.30702209472656, "loss": 0.6166, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5901371836662292, "rewards/margins": 0.21725019812583923, "rewards/rejected": -0.8073874711990356, "step": 10340 }, { "epoch": 1.7832529290144727, "grad_norm": 10.322022438049316, "learning_rate": 3.535348559441409e-09, "logits/chosen": -2.569450855255127, "logits/rejected": -2.545938730239868, "logps/chosen": -112.86637878417969, "logps/rejected": -129.04212951660156, "loss": 0.6243, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5878443121910095, "rewards/margins": 0.19202940165996552, "rewards/rejected": -0.7798737287521362, "step": 10350 }, { "epoch": 1.784975878704342, "grad_norm": 10.957462310791016, "learning_rate": 3.4800247935456383e-09, "logits/chosen": -2.4213013648986816, "logits/rejected": -2.390179395675659, "logps/chosen": -119.5207290649414, "logps/rejected": -128.385009765625, "loss": 0.6536, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6357388496398926, "rewards/margins": 0.12005865573883057, "rewards/rejected": -0.7557975053787231, "step": 10360 }, { "epoch": 1.786698828394211, "grad_norm": 10.56867790222168, "learning_rate": 3.425121711367607e-09, "logits/chosen": -2.305194139480591, "logits/rejected": -2.276939630508423, "logps/chosen": -111.28253173828125, "logps/rejected": -131.2523956298828, "loss": 0.6216, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5930512547492981, "rewards/margins": 0.19067077338695526, "rewards/rejected": -0.7837220430374146, "step": 10370 }, { "epoch": 1.7884217780840799, "grad_norm": 11.401347160339355, "learning_rate": 3.3706398094001167e-09, "logits/chosen": -2.448519229888916, "logits/rejected": -2.4372894763946533, "logps/chosen": -118.3085708618164, "logps/rejected": -128.80023193359375, "loss": 0.6673, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6642967462539673, "rewards/margins": 0.10065841674804688, "rewards/rejected": -0.7649552226066589, "step": 10380 }, { "epoch": 1.7901447277739488, "grad_norm": 10.860513687133789, "learning_rate": 3.3165795803272057e-09, "logits/chosen": -2.489330768585205, "logits/rejected": -2.463158130645752, "logps/chosen": -115.2964096069336, "logps/rejected": -130.6220703125, "loss": 0.6361, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5948722958564758, "rewards/margins": 0.17043809592723846, "rewards/rejected": -0.7653104066848755, "step": 10390 }, { "epoch": 1.791867677463818, "grad_norm": 12.60634994506836, "learning_rate": 3.2629415130196793e-09, "logits/chosen": -2.447645902633667, "logits/rejected": -2.430975914001465, "logps/chosen": -113.55403900146484, "logps/rejected": -140.37448120117188, "loss": 0.6036, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6141378283500671, "rewards/margins": 0.2365102469921112, "rewards/rejected": -0.850648045539856, "step": 10400 }, { "epoch": 1.791867677463818, "eval_logits/chosen": -2.560978412628174, "eval_logits/rejected": -2.554239273071289, "eval_logps/chosen": -108.23997497558594, "eval_logps/rejected": -122.97843933105469, "eval_loss": 0.6551907658576965, "eval_rewards/accuracies": 0.6196561455726624, "eval_rewards/chosen": -0.49528077244758606, "eval_rewards/margins": 0.10270221531391144, "eval_rewards/rejected": -0.5979831218719482, "eval_runtime": 359.6358, "eval_samples_per_second": 11.968, "eval_steps_per_second": 1.496, "step": 10400 }, { "epoch": 1.7935906271536872, "grad_norm": 10.753764152526855, "learning_rate": 3.2097260925307235e-09, "logits/chosen": -2.4378273487091064, "logits/rejected": -2.4052460193634033, "logps/chosen": -120.6829605102539, "logps/rejected": -132.27468872070312, "loss": 0.6423, "rewards/accuracies": 0.625, "rewards/chosen": -0.6416543126106262, "rewards/margins": 0.15033070743083954, "rewards/rejected": -0.791985034942627, "step": 10410 }, { "epoch": 1.7953135768435562, "grad_norm": 10.544516563415527, "learning_rate": 3.1569338000914656e-09, "logits/chosen": -2.458979368209839, "logits/rejected": -2.4329304695129395, "logps/chosen": -115.31663513183594, "logps/rejected": -133.2076416015625, "loss": 0.6337, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.611960768699646, "rewards/margins": 0.17886130511760712, "rewards/rejected": -0.7908221483230591, "step": 10420 }, { "epoch": 1.7970365265334252, "grad_norm": 9.884859085083008, "learning_rate": 3.1045651131066886e-09, "logits/chosen": -2.5237693786621094, "logits/rejected": -2.5072269439697266, "logps/chosen": -113.82945251464844, "logps/rejected": -126.14137268066406, "loss": 0.6464, "rewards/accuracies": 0.65625, "rewards/chosen": -0.58958899974823, "rewards/margins": 0.14127321541309357, "rewards/rejected": -0.7308622598648071, "step": 10430 }, { "epoch": 1.7987594762232941, "grad_norm": 11.38304615020752, "learning_rate": 3.0526205051504437e-09, "logits/chosen": -2.54156494140625, "logits/rejected": -2.532452344894409, "logps/chosen": -121.62736511230469, "logps/rejected": -132.5194549560547, "loss": 0.6554, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6375734210014343, "rewards/margins": 0.11469124257564545, "rewards/rejected": -0.752264678478241, "step": 10440 }, { "epoch": 1.8004824259131633, "grad_norm": 9.76710033416748, "learning_rate": 3.001100445961846e-09, "logits/chosen": -2.485647439956665, "logits/rejected": -2.4800970554351807, "logps/chosen": -114.85188293457031, "logps/rejected": -139.90597534179688, "loss": 0.6126, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6122986078262329, "rewards/margins": 0.2103799283504486, "rewards/rejected": -0.8226785659790039, "step": 10450 }, { "epoch": 1.8022053756030325, "grad_norm": 10.32025146484375, "learning_rate": 2.9500054014407307e-09, "logits/chosen": -2.47493052482605, "logits/rejected": -2.449911594390869, "logps/chosen": -115.2611312866211, "logps/rejected": -131.6233367919922, "loss": 0.6262, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6068614721298218, "rewards/margins": 0.17909392714500427, "rewards/rejected": -0.7859554290771484, "step": 10460 }, { "epoch": 1.8039283252929015, "grad_norm": 13.728989601135254, "learning_rate": 2.899335833643529e-09, "logits/chosen": -2.3405070304870605, "logits/rejected": -2.3186278343200684, "logps/chosen": -110.76844787597656, "logps/rejected": -126.5798568725586, "loss": 0.636, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5886200666427612, "rewards/margins": 0.16937750577926636, "rewards/rejected": -0.7579976320266724, "step": 10470 }, { "epoch": 1.8056512749827704, "grad_norm": 9.618743896484375, "learning_rate": 2.849092200779046e-09, "logits/chosen": -2.47365140914917, "logits/rejected": -2.440391778945923, "logps/chosen": -115.9509048461914, "logps/rejected": -134.37042236328125, "loss": 0.6136, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5998003482818604, "rewards/margins": 0.21988093852996826, "rewards/rejected": -0.8196811676025391, "step": 10480 }, { "epoch": 1.8073742246726394, "grad_norm": 9.683095932006836, "learning_rate": 2.7992749572043282e-09, "logits/chosen": -2.468331813812256, "logits/rejected": -2.441288471221924, "logps/chosen": -109.4289321899414, "logps/rejected": -126.5555419921875, "loss": 0.6245, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5561849474906921, "rewards/margins": 0.18384456634521484, "rewards/rejected": -0.7400294542312622, "step": 10490 }, { "epoch": 1.8090971743625086, "grad_norm": 11.747886657714844, "learning_rate": 2.7498845534205393e-09, "logits/chosen": -2.4546990394592285, "logits/rejected": -2.4392428398132324, "logps/chosen": -115.9336929321289, "logps/rejected": -139.72801208496094, "loss": 0.6178, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6323421597480774, "rewards/margins": 0.2138226330280304, "rewards/rejected": -0.8461647033691406, "step": 10500 }, { "epoch": 1.8090971743625086, "eval_logits/chosen": -2.5598411560058594, "eval_logits/rejected": -2.553055763244629, "eval_logps/chosen": -108.27570343017578, "eval_logps/rejected": -123.08312225341797, "eval_loss": 0.6549313068389893, "eval_rewards/accuracies": 0.6212825179100037, "eval_rewards/chosen": -0.49563807249069214, "eval_rewards/margins": 0.10339171439409256, "eval_rewards/rejected": -0.5990298986434937, "eval_runtime": 360.0185, "eval_samples_per_second": 11.955, "eval_steps_per_second": 1.494, "step": 10500 }, { "epoch": 1.8108201240523778, "grad_norm": 10.245859146118164, "learning_rate": 2.7009214360688924e-09, "logits/chosen": -2.453220844268799, "logits/rejected": -2.4300804138183594, "logps/chosen": -115.05948638916016, "logps/rejected": -132.7764434814453, "loss": 0.6191, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5947138071060181, "rewards/margins": 0.20130130648612976, "rewards/rejected": -0.7960150241851807, "step": 10510 }, { "epoch": 1.8125430737422468, "grad_norm": 10.916816711425781, "learning_rate": 2.6523860479266525e-09, "logits/chosen": -2.513354778289795, "logits/rejected": -2.513002872467041, "logps/chosen": -110.2159423828125, "logps/rejected": -138.3251190185547, "loss": 0.6045, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5973041653633118, "rewards/margins": 0.24054870009422302, "rewards/rejected": -0.8378528356552124, "step": 10520 }, { "epoch": 1.8142660234321157, "grad_norm": 11.534632682800293, "learning_rate": 2.6042788279030392e-09, "logits/chosen": -2.423563003540039, "logits/rejected": -2.4029486179351807, "logps/chosen": -116.3944091796875, "logps/rejected": -129.7278594970703, "loss": 0.6443, "rewards/accuracies": 0.625, "rewards/chosen": -0.6305351853370667, "rewards/margins": 0.14548929035663605, "rewards/rejected": -0.7760244011878967, "step": 10530 }, { "epoch": 1.8159889731219847, "grad_norm": 9.460017204284668, "learning_rate": 2.556600211035381e-09, "logits/chosen": -2.393198251724243, "logits/rejected": -2.3695731163024902, "logps/chosen": -113.17286682128906, "logps/rejected": -128.18748474121094, "loss": 0.6299, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6084216833114624, "rewards/margins": 0.17950792610645294, "rewards/rejected": -0.7879296541213989, "step": 10540 }, { "epoch": 1.817711922811854, "grad_norm": 12.474471092224121, "learning_rate": 2.509350628485063e-09, "logits/chosen": -2.560568332672119, "logits/rejected": -2.5556082725524902, "logps/chosen": -116.18548583984375, "logps/rejected": -137.69532775878906, "loss": 0.6243, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.624345600605011, "rewards/margins": 0.19479550421237946, "rewards/rejected": -0.8191410899162292, "step": 10550 }, { "epoch": 1.819434872501723, "grad_norm": 14.021306037902832, "learning_rate": 2.4625305075337e-09, "logits/chosen": -2.423170328140259, "logits/rejected": -2.4037978649139404, "logps/chosen": -116.9132080078125, "logps/rejected": -134.77378845214844, "loss": 0.6298, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6128100156784058, "rewards/margins": 0.18871495127677917, "rewards/rejected": -0.8015249371528625, "step": 10560 }, { "epoch": 1.821157822191592, "grad_norm": 10.682099342346191, "learning_rate": 2.4161402715792533e-09, "logits/chosen": -2.498974561691284, "logits/rejected": -2.482527732849121, "logps/chosen": -117.31404876708984, "logps/rejected": -138.8473358154297, "loss": 0.6149, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6136723756790161, "rewards/margins": 0.2071593701839447, "rewards/rejected": -0.8208317756652832, "step": 10570 }, { "epoch": 1.822880771881461, "grad_norm": 9.404667854309082, "learning_rate": 2.370180340132194e-09, "logits/chosen": -2.5023789405822754, "logits/rejected": -2.4861600399017334, "logps/chosen": -112.813232421875, "logps/rejected": -131.14877319335938, "loss": 0.6206, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5945123434066772, "rewards/margins": 0.18883763253688812, "rewards/rejected": -0.7833499908447266, "step": 10580 }, { "epoch": 1.82460372157133, "grad_norm": 8.862701416015625, "learning_rate": 2.3246511288117274e-09, "logits/chosen": -2.449619770050049, "logits/rejected": -2.413877010345459, "logps/chosen": -119.61332702636719, "logps/rejected": -129.75070190429688, "loss": 0.6371, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6370880603790283, "rewards/margins": 0.16107021272182465, "rewards/rejected": -0.7981582880020142, "step": 10590 }, { "epoch": 1.8263266712611992, "grad_norm": 10.304811477661133, "learning_rate": 2.2795530493420144e-09, "logits/chosen": -2.528219699859619, "logits/rejected": -2.5179848670959473, "logps/chosen": -112.78125, "logps/rejected": -130.56149291992188, "loss": 0.6403, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5989866852760315, "rewards/margins": 0.15974877774715424, "rewards/rejected": -0.7587353587150574, "step": 10600 }, { "epoch": 1.8263266712611992, "eval_logits/chosen": -2.5594441890716553, "eval_logits/rejected": -2.5526580810546875, "eval_logps/chosen": -108.38090515136719, "eval_logps/rejected": -123.14496612548828, "eval_loss": 0.6551013588905334, "eval_rewards/accuracies": 0.6203531622886658, "eval_rewards/chosen": -0.49669015407562256, "eval_rewards/margins": 0.10295825451612473, "eval_rewards/rejected": -0.5996482968330383, "eval_runtime": 359.9344, "eval_samples_per_second": 11.958, "eval_steps_per_second": 1.495, "step": 10600 }, { "epoch": 1.8280496209510684, "grad_norm": 11.823186874389648, "learning_rate": 2.2348865095484614e-09, "logits/chosen": -2.435786485671997, "logits/rejected": -2.4266715049743652, "logps/chosen": -113.290771484375, "logps/rejected": -131.25927734375, "loss": 0.6309, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6222672462463379, "rewards/margins": 0.17166352272033691, "rewards/rejected": -0.7939307689666748, "step": 10610 }, { "epoch": 1.8297725706409373, "grad_norm": 14.5044527053833, "learning_rate": 2.19065191335403e-09, "logits/chosen": -2.3871350288391113, "logits/rejected": -2.3654909133911133, "logps/chosen": -117.86210632324219, "logps/rejected": -133.91995239257812, "loss": 0.6323, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6405826807022095, "rewards/margins": 0.17532998323440552, "rewards/rejected": -0.8159125447273254, "step": 10620 }, { "epoch": 1.8314955203308063, "grad_norm": 8.305620193481445, "learning_rate": 2.1468496607755625e-09, "logits/chosen": -2.469578981399536, "logits/rejected": -2.450049877166748, "logps/chosen": -114.07830810546875, "logps/rejected": -135.2241973876953, "loss": 0.6219, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5981978178024292, "rewards/margins": 0.2123533934354782, "rewards/rejected": -0.8105511665344238, "step": 10630 }, { "epoch": 1.8332184700206753, "grad_norm": 11.741875648498535, "learning_rate": 2.103480147920228e-09, "logits/chosen": -2.4531044960021973, "logits/rejected": -2.431525468826294, "logps/chosen": -113.56483459472656, "logps/rejected": -131.6667938232422, "loss": 0.6196, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5849887728691101, "rewards/margins": 0.2058919370174408, "rewards/rejected": -0.7908806204795837, "step": 10640 }, { "epoch": 1.8349414197105445, "grad_norm": 12.1553373336792, "learning_rate": 2.0605437669818426e-09, "logits/chosen": -2.5188522338867188, "logits/rejected": -2.498983144760132, "logps/chosen": -115.35491943359375, "logps/rejected": -139.0282745361328, "loss": 0.6116, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.617334246635437, "rewards/margins": 0.23477241396903992, "rewards/rejected": -0.8521067500114441, "step": 10650 }, { "epoch": 1.8366643694004137, "grad_norm": 10.504712104797363, "learning_rate": 2.0180409062374336e-09, "logits/chosen": -2.4251739978790283, "logits/rejected": -2.4132907390594482, "logps/chosen": -120.6828384399414, "logps/rejected": -132.45217895507812, "loss": 0.6581, "rewards/accuracies": 0.625, "rewards/chosen": -0.65984708070755, "rewards/margins": 0.1128741055727005, "rewards/rejected": -0.7727211713790894, "step": 10660 }, { "epoch": 1.8383873190902826, "grad_norm": 9.784821510314941, "learning_rate": 1.97597195004362e-09, "logits/chosen": -2.4410600662231445, "logits/rejected": -2.4224324226379395, "logps/chosen": -117.0425033569336, "logps/rejected": -135.34555053710938, "loss": 0.6278, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6490435004234314, "rewards/margins": 0.1887730062007904, "rewards/rejected": -0.8378164172172546, "step": 10670 }, { "epoch": 1.8401102687801516, "grad_norm": 10.796539306640625, "learning_rate": 1.934337278833231e-09, "logits/chosen": -2.48850417137146, "logits/rejected": -2.4493298530578613, "logps/chosen": -116.40985107421875, "logps/rejected": -126.93431091308594, "loss": 0.6371, "rewards/accuracies": 0.625, "rewards/chosen": -0.6012075543403625, "rewards/margins": 0.1603269726037979, "rewards/rejected": -0.7615344524383545, "step": 10680 }, { "epoch": 1.8418332184700206, "grad_norm": 10.557440757751465, "learning_rate": 1.8931372691117887e-09, "logits/chosen": -2.460437774658203, "logits/rejected": -2.4529013633728027, "logps/chosen": -114.23712158203125, "logps/rejected": -138.14364624023438, "loss": 0.6154, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6048712730407715, "rewards/margins": 0.21279887855052948, "rewards/rejected": -0.8176702260971069, "step": 10690 }, { "epoch": 1.8435561681598898, "grad_norm": 12.950105667114258, "learning_rate": 1.8523722934541575e-09, "logits/chosen": -2.517003059387207, "logits/rejected": -2.479825496673584, "logps/chosen": -116.20123291015625, "logps/rejected": -126.58357238769531, "loss": 0.6341, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6003366708755493, "rewards/margins": 0.1621071845293045, "rewards/rejected": -0.7624439001083374, "step": 10700 }, { "epoch": 1.8435561681598898, "eval_logits/chosen": -2.5590357780456543, "eval_logits/rejected": -2.5522735118865967, "eval_logps/chosen": -108.35945892333984, "eval_logps/rejected": -123.14959716796875, "eval_loss": 0.6550170183181763, "eval_rewards/accuracies": 0.6205855011940002, "eval_rewards/chosen": -0.4964757263660431, "eval_rewards/margins": 0.10321904718875885, "eval_rewards/rejected": -0.5996947884559631, "eval_runtime": 360.261, "eval_samples_per_second": 11.947, "eval_steps_per_second": 1.493, "step": 10700 }, { "epoch": 1.8452791178497587, "grad_norm": 10.944367408752441, "learning_rate": 1.8120427205011556e-09, "logits/chosen": -2.4294328689575195, "logits/rejected": -2.4029085636138916, "logps/chosen": -113.1238784790039, "logps/rejected": -125.20323181152344, "loss": 0.6357, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5967923402786255, "rewards/margins": 0.16086360812187195, "rewards/rejected": -0.757655918598175, "step": 10710 }, { "epoch": 1.847002067539628, "grad_norm": 14.83804988861084, "learning_rate": 1.7721489149562063e-09, "logits/chosen": -2.453725814819336, "logits/rejected": -2.4256491661071777, "logps/chosen": -117.40971374511719, "logps/rejected": -132.19723510742188, "loss": 0.6463, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6482272148132324, "rewards/margins": 0.15093299746513367, "rewards/rejected": -0.7991601228713989, "step": 10720 }, { "epoch": 1.848725017229497, "grad_norm": 10.805709838867188, "learning_rate": 1.7326912375820846e-09, "logits/chosen": -2.5418336391448975, "logits/rejected": -2.5095081329345703, "logps/chosen": -113.35749816894531, "logps/rejected": -127.86077880859375, "loss": 0.6213, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5939838290214539, "rewards/margins": 0.1858406960964203, "rewards/rejected": -0.7798245549201965, "step": 10730 }, { "epoch": 1.8504479669193659, "grad_norm": 13.219216346740723, "learning_rate": 1.6936700451975817e-09, "logits/chosen": -2.4094748497009277, "logits/rejected": -2.397881031036377, "logps/chosen": -118.57723236083984, "logps/rejected": -133.8645782470703, "loss": 0.6393, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6388689279556274, "rewards/margins": 0.14827385544776917, "rewards/rejected": -0.7871428728103638, "step": 10740 }, { "epoch": 1.852170916609235, "grad_norm": 9.872903823852539, "learning_rate": 1.6550856906743627e-09, "logits/chosen": -2.4142613410949707, "logits/rejected": -2.386833667755127, "logps/chosen": -111.84124755859375, "logps/rejected": -135.484130859375, "loss": 0.6086, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5978298187255859, "rewards/margins": 0.23036417365074158, "rewards/rejected": -0.8281939625740051, "step": 10750 }, { "epoch": 1.853893866299104, "grad_norm": 11.279659271240234, "learning_rate": 1.6169385229337086e-09, "logits/chosen": -2.4235472679138184, "logits/rejected": -2.4074952602386475, "logps/chosen": -113.22697448730469, "logps/rejected": -127.94795227050781, "loss": 0.6393, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5954617261886597, "rewards/margins": 0.15615740418434143, "rewards/rejected": -0.7516191601753235, "step": 10760 }, { "epoch": 1.8556168159889732, "grad_norm": 10.54137134552002, "learning_rate": 1.5792288869433902e-09, "logits/chosen": -2.4071712493896484, "logits/rejected": -2.3937089443206787, "logps/chosen": -116.57283020019531, "logps/rejected": -132.6007537841797, "loss": 0.627, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5950114130973816, "rewards/margins": 0.1851048767566681, "rewards/rejected": -0.7801163196563721, "step": 10770 }, { "epoch": 1.8573397656788422, "grad_norm": 9.987702369689941, "learning_rate": 1.5419571237145601e-09, "logits/chosen": -2.5344901084899902, "logits/rejected": -2.5173232555389404, "logps/chosen": -114.1861343383789, "logps/rejected": -132.4864959716797, "loss": 0.6373, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.610164999961853, "rewards/margins": 0.16384710371494293, "rewards/rejected": -0.7740120887756348, "step": 10780 }, { "epoch": 1.8590627153687111, "grad_norm": 11.684725761413574, "learning_rate": 1.5051235702986331e-09, "logits/chosen": -2.561619758605957, "logits/rejected": -2.5328845977783203, "logps/chosen": -107.4722671508789, "logps/rejected": -129.6492462158203, "loss": 0.6156, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5513117909431458, "rewards/margins": 0.21112540364265442, "rewards/rejected": -0.7624371647834778, "step": 10790 }, { "epoch": 1.8607856650585803, "grad_norm": 10.52690315246582, "learning_rate": 1.4687285597842768e-09, "logits/chosen": -2.5493738651275635, "logits/rejected": -2.5295369625091553, "logps/chosen": -111.5053482055664, "logps/rejected": -131.95693969726562, "loss": 0.627, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5954598784446716, "rewards/margins": 0.18490615487098694, "rewards/rejected": -0.7803661227226257, "step": 10800 }, { "epoch": 1.8607856650585803, "eval_logits/chosen": -2.558943510055542, "eval_logits/rejected": -2.5521273612976074, "eval_logps/chosen": -108.42156982421875, "eval_logps/rejected": -123.24092102050781, "eval_loss": 0.6549394726753235, "eval_rewards/accuracies": 0.6210501790046692, "eval_rewards/chosen": -0.4970967471599579, "eval_rewards/margins": 0.10351123660802841, "eval_rewards/rejected": -0.6006080508232117, "eval_runtime": 360.1737, "eval_samples_per_second": 11.95, "eval_steps_per_second": 1.494, "step": 10800 }, { "epoch": 1.8625086147484493, "grad_norm": 9.803145408630371, "learning_rate": 1.4327724212943704e-09, "logits/chosen": -2.4846789836883545, "logits/rejected": -2.4454684257507324, "logps/chosen": -119.3404769897461, "logps/rejected": -129.8537139892578, "loss": 0.63, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.619606614112854, "rewards/margins": 0.1778271496295929, "rewards/rejected": -0.7974337339401245, "step": 10810 }, { "epoch": 1.8642315644383185, "grad_norm": 9.98331356048584, "learning_rate": 1.3972554799830394e-09, "logits/chosen": -2.4303512573242188, "logits/rejected": -2.408212184906006, "logps/chosen": -108.08897399902344, "logps/rejected": -125.14892578125, "loss": 0.6255, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5499401092529297, "rewards/margins": 0.18750813603401184, "rewards/rejected": -0.7374482154846191, "step": 10820 }, { "epoch": 1.8659545141281875, "grad_norm": 13.596193313598633, "learning_rate": 1.3621780570327257e-09, "logits/chosen": -2.451287031173706, "logits/rejected": -2.426119327545166, "logps/chosen": -112.04255676269531, "logps/rejected": -130.3465576171875, "loss": 0.6321, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6022640466690063, "rewards/margins": 0.1681433916091919, "rewards/rejected": -0.7704073190689087, "step": 10830 }, { "epoch": 1.8676774638180564, "grad_norm": 10.210439682006836, "learning_rate": 1.3275404696512615e-09, "logits/chosen": -2.4836678504943848, "logits/rejected": -2.4577431678771973, "logps/chosen": -111.8627700805664, "logps/rejected": -125.012451171875, "loss": 0.6429, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5887194275856018, "rewards/margins": 0.15233774483203888, "rewards/rejected": -0.7410570979118347, "step": 10840 }, { "epoch": 1.8694004135079254, "grad_norm": 10.683871269226074, "learning_rate": 1.2933430310690218e-09, "logits/chosen": -2.4083244800567627, "logits/rejected": -2.3927559852600098, "logps/chosen": -107.12113952636719, "logps/rejected": -130.18138122558594, "loss": 0.6245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5696035623550415, "rewards/margins": 0.18765828013420105, "rewards/rejected": -0.7572618722915649, "step": 10850 }, { "epoch": 1.8711233631977946, "grad_norm": 12.726272583007812, "learning_rate": 1.25958605053606e-09, "logits/chosen": -2.5296437740325928, "logits/rejected": -2.498725414276123, "logps/chosen": -116.60414123535156, "logps/rejected": -130.74502563476562, "loss": 0.636, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6134532690048218, "rewards/margins": 0.1698613315820694, "rewards/rejected": -0.78331458568573, "step": 10860 }, { "epoch": 1.8728463128876638, "grad_norm": 8.664299011230469, "learning_rate": 1.2262698333193766e-09, "logits/chosen": -2.3817386627197266, "logits/rejected": -2.3768210411071777, "logps/chosen": -107.44478607177734, "logps/rejected": -135.75704956054688, "loss": 0.6121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.56959068775177, "rewards/margins": 0.22736403346061707, "rewards/rejected": -0.7969546914100647, "step": 10870 }, { "epoch": 1.8745692625775328, "grad_norm": 11.063324928283691, "learning_rate": 1.1933946807000606e-09, "logits/chosen": -2.458096981048584, "logits/rejected": -2.439481735229492, "logps/chosen": -112.50048828125, "logps/rejected": -130.33523559570312, "loss": 0.6182, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5829136967658997, "rewards/margins": 0.19349297881126404, "rewards/rejected": -0.7764066457748413, "step": 10880 }, { "epoch": 1.8762922122674017, "grad_norm": 9.031937599182129, "learning_rate": 1.1609608899706803e-09, "logits/chosen": -2.5011849403381348, "logits/rejected": -2.483492374420166, "logps/chosen": -113.2860107421875, "logps/rejected": -133.9983367919922, "loss": 0.6233, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6012560129165649, "rewards/margins": 0.1904800534248352, "rewards/rejected": -0.7917360067367554, "step": 10890 }, { "epoch": 1.8780151619572707, "grad_norm": 10.209151268005371, "learning_rate": 1.1289687544324745e-09, "logits/chosen": -2.5554192066192627, "logits/rejected": -2.5241541862487793, "logps/chosen": -112.42668151855469, "logps/rejected": -124.28495788574219, "loss": 0.6335, "rewards/accuracies": 0.625, "rewards/chosen": -0.5743435621261597, "rewards/margins": 0.16246747970581055, "rewards/rejected": -0.7368109822273254, "step": 10900 }, { "epoch": 1.8780151619572707, "eval_logits/chosen": -2.559028148651123, "eval_logits/rejected": -2.5522618293762207, "eval_logps/chosen": -108.45635986328125, "eval_logps/rejected": -123.27279663085938, "eval_loss": 0.654963493347168, "eval_rewards/accuracies": 0.6201208233833313, "eval_rewards/chosen": -0.49744468927383423, "eval_rewards/margins": 0.1034821942448616, "eval_rewards/rejected": -0.6009268760681152, "eval_runtime": 360.2674, "eval_samples_per_second": 11.947, "eval_steps_per_second": 1.493, "step": 10900 }, { "epoch": 1.8797381116471399, "grad_norm": 11.109238624572754, "learning_rate": 1.097418563392799e-09, "logits/chosen": -2.4565396308898926, "logits/rejected": -2.427241086959839, "logps/chosen": -112.29621887207031, "logps/rejected": -128.05030822753906, "loss": 0.63, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5662978887557983, "rewards/margins": 0.1724037230014801, "rewards/rejected": -0.7387016415596008, "step": 10910 }, { "epoch": 1.881461061337009, "grad_norm": 12.130841255187988, "learning_rate": 1.0663106021624623e-09, "logits/chosen": -2.428583860397339, "logits/rejected": -2.4066367149353027, "logps/chosen": -117.6747817993164, "logps/rejected": -132.8242950439453, "loss": 0.6256, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6183713674545288, "rewards/margins": 0.18066565692424774, "rewards/rejected": -0.799036979675293, "step": 10920 }, { "epoch": 1.883184011026878, "grad_norm": 12.480398178100586, "learning_rate": 1.035645152053155e-09, "logits/chosen": -2.393200635910034, "logits/rejected": -2.3660690784454346, "logps/chosen": -116.7625503540039, "logps/rejected": -127.45570373535156, "loss": 0.648, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6255054473876953, "rewards/margins": 0.1341763436794281, "rewards/rejected": -0.759681761264801, "step": 10930 }, { "epoch": 1.884906960716747, "grad_norm": 12.46471118927002, "learning_rate": 1.0054224903748964e-09, "logits/chosen": -2.362339735031128, "logits/rejected": -2.3466224670410156, "logps/chosen": -119.80097961425781, "logps/rejected": -132.3865509033203, "loss": 0.6437, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6334177851676941, "rewards/margins": 0.13872984051704407, "rewards/rejected": -0.772147536277771, "step": 10940 }, { "epoch": 1.886629910406616, "grad_norm": 14.05789852142334, "learning_rate": 9.75642890433548e-10, "logits/chosen": -2.4196391105651855, "logits/rejected": -2.4031875133514404, "logps/chosen": -118.99332427978516, "logps/rejected": -133.8544464111328, "loss": 0.6424, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6438688039779663, "rewards/margins": 0.1477302461862564, "rewards/rejected": -0.7915989756584167, "step": 10950 }, { "epoch": 1.8883528600964852, "grad_norm": 13.794063568115234, "learning_rate": 9.463066215283254e-10, "logits/chosen": -2.477527618408203, "logits/rejected": -2.456815719604492, "logps/chosen": -116.08634948730469, "logps/rejected": -136.8908233642578, "loss": 0.6227, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6227591037750244, "rewards/margins": 0.22041161358356476, "rewards/rejected": -0.8431707620620728, "step": 10960 }, { "epoch": 1.8900758097863544, "grad_norm": 11.207356452941895, "learning_rate": 9.174139489493582e-10, "logits/chosen": -2.4887819290161133, "logits/rejected": -2.4592578411102295, "logps/chosen": -117.4792709350586, "logps/rejected": -134.54624938964844, "loss": 0.6263, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.606163501739502, "rewards/margins": 0.19818837940692902, "rewards/rejected": -0.8043519258499146, "step": 10970 }, { "epoch": 1.8917987594762233, "grad_norm": 12.16650676727295, "learning_rate": 8.889651339753279e-10, "logits/chosen": -2.4826903343200684, "logits/rejected": -2.46773099899292, "logps/chosen": -119.41780853271484, "logps/rejected": -141.7852325439453, "loss": 0.6275, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6484467387199402, "rewards/margins": 0.20761927962303162, "rewards/rejected": -0.8560660481452942, "step": 10980 }, { "epoch": 1.8935217091660923, "grad_norm": 14.031679153442383, "learning_rate": 8.609604338710441e-10, "logits/chosen": -2.4016706943511963, "logits/rejected": -2.389274835586548, "logps/chosen": -112.11262512207031, "logps/rejected": -130.65145874023438, "loss": 0.6284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5962507128715515, "rewards/margins": 0.18530747294425964, "rewards/rejected": -0.7815582752227783, "step": 10990 }, { "epoch": 1.8952446588559613, "grad_norm": 11.361660957336426, "learning_rate": 8.334001018851622e-10, "logits/chosen": -2.5866708755493164, "logits/rejected": -2.5802624225616455, "logps/chosen": -116.65419006347656, "logps/rejected": -140.32632446289062, "loss": 0.6262, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6376508474349976, "rewards/margins": 0.1957683116197586, "rewards/rejected": -0.833419144153595, "step": 11000 }, { "epoch": 1.8952446588559613, "eval_logits/chosen": -2.558842897415161, "eval_logits/rejected": -2.5520331859588623, "eval_logps/chosen": -108.41845703125, "eval_logps/rejected": -123.21259307861328, "eval_loss": 0.6550112366676331, "eval_rewards/accuracies": 0.6201208233833313, "eval_rewards/chosen": -0.49706554412841797, "eval_rewards/margins": 0.10325910896062851, "eval_rewards/rejected": -0.6003247499465942, "eval_runtime": 359.7213, "eval_samples_per_second": 11.965, "eval_steps_per_second": 1.496, "step": 11000 }, { "epoch": 1.8969676085458305, "grad_norm": 9.476881980895996, "learning_rate": 8.062843872479019e-10, "logits/chosen": -2.513068675994873, "logits/rejected": -2.4836723804473877, "logps/chosen": -110.36959075927734, "logps/rejected": -131.13397216796875, "loss": 0.6105, "rewards/accuracies": 0.75, "rewards/chosen": -0.5716327428817749, "rewards/margins": 0.2250044345855713, "rewards/rejected": -0.7966371774673462, "step": 11010 }, { "epoch": 1.8986905582356997, "grad_norm": 11.814251899719238, "learning_rate": 7.796135351687494e-10, "logits/chosen": -2.524017572402954, "logits/rejected": -2.506251573562622, "logps/chosen": -115.6934814453125, "logps/rejected": -130.05941772460938, "loss": 0.6333, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6068946123123169, "rewards/margins": 0.16329768300056458, "rewards/rejected": -0.7701922655105591, "step": 11020 }, { "epoch": 1.9004135079255686, "grad_norm": 11.158601760864258, "learning_rate": 7.533877868342698e-10, "logits/chosen": -2.496063709259033, "logits/rejected": -2.4887490272521973, "logps/chosen": -118.58612060546875, "logps/rejected": -133.88473510742188, "loss": 0.6387, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6379790306091309, "rewards/margins": 0.1559731513261795, "rewards/rejected": -0.7939521670341492, "step": 11030 }, { "epoch": 1.9021364576154376, "grad_norm": 8.867852210998535, "learning_rate": 7.276073794059367e-10, "logits/chosen": -2.5248653888702393, "logits/rejected": -2.5092899799346924, "logps/chosen": -120.84211730957031, "logps/rejected": -136.56988525390625, "loss": 0.6361, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6437533497810364, "rewards/margins": 0.166532963514328, "rewards/rejected": -0.8102862238883972, "step": 11040 }, { "epoch": 1.9038594073053066, "grad_norm": 10.516443252563477, "learning_rate": 7.022725460179457e-10, "logits/chosen": -2.4529166221618652, "logits/rejected": -2.4327664375305176, "logps/chosen": -107.2201919555664, "logps/rejected": -132.77243041992188, "loss": 0.6088, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5525605082511902, "rewards/margins": 0.2225067913532257, "rewards/rejected": -0.7750672101974487, "step": 11050 }, { "epoch": 1.9055823569951758, "grad_norm": 12.075571060180664, "learning_rate": 6.77383515775154e-10, "logits/chosen": -2.488499879837036, "logits/rejected": -2.447962999343872, "logps/chosen": -115.3367919921875, "logps/rejected": -134.97335815429688, "loss": 0.6115, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6106191873550415, "rewards/margins": 0.22174124419689178, "rewards/rejected": -0.8323603868484497, "step": 11060 }, { "epoch": 1.907305306685045, "grad_norm": 11.85994815826416, "learning_rate": 6.529405137509824e-10, "logits/chosen": -2.5081701278686523, "logits/rejected": -2.4635863304138184, "logps/chosen": -109.13908386230469, "logps/rejected": -131.86538696289062, "loss": 0.5922, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5596362352371216, "rewards/margins": 0.26336470246315, "rewards/rejected": -0.8230009078979492, "step": 11070 }, { "epoch": 1.909028256374914, "grad_norm": 13.558547973632812, "learning_rate": 6.289437609853731e-10, "logits/chosen": -2.4891202449798584, "logits/rejected": -2.4640064239501953, "logps/chosen": -118.3816909790039, "logps/rejected": -136.5568084716797, "loss": 0.6141, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.628782331943512, "rewards/margins": 0.21184420585632324, "rewards/rejected": -0.84062659740448, "step": 11080 }, { "epoch": 1.9107512060647829, "grad_norm": 12.248273849487305, "learning_rate": 6.053934744828071e-10, "logits/chosen": -2.465141773223877, "logits/rejected": -2.4455647468566895, "logps/chosen": -122.22418212890625, "logps/rejected": -132.9927215576172, "loss": 0.6372, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6440066695213318, "rewards/margins": 0.15865349769592285, "rewards/rejected": -0.8026601672172546, "step": 11090 }, { "epoch": 1.9124741557546519, "grad_norm": 10.838395118713379, "learning_rate": 5.822898672103449e-10, "logits/chosen": -2.3649260997772217, "logits/rejected": -2.3505358695983887, "logps/chosen": -107.90660095214844, "logps/rejected": -127.75433349609375, "loss": 0.6311, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5702854990959167, "rewards/margins": 0.16702958941459656, "rewards/rejected": -0.7373150587081909, "step": 11100 }, { "epoch": 1.9124741557546519, "eval_logits/chosen": -2.558917760848999, "eval_logits/rejected": -2.552147150039673, "eval_logps/chosen": -108.42533874511719, "eval_logps/rejected": -123.26876068115234, "eval_loss": 0.6548120975494385, "eval_rewards/accuracies": 0.6210501790046692, "eval_rewards/chosen": -0.49713435769081116, "eval_rewards/margins": 0.10375203937292099, "eval_rewards/rejected": -0.6008864045143127, "eval_runtime": 359.348, "eval_samples_per_second": 11.977, "eval_steps_per_second": 1.497, "step": 11100 }, { "epoch": 1.914197105444521, "grad_norm": 11.10058879852295, "learning_rate": 5.59633148095684e-10, "logits/chosen": -2.4862568378448486, "logits/rejected": -2.4628608226776123, "logps/chosen": -119.6026382446289, "logps/rejected": -141.5771942138672, "loss": 0.6134, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6570100784301758, "rewards/margins": 0.22355251014232635, "rewards/rejected": -0.8805624842643738, "step": 11110 }, { "epoch": 1.9159200551343902, "grad_norm": 10.916470527648926, "learning_rate": 5.374235220252765e-10, "logits/chosen": -2.577754497528076, "logits/rejected": -2.5522544384002686, "logps/chosen": -123.24705505371094, "logps/rejected": -133.11061096191406, "loss": 0.6389, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6373213529586792, "rewards/margins": 0.15703292191028595, "rewards/rejected": -0.7943544387817383, "step": 11120 }, { "epoch": 1.9176430048242592, "grad_norm": 11.398584365844727, "learning_rate": 5.156611898424867e-10, "logits/chosen": -2.4134035110473633, "logits/rejected": -2.391727924346924, "logps/chosen": -122.10980224609375, "logps/rejected": -132.87570190429688, "loss": 0.6458, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6623373031616211, "rewards/margins": 0.13456371426582336, "rewards/rejected": -0.7969009876251221, "step": 11130 }, { "epoch": 1.9193659545141282, "grad_norm": 11.0700044631958, "learning_rate": 4.943463483457588e-10, "logits/chosen": -2.3942623138427734, "logits/rejected": -2.3760738372802734, "logps/chosen": -112.96749114990234, "logps/rejected": -128.13754272460938, "loss": 0.6285, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5915412306785583, "rewards/margins": 0.17406094074249268, "rewards/rejected": -0.765602171421051, "step": 11140 }, { "epoch": 1.9210889042039971, "grad_norm": 12.193937301635742, "learning_rate": 4.734791902868462e-10, "logits/chosen": -2.4600093364715576, "logits/rejected": -2.4479706287384033, "logps/chosen": -122.27796936035156, "logps/rejected": -134.85580444335938, "loss": 0.6445, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6418440937995911, "rewards/margins": 0.14160865545272827, "rewards/rejected": -0.7834526300430298, "step": 11150 }, { "epoch": 1.9228118538938663, "grad_norm": 10.338449478149414, "learning_rate": 4.530599043690575e-10, "logits/chosen": -2.5353057384490967, "logits/rejected": -2.5182223320007324, "logps/chosen": -111.48570251464844, "logps/rejected": -126.96525573730469, "loss": 0.6341, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5936704874038696, "rewards/margins": 0.16582946479320526, "rewards/rejected": -0.7594999074935913, "step": 11160 }, { "epoch": 1.9245348035837355, "grad_norm": 9.943690299987793, "learning_rate": 4.3308867524557425e-10, "logits/chosen": -2.3883252143859863, "logits/rejected": -2.3627078533172607, "logps/chosen": -115.91829681396484, "logps/rejected": -121.95379638671875, "loss": 0.6436, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5875613689422607, "rewards/margins": 0.14175362884998322, "rewards/rejected": -0.7293149828910828, "step": 11170 }, { "epoch": 1.9262577532736045, "grad_norm": 9.826698303222656, "learning_rate": 4.135656835177581e-10, "logits/chosen": -2.5109176635742188, "logits/rejected": -2.4892756938934326, "logps/chosen": -118.7507095336914, "logps/rejected": -133.6431121826172, "loss": 0.6297, "rewards/accuracies": 0.625, "rewards/chosen": -0.6478487253189087, "rewards/margins": 0.18034231662750244, "rewards/rejected": -0.8281909227371216, "step": 11180 }, { "epoch": 1.9279807029634735, "grad_norm": 11.667350769042969, "learning_rate": 3.944911057335354e-10, "logits/chosen": -2.505736827850342, "logits/rejected": -2.477687120437622, "logps/chosen": -114.22786712646484, "logps/rejected": -133.70339965820312, "loss": 0.6072, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5934600830078125, "rewards/margins": 0.22718998789787292, "rewards/rejected": -0.8206501007080078, "step": 11190 }, { "epoch": 1.9297036526533424, "grad_norm": 10.847187995910645, "learning_rate": 3.7586511438576496e-10, "logits/chosen": -2.5051229000091553, "logits/rejected": -2.4858336448669434, "logps/chosen": -112.1025161743164, "logps/rejected": -130.5367431640625, "loss": 0.6239, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5878943204879761, "rewards/margins": 0.18000969290733337, "rewards/rejected": -0.7679039835929871, "step": 11200 }, { "epoch": 1.9297036526533424, "eval_logits/chosen": -2.5583388805389404, "eval_logits/rejected": -2.5515522956848145, "eval_logps/chosen": -108.4262924194336, "eval_logps/rejected": -123.20613098144531, "eval_loss": 0.6550823450088501, "eval_rewards/accuracies": 0.6201208233833313, "eval_rewards/chosen": -0.4971439838409424, "eval_rewards/margins": 0.10311610996723175, "eval_rewards/rejected": -0.6002600789070129, "eval_runtime": 359.6901, "eval_samples_per_second": 11.966, "eval_steps_per_second": 1.496, "step": 11200 }, { "epoch": 1.9314266023432116, "grad_norm": 9.750901222229004, "learning_rate": 3.5768787791073394e-10, "logits/chosen": -2.3721094131469727, "logits/rejected": -2.3552279472351074, "logps/chosen": -108.65545654296875, "logps/rejected": -134.5958251953125, "loss": 0.6131, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5824468731880188, "rewards/margins": 0.2240564525127411, "rewards/rejected": -0.806503176689148, "step": 11210 }, { "epoch": 1.9331495520330806, "grad_norm": 11.33436107635498, "learning_rate": 3.3995956068658683e-10, "logits/chosen": -2.465871572494507, "logits/rejected": -2.4521288871765137, "logps/chosen": -111.4020004272461, "logps/rejected": -133.30909729003906, "loss": 0.6324, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5935233235359192, "rewards/margins": 0.1827230155467987, "rewards/rejected": -0.7762463688850403, "step": 11220 }, { "epoch": 1.9348725017229498, "grad_norm": 10.559438705444336, "learning_rate": 3.2268032303185977e-10, "logits/chosen": -2.43168306350708, "logits/rejected": -2.4194796085357666, "logps/chosen": -124.14222717285156, "logps/rejected": -130.67507934570312, "loss": 0.6634, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6837325096130371, "rewards/margins": 0.1077658161520958, "rewards/rejected": -0.7914983630180359, "step": 11230 }, { "epoch": 1.9365954514128187, "grad_norm": 8.981484413146973, "learning_rate": 3.0585032120403196e-10, "logits/chosen": -2.4348931312561035, "logits/rejected": -2.419318437576294, "logps/chosen": -115.5736312866211, "logps/rejected": -130.85311889648438, "loss": 0.6338, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6092356443405151, "rewards/margins": 0.16486208140850067, "rewards/rejected": -0.774097740650177, "step": 11240 }, { "epoch": 1.9383184011026877, "grad_norm": 11.513097763061523, "learning_rate": 2.894697073981045e-10, "logits/chosen": -2.4622929096221924, "logits/rejected": -2.4382715225219727, "logps/chosen": -112.7807846069336, "logps/rejected": -133.18942260742188, "loss": 0.6138, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5849259495735168, "rewards/margins": 0.2107054740190506, "rewards/rejected": -0.7956314086914062, "step": 11250 }, { "epoch": 1.940041350792557, "grad_norm": 10.974672317504883, "learning_rate": 2.735386297452291e-10, "logits/chosen": -2.4070940017700195, "logits/rejected": -2.3761181831359863, "logps/chosen": -117.33487701416016, "logps/rejected": -126.53621673583984, "loss": 0.6421, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.5950134992599487, "rewards/margins": 0.1547488272190094, "rewards/rejected": -0.7497623562812805, "step": 11260 }, { "epoch": 1.9417643004824259, "grad_norm": 10.253097534179688, "learning_rate": 2.5805723231137057e-10, "logits/chosen": -2.4497439861297607, "logits/rejected": -2.4220428466796875, "logps/chosen": -114.47210693359375, "logps/rejected": -130.89596557617188, "loss": 0.6141, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5890557169914246, "rewards/margins": 0.2095029354095459, "rewards/rejected": -0.7985587120056152, "step": 11270 }, { "epoch": 1.943487250172295, "grad_norm": 9.800058364868164, "learning_rate": 2.430256550959908e-10, "logits/chosen": -2.5011367797851562, "logits/rejected": -2.473191738128662, "logps/chosen": -116.41319274902344, "logps/rejected": -127.36231994628906, "loss": 0.6393, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6010106205940247, "rewards/margins": 0.14850196242332458, "rewards/rejected": -0.7495125532150269, "step": 11280 }, { "epoch": 1.945210199862164, "grad_norm": 10.676453590393066, "learning_rate": 2.2844403403081137e-10, "logits/chosen": -2.4754865169525146, "logits/rejected": -2.4605138301849365, "logps/chosen": -117.28421783447266, "logps/rejected": -134.24063110351562, "loss": 0.6134, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5732543468475342, "rewards/margins": 0.20773577690124512, "rewards/rejected": -0.7809900045394897, "step": 11290 }, { "epoch": 1.946933149552033, "grad_norm": 10.65670108795166, "learning_rate": 2.1431250097854182e-10, "logits/chosen": -2.436732292175293, "logits/rejected": -2.413374185562134, "logps/chosen": -123.99664306640625, "logps/rejected": -128.61732482910156, "loss": 0.6629, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6507250070571899, "rewards/margins": 0.09833811223506927, "rewards/rejected": -0.7490631341934204, "step": 11300 }, { "epoch": 1.946933149552033, "eval_logits/chosen": -2.558652400970459, "eval_logits/rejected": -2.5518338680267334, "eval_logps/chosen": -108.41070556640625, "eval_logps/rejected": -123.20658111572266, "eval_loss": 0.6549978256225586, "eval_rewards/accuracies": 0.6205855011940002, "eval_rewards/chosen": -0.49698811769485474, "eval_rewards/margins": 0.10327637940645218, "eval_rewards/rejected": -0.6002644896507263, "eval_runtime": 359.9247, "eval_samples_per_second": 11.958, "eval_steps_per_second": 1.495, "step": 11300 }, { "epoch": 1.948656099241902, "grad_norm": 11.778402328491211, "learning_rate": 2.0063118373173648e-10, "logits/chosen": -2.5401158332824707, "logits/rejected": -2.5334606170654297, "logps/chosen": -109.58613586425781, "logps/rejected": -133.18478393554688, "loss": 0.6093, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5912298560142517, "rewards/margins": 0.22539111971855164, "rewards/rejected": -0.8166210055351257, "step": 11310 }, { "epoch": 1.9503790489317712, "grad_norm": 9.3758544921875, "learning_rate": 1.8740020601158425e-10, "logits/chosen": -2.471465587615967, "logits/rejected": -2.4445977210998535, "logps/chosen": -111.59794616699219, "logps/rejected": -132.0358428955078, "loss": 0.6153, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5667641162872314, "rewards/margins": 0.2145020067691803, "rewards/rejected": -0.7812660932540894, "step": 11320 }, { "epoch": 1.9521019986216404, "grad_norm": 16.428855895996094, "learning_rate": 1.746196874668482e-10, "logits/chosen": -2.4795517921447754, "logits/rejected": -2.4475464820861816, "logps/chosen": -118.28900146484375, "logps/rejected": -131.59017944335938, "loss": 0.6344, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6573032736778259, "rewards/margins": 0.17725330591201782, "rewards/rejected": -0.8345565795898438, "step": 11330 }, { "epoch": 1.9538249483115093, "grad_norm": 12.055771827697754, "learning_rate": 1.6228974367273883e-10, "logits/chosen": -2.3732800483703613, "logits/rejected": -2.3519484996795654, "logps/chosen": -118.50630187988281, "logps/rejected": -138.6522979736328, "loss": 0.6227, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6312685608863831, "rewards/margins": 0.20766310393810272, "rewards/rejected": -0.838931679725647, "step": 11340 }, { "epoch": 1.9555478980013783, "grad_norm": 11.385031700134277, "learning_rate": 1.5041048612988717e-10, "logits/chosen": -2.5143306255340576, "logits/rejected": -2.5021491050720215, "logps/chosen": -121.97713470458984, "logps/rejected": -136.8004608154297, "loss": 0.6402, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6507047414779663, "rewards/margins": 0.15681439638137817, "rewards/rejected": -0.8075190782546997, "step": 11350 }, { "epoch": 1.9572708476912473, "grad_norm": 9.432991981506348, "learning_rate": 1.3898202226333423e-10, "logits/chosen": -2.47906756401062, "logits/rejected": -2.451880931854248, "logps/chosen": -114.74503326416016, "logps/rejected": -129.50633239746094, "loss": 0.6267, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6249033808708191, "rewards/margins": 0.18457625806331635, "rewards/rejected": -0.8094797134399414, "step": 11360 }, { "epoch": 1.9589937973811165, "grad_norm": 10.39565658569336, "learning_rate": 1.280044554215598e-10, "logits/chosen": -2.442214012145996, "logits/rejected": -2.4250168800354004, "logps/chosen": -119.72346496582031, "logps/rejected": -133.8603515625, "loss": 0.6336, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6344811320304871, "rewards/margins": 0.16784489154815674, "rewards/rejected": -0.8023262023925781, "step": 11370 }, { "epoch": 1.9607167470709856, "grad_norm": 14.005722045898438, "learning_rate": 1.1747788487553866e-10, "logits/chosen": -2.415670871734619, "logits/rejected": -2.394944667816162, "logps/chosen": -114.1079330444336, "logps/rejected": -130.89517211914062, "loss": 0.6307, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6227467656135559, "rewards/margins": 0.18552573025226593, "rewards/rejected": -0.8082724809646606, "step": 11380 }, { "epoch": 1.9624396967608546, "grad_norm": 10.821507453918457, "learning_rate": 1.0740240581786353e-10, "logits/chosen": -2.5239498615264893, "logits/rejected": -2.502776622772217, "logps/chosen": -108.56727600097656, "logps/rejected": -135.5474395751953, "loss": 0.6051, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5624256730079651, "rewards/margins": 0.239888995885849, "rewards/rejected": -0.8023146390914917, "step": 11390 }, { "epoch": 1.9641626464507236, "grad_norm": 11.778271675109863, "learning_rate": 9.777810936187347e-11, "logits/chosen": -2.4854180812835693, "logits/rejected": -2.455705404281616, "logps/chosen": -113.80528259277344, "logps/rejected": -129.2537384033203, "loss": 0.6308, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5789285898208618, "rewards/margins": 0.1805000901222229, "rewards/rejected": -0.7594286799430847, "step": 11400 }, { "epoch": 1.9641626464507236, "eval_logits/chosen": -2.5585732460021973, "eval_logits/rejected": -2.5517868995666504, "eval_logps/chosen": -108.43598175048828, "eval_logps/rejected": -123.23049926757812, "eval_loss": 0.655043363571167, "eval_rewards/accuracies": 0.6196561455726624, "eval_rewards/chosen": -0.4972408711910248, "eval_rewards/margins": 0.10326271504163742, "eval_rewards/rejected": -0.6005036234855652, "eval_runtime": 361.0916, "eval_samples_per_second": 11.919, "eval_steps_per_second": 1.49, "step": 11400 }, { "epoch": 1.9658855961405926, "grad_norm": 11.219842910766602, "learning_rate": 8.860508254081577e-11, "logits/chosen": -2.48612642288208, "logits/rejected": -2.4600718021392822, "logps/chosen": -112.62696838378906, "logps/rejected": -127.6418228149414, "loss": 0.6378, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.569412350654602, "rewards/margins": 0.15631404519081116, "rewards/rejected": -0.7257263660430908, "step": 11410 }, { "epoch": 1.9676085458304617, "grad_norm": 11.249455451965332, "learning_rate": 7.98834083070743e-11, "logits/chosen": -2.4693493843078613, "logits/rejected": -2.441267728805542, "logps/chosen": -112.8091812133789, "logps/rejected": -126.4327621459961, "loss": 0.6304, "rewards/accuracies": 0.65625, "rewards/chosen": -0.591242253780365, "rewards/margins": 0.17049895226955414, "rewards/rejected": -0.7617412209510803, "step": 11420 }, { "epoch": 1.969331495520331, "grad_norm": 10.52036190032959, "learning_rate": 7.161316553143115e-11, "logits/chosen": -2.4036307334899902, "logits/rejected": -2.3796024322509766, "logps/chosen": -116.31755065917969, "logps/rejected": -130.1694793701172, "loss": 0.6355, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6134723424911499, "rewards/margins": 0.1615884006023407, "rewards/rejected": -0.775060772895813, "step": 11430 }, { "epoch": 1.9710544452102, "grad_norm": 10.630167961120605, "learning_rate": 6.37944290023229e-11, "logits/chosen": -2.472803831100464, "logits/rejected": -2.461055278778076, "logps/chosen": -116.23243713378906, "logps/rejected": -127.49246978759766, "loss": 0.6552, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6368228793144226, "rewards/margins": 0.1119503378868103, "rewards/rejected": -0.7487732172012329, "step": 11440 }, { "epoch": 1.9727773949000689, "grad_norm": 11.976935386657715, "learning_rate": 5.64272694251855e-11, "logits/chosen": -2.4077811241149902, "logits/rejected": -2.3849339485168457, "logps/chosen": -113.08724212646484, "logps/rejected": -134.1420135498047, "loss": 0.6083, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.603790819644928, "rewards/margins": 0.22062405943870544, "rewards/rejected": -0.8244149088859558, "step": 11450 }, { "epoch": 1.9745003445899378, "grad_norm": 10.31814956665039, "learning_rate": 4.951175342181035e-11, "logits/chosen": -2.4443137645721436, "logits/rejected": -2.4257240295410156, "logps/chosen": -112.2625732421875, "logps/rejected": -131.751953125, "loss": 0.617, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5672473907470703, "rewards/margins": 0.20691117644309998, "rewards/rejected": -0.7741585969924927, "step": 11460 }, { "epoch": 1.976223294279807, "grad_norm": 10.4486722946167, "learning_rate": 4.3047943529739283e-11, "logits/chosen": -2.481876850128174, "logits/rejected": -2.452486038208008, "logps/chosen": -121.09492492675781, "logps/rejected": -136.9915313720703, "loss": 0.6171, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6425241231918335, "rewards/margins": 0.21760614216327667, "rewards/rejected": -0.8601303100585938, "step": 11470 }, { "epoch": 1.9779462439696762, "grad_norm": 10.591318130493164, "learning_rate": 3.703589820170938e-11, "logits/chosen": -2.5215744972229004, "logits/rejected": -2.4884209632873535, "logps/chosen": -112.20654296875, "logps/rejected": -126.6222915649414, "loss": 0.6253, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5611206293106079, "rewards/margins": 0.20080527663230896, "rewards/rejected": -0.7619259357452393, "step": 11480 }, { "epoch": 1.9796691936595452, "grad_norm": 11.51418685913086, "learning_rate": 3.1475671805103465e-11, "logits/chosen": -2.4421164989471436, "logits/rejected": -2.4178805351257324, "logps/chosen": -120.77961730957031, "logps/rejected": -128.45925903320312, "loss": 0.6591, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6608833074569702, "rewards/margins": 0.11253005266189575, "rewards/rejected": -0.773413360118866, "step": 11490 }, { "epoch": 1.9813921433494142, "grad_norm": 10.343287467956543, "learning_rate": 2.6367314621483783e-11, "logits/chosen": -2.4889087677001953, "logits/rejected": -2.4696598052978516, "logps/chosen": -109.34969329833984, "logps/rejected": -122.16093444824219, "loss": 0.6532, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5906139612197876, "rewards/margins": 0.12310566753149033, "rewards/rejected": -0.7137196063995361, "step": 11500 }, { "epoch": 1.9813921433494142, "eval_logits/chosen": -2.5584716796875, "eval_logits/rejected": -2.551682710647583, "eval_logps/chosen": -108.43128967285156, "eval_logps/rejected": -123.231689453125, "eval_loss": 0.6550213694572449, "eval_rewards/accuracies": 0.6196561455726624, "eval_rewards/chosen": -0.49719396233558655, "eval_rewards/margins": 0.10332164913415909, "eval_rewards/rejected": -0.600515604019165, "eval_runtime": 360.843, "eval_samples_per_second": 11.928, "eval_steps_per_second": 1.491, "step": 11500 }, { "epoch": 1.9831150930392831, "grad_norm": 13.34192180633545, "learning_rate": 2.1710872846109062e-11, "logits/chosen": -2.4032585620880127, "logits/rejected": -2.3933894634246826, "logps/chosen": -115.02870178222656, "logps/rejected": -132.9548797607422, "loss": 0.6298, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5954822301864624, "rewards/margins": 0.20137843489646912, "rewards/rejected": -0.7968606948852539, "step": 11510 }, { "epoch": 1.9848380427291523, "grad_norm": 10.762691497802734, "learning_rate": 1.7506388587540387e-11, "logits/chosen": -2.4918160438537598, "logits/rejected": -2.4419684410095215, "logps/chosen": -118.88272857666016, "logps/rejected": -127.25289154052734, "loss": 0.6276, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5908408761024475, "rewards/margins": 0.1819257289171219, "rewards/rejected": -0.7727667093276978, "step": 11520 }, { "epoch": 1.9865609924190215, "grad_norm": 8.538677215576172, "learning_rate": 1.3753899867263718e-11, "logits/chosen": -2.462484836578369, "logits/rejected": -2.431194305419922, "logps/chosen": -113.8895492553711, "logps/rejected": -125.02347564697266, "loss": 0.637, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6042777895927429, "rewards/margins": 0.15585239231586456, "rewards/rejected": -0.7601302862167358, "step": 11530 }, { "epoch": 1.9882839421088905, "grad_norm": 11.066951751708984, "learning_rate": 1.0453440619312414e-11, "logits/chosen": -2.500387668609619, "logits/rejected": -2.489388942718506, "logps/chosen": -109.51570892333984, "logps/rejected": -137.24916076660156, "loss": 0.6022, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.587682843208313, "rewards/margins": 0.23254191875457764, "rewards/rejected": -0.8202247619628906, "step": 11540 }, { "epoch": 1.9900068917987594, "grad_norm": 9.847150802612305, "learning_rate": 7.605040690000786e-12, "logits/chosen": -2.4396908283233643, "logits/rejected": -2.4210143089294434, "logps/chosen": -111.09547424316406, "logps/rejected": -130.10366821289062, "loss": 0.6317, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5976601839065552, "rewards/margins": 0.17508384585380554, "rewards/rejected": -0.7727439999580383, "step": 11550 }, { "epoch": 1.9917298414886284, "grad_norm": 9.202311515808105, "learning_rate": 5.208725837624328e-12, "logits/chosen": -2.466907024383545, "logits/rejected": -2.4393181800842285, "logps/chosen": -118.4568862915039, "logps/rejected": -131.09475708007812, "loss": 0.6389, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.634739100933075, "rewards/margins": 0.15840479731559753, "rewards/rejected": -0.7931438684463501, "step": 11560 }, { "epoch": 1.9934527911784976, "grad_norm": 10.943995475769043, "learning_rate": 3.2645177322432327e-12, "logits/chosen": -2.4700872898101807, "logits/rejected": -2.4510324001312256, "logps/chosen": -119.45172119140625, "logps/rejected": -139.00967407226562, "loss": 0.6356, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.653407096862793, "rewards/margins": 0.17969268560409546, "rewards/rejected": -0.8330997228622437, "step": 11570 }, { "epoch": 1.9951757408683668, "grad_norm": 11.605945587158203, "learning_rate": 1.7724339554880952e-12, "logits/chosen": -2.508540630340576, "logits/rejected": -2.490703582763672, "logps/chosen": -118.93894958496094, "logps/rejected": -133.46774291992188, "loss": 0.6431, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6268073320388794, "rewards/margins": 0.15112532675266266, "rewards/rejected": -0.7779326438903809, "step": 11580 }, { "epoch": 1.9968986905582358, "grad_norm": 13.840242385864258, "learning_rate": 7.324880003767298e-13, "logits/chosen": -2.4034335613250732, "logits/rejected": -2.369175434112549, "logps/chosen": -120.6702880859375, "logps/rejected": -123.08309173583984, "loss": 0.6578, "rewards/accuracies": 0.625, "rewards/chosen": -0.611688494682312, "rewards/margins": 0.10892312228679657, "rewards/rejected": -0.7206116318702698, "step": 11590 }, { "epoch": 1.9986216402481047, "grad_norm": 9.633621215820312, "learning_rate": 1.4468927122535113e-13, "logits/chosen": -2.453922748565674, "logits/rejected": -2.4403982162475586, "logps/chosen": -106.54515075683594, "logps/rejected": -127.30931091308594, "loss": 0.6257, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5609582662582397, "rewards/margins": 0.17430973052978516, "rewards/rejected": -0.7352679967880249, "step": 11600 }, { "epoch": 1.9986216402481047, "eval_logits/chosen": -2.5584018230438232, "eval_logits/rejected": -2.5515997409820557, "eval_logps/chosen": -108.46733856201172, "eval_logps/rejected": -123.28102111816406, "eval_loss": 0.6549462676048279, "eval_rewards/accuracies": 0.6194238066673279, "eval_rewards/chosen": -0.49755439162254333, "eval_rewards/margins": 0.10345453023910522, "eval_rewards/rejected": -0.6010088920593262, "eval_runtime": 360.3867, "eval_samples_per_second": 11.943, "eval_steps_per_second": 1.493, "step": 11600 }, { "epoch": 2.0, "step": 11608, "total_flos": 0.0, "train_loss": 0.6539983197297005, "train_runtime": 91180.3592, "train_samples_per_second": 2.037, "train_steps_per_second": 0.127 } ], "logging_steps": 10, "max_steps": 11608, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }