{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1359, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 6.209654163226836, "learning_rate": 3.676470588235294e-09, "logits/chosen": -1.4681403636932373, "logits/rejected": -0.8821791410446167, "logps/chosen": -326.7279052734375, "logps/rejected": -393.66143798828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 7.833527457219724, "learning_rate": 3.676470588235294e-08, "logits/chosen": -1.1554194688796997, "logits/rejected": -1.069737434387207, "logps/chosen": -260.11224365234375, "logps/rejected": -278.21954345703125, "loss": 0.693, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": 0.001127632916904986, "rewards/margins": 0.001941706403158605, "rewards/margins_max": 0.0066660139709711075, "rewards/margins_min": -0.0027826009318232536, "rewards/margins_std": 0.006681179627776146, "rewards/rejected": -0.0008140733698382974, "step": 10 }, { "epoch": 0.01, "grad_norm": 5.539912592900294, "learning_rate": 7.352941176470588e-08, "logits/chosen": -1.1387906074523926, "logits/rejected": -1.2151895761489868, "logps/chosen": -226.5954132080078, "logps/rejected": -194.97735595703125, "loss": 0.6928, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0004003068897873163, "rewards/margins": 0.0006232298910617828, "rewards/margins_max": 0.0029323583003133535, "rewards/margins_min": -0.0016858980525285006, "rewards/margins_std": 0.0032655999530106783, "rewards/rejected": -0.0002229233068646863, "step": 20 }, { "epoch": 0.02, "grad_norm": 9.074297699065875, "learning_rate": 1.1029411764705881e-07, "logits/chosen": -0.9134622812271118, "logits/rejected": -1.1061055660247803, "logps/chosen": -286.9056091308594, "logps/rejected": -306.0609130859375, "loss": 0.693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0033608167432248592, "rewards/margins": 0.0010996473720297217, "rewards/margins_max": 0.003882316406816244, "rewards/margins_min": -0.0016830215463414788, "rewards/margins_std": 0.003935288172215223, "rewards/rejected": 0.0022611692547798157, "step": 30 }, { "epoch": 0.03, "grad_norm": 6.630354149069055, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -0.9963411092758179, "logits/rejected": -1.3301975727081299, "logps/chosen": -237.13650512695312, "logps/rejected": -233.420654296875, "loss": 0.6928, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0003052559623029083, "rewards/margins": -0.00031435777782462537, "rewards/margins_max": 0.003875983878970146, "rewards/margins_min": -0.0045046997256577015, "rewards/margins_std": 0.005926038138568401, "rewards/rejected": 9.101861905946862e-06, "step": 40 }, { "epoch": 0.04, "grad_norm": 5.11233300662631, "learning_rate": 1.8382352941176472e-07, "logits/chosen": -0.9264333844184875, "logits/rejected": -1.0728222131729126, "logps/chosen": -219.332763671875, "logps/rejected": -220.7531280517578, "loss": 0.6927, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0008567962795495987, "rewards/margins": 0.0023684161715209484, "rewards/margins_max": 0.00644815806299448, "rewards/margins_min": -0.001711326651275158, "rewards/margins_std": 0.005769627168774605, "rewards/rejected": -0.001511619659140706, "step": 50 }, { "epoch": 0.04, "grad_norm": 6.3634169311373245, "learning_rate": 2.2058823529411763e-07, "logits/chosen": -1.1445600986480713, "logits/rejected": -1.3254610300064087, "logps/chosen": -269.0830993652344, "logps/rejected": -234.78726196289062, "loss": 0.6913, "rewards/accuracies": 0.75, "rewards/chosen": 0.0016944237286224961, "rewards/margins": 0.004126362036913633, "rewards/margins_max": 0.006431617774069309, "rewards/margins_min": 0.0018211060669273138, "rewards/margins_std": 0.0032601244747638702, "rewards/rejected": -0.0024319379590451717, "step": 60 }, { "epoch": 0.05, "grad_norm": 5.891925253538908, "learning_rate": 2.5735294117647057e-07, "logits/chosen": -1.414535403251648, "logits/rejected": -1.5020934343338013, "logps/chosen": -295.0069580078125, "logps/rejected": -283.39984130859375, "loss": 0.6904, "rewards/accuracies": 0.75, "rewards/chosen": 0.003248781431466341, "rewards/margins": 0.005960130598396063, "rewards/margins_max": 0.01083610113710165, "rewards/margins_min": 0.0010841598268598318, "rewards/margins_std": 0.006895663682371378, "rewards/rejected": -0.0027113493997603655, "step": 70 }, { "epoch": 0.06, "grad_norm": 27.08598358859191, "learning_rate": 2.941176470588235e-07, "logits/chosen": -1.1046959161758423, "logits/rejected": -1.121512770652771, "logps/chosen": -233.47909545898438, "logps/rejected": -228.24447631835938, "loss": 0.6879, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0034522090572863817, "rewards/margins": 0.010937942191958427, "rewards/margins_max": 0.015041169710457325, "rewards/margins_min": 0.006834716536104679, "rewards/margins_std": 0.005802837200462818, "rewards/rejected": -0.007485733367502689, "step": 80 }, { "epoch": 0.07, "grad_norm": 5.043695997862729, "learning_rate": 3.3088235294117644e-07, "logits/chosen": -1.1739518642425537, "logits/rejected": -1.1855499744415283, "logps/chosen": -201.79940795898438, "logps/rejected": -239.0184783935547, "loss": 0.6861, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.005821605678647757, "rewards/margins": 0.016343099996447563, "rewards/margins_max": 0.022502990439534187, "rewards/margins_min": 0.010183211416006088, "rewards/margins_std": 0.008711399510502815, "rewards/rejected": -0.010521495714783669, "step": 90 }, { "epoch": 0.07, "grad_norm": 6.240948244130348, "learning_rate": 3.6764705882352943e-07, "logits/chosen": -1.226905345916748, "logits/rejected": -1.402093529701233, "logps/chosen": -276.8337707519531, "logps/rejected": -248.4552459716797, "loss": 0.6816, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.008135917596518993, "rewards/margins": 0.024016622453927994, "rewards/margins_max": 0.033059027045965195, "rewards/margins_min": 0.014974219724535942, "rewards/margins_std": 0.012787890620529652, "rewards/rejected": -0.015880707651376724, "step": 100 }, { "epoch": 0.07, "eval_logits/chosen": -1.1694660186767578, "eval_logits/rejected": -1.1956290006637573, "eval_logps/chosen": -345.8330993652344, "eval_logps/rejected": -336.38427734375, "eval_loss": 0.6919357776641846, "eval_rewards/accuracies": 0.5416666865348816, "eval_rewards/chosen": 2.3678861907683313e-05, "eval_rewards/margins": 0.002057413337752223, "eval_rewards/margins_max": 0.027664856985211372, "eval_rewards/margins_min": -0.02450541965663433, "eval_rewards/margins_std": 0.017513444647192955, "eval_rewards/rejected": -0.002033734694123268, "eval_runtime": 419.0939, "eval_samples_per_second": 9.544, "eval_steps_per_second": 0.15, "step": 100 }, { "epoch": 0.08, "grad_norm": 5.566894084924568, "learning_rate": 4.044117647058823e-07, "logits/chosen": -1.3186091184616089, "logits/rejected": -1.2772490978240967, "logps/chosen": -379.5386657714844, "logps/rejected": -246.4805450439453, "loss": 0.6773, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.007909432053565979, "rewards/margins": 0.024571493268013, "rewards/margins_max": 0.036393627524375916, "rewards/margins_min": 0.012749359011650085, "rewards/margins_std": 0.016719024628400803, "rewards/rejected": -0.01666206307709217, "step": 110 }, { "epoch": 0.09, "grad_norm": 5.5380569300473175, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -0.9861418008804321, "logits/rejected": -1.2131096124649048, "logps/chosen": -280.57135009765625, "logps/rejected": -222.57217407226562, "loss": 0.6696, "rewards/accuracies": 1.0, "rewards/chosen": 0.017649073153734207, "rewards/margins": 0.06698472797870636, "rewards/margins_max": 0.10137734562158585, "rewards/margins_min": 0.032592128962278366, "rewards/margins_std": 0.0486384816467762, "rewards/rejected": -0.04933566227555275, "step": 120 }, { "epoch": 0.1, "grad_norm": 15.361998761868088, "learning_rate": 4.779411764705882e-07, "logits/chosen": -1.0785776376724243, "logits/rejected": -0.898257851600647, "logps/chosen": -283.1363525390625, "logps/rejected": -214.15316772460938, "loss": 0.6611, "rewards/accuracies": 1.0, "rewards/chosen": 0.02150227688252926, "rewards/margins": 0.08108994364738464, "rewards/margins_max": 0.10384353250265121, "rewards/margins_min": 0.05833636596798897, "rewards/margins_std": 0.032178424298763275, "rewards/rejected": -0.059587668627500534, "step": 130 }, { "epoch": 0.1, "grad_norm": 5.451118326133565, "learning_rate": 4.999868030671756e-07, "logits/chosen": -0.9526296854019165, "logits/rejected": -0.9190389513969421, "logps/chosen": -236.9579620361328, "logps/rejected": -269.78240966796875, "loss": 0.6586, "rewards/accuracies": 1.0, "rewards/chosen": 0.015391260385513306, "rewards/margins": 0.07113742083311081, "rewards/margins_max": 0.10363912582397461, "rewards/margins_min": 0.0386357307434082, "rewards/margins_std": 0.04596434161067009, "rewards/rejected": -0.0557461753487587, "step": 140 }, { "epoch": 0.11, "grad_norm": 5.767935898839982, "learning_rate": 4.998383535732973e-07, "logits/chosen": -1.1545963287353516, "logits/rejected": -1.3083815574645996, "logps/chosen": -272.58392333984375, "logps/rejected": -251.5518798828125, "loss": 0.637, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.027133097872138023, "rewards/margins": 0.1410999596118927, "rewards/margins_max": 0.18325701355934143, "rewards/margins_min": 0.09894292801618576, "rewards/margins_std": 0.05961906909942627, "rewards/rejected": -0.11396688222885132, "step": 150 }, { "epoch": 0.12, "grad_norm": 9.075069261969173, "learning_rate": 4.995250566954361e-07, "logits/chosen": -1.2339075803756714, "logits/rejected": -1.3427120447158813, "logps/chosen": -278.045654296875, "logps/rejected": -249.33016967773438, "loss": 0.621, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.021242624148726463, "rewards/margins": 0.1352781355381012, "rewards/margins_max": 0.18264132738113403, "rewards/margins_min": 0.08791494369506836, "rewards/margins_std": 0.06698166579008102, "rewards/rejected": -0.11403550952672958, "step": 160 }, { "epoch": 0.13, "grad_norm": 5.288881821825863, "learning_rate": 4.990471191519357e-07, "logits/chosen": -1.2296701669692993, "logits/rejected": -1.3137729167938232, "logps/chosen": -271.8497009277344, "logps/rejected": -257.36285400390625, "loss": 0.6231, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.027015607804059982, "rewards/margins": 0.2098924219608307, "rewards/margins_max": 0.28914040327072144, "rewards/margins_min": 0.13064439594745636, "rewards/margins_std": 0.1120736226439476, "rewards/rejected": -0.1828767955303192, "step": 170 }, { "epoch": 0.13, "grad_norm": 4.722529871025577, "learning_rate": 4.984048562937129e-07, "logits/chosen": -1.104107141494751, "logits/rejected": -1.2799243927001953, "logps/chosen": -267.16131591796875, "logps/rejected": -320.7081298828125, "loss": 0.599, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.009208987466990948, "rewards/margins": 0.15969006717205048, "rewards/margins_max": 0.21888110041618347, "rewards/margins_min": 0.1004989966750145, "rewards/margins_std": 0.08370877802371979, "rewards/rejected": -0.15048107504844666, "step": 180 }, { "epoch": 0.14, "grad_norm": 4.706882294745904, "learning_rate": 4.975986918961825e-07, "logits/chosen": -1.1564669609069824, "logits/rejected": -1.3084397315979004, "logps/chosen": -287.58294677734375, "logps/rejected": -235.0350799560547, "loss": 0.5751, "rewards/accuracies": 1.0, "rewards/chosen": 0.015282683074474335, "rewards/margins": 0.2435847818851471, "rewards/margins_max": 0.34512418508529663, "rewards/margins_min": 0.14204536378383636, "rewards/margins_std": 0.14359840750694275, "rewards/rejected": -0.22830209136009216, "step": 190 }, { "epoch": 0.15, "grad_norm": 6.10064839157769, "learning_rate": 4.966291578796448e-07, "logits/chosen": -1.2383778095245361, "logits/rejected": -1.2699321508407593, "logps/chosen": -246.54550170898438, "logps/rejected": -299.7005920410156, "loss": 0.5468, "rewards/accuracies": 1.0, "rewards/chosen": 0.00011487379379104823, "rewards/margins": 0.3421292304992676, "rewards/margins_max": 0.5104770064353943, "rewards/margins_min": 0.17378148436546326, "rewards/margins_std": 0.23807969689369202, "rewards/rejected": -0.3420143723487854, "step": 200 }, { "epoch": 0.15, "eval_logits/chosen": -1.1466065645217896, "eval_logits/rejected": -1.1508780717849731, "eval_logps/chosen": -357.1989440917969, "eval_logps/rejected": -350.5012512207031, "eval_loss": 0.679348349571228, "eval_rewards/accuracies": 0.579365074634552, "eval_rewards/chosen": -0.11363494396209717, "eval_rewards/margins": 0.029568513855338097, "eval_rewards/margins_max": 0.24946285784244537, "eval_rewards/margins_min": -0.1965206265449524, "eval_rewards/margins_std": 0.1510881930589676, "eval_rewards/rejected": -0.143203467130661, "eval_runtime": 417.1858, "eval_samples_per_second": 9.588, "eval_steps_per_second": 0.151, "step": 200 }, { "epoch": 0.15, "grad_norm": 7.130935509068585, "learning_rate": 4.954968939583149e-07, "logits/chosen": -0.82276850938797, "logits/rejected": -1.0703377723693848, "logps/chosen": -308.36981201171875, "logps/rejected": -285.35321044921875, "loss": 0.5356, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.06309916079044342, "rewards/margins": 0.28919515013694763, "rewards/margins_max": 0.4261881709098816, "rewards/margins_min": 0.1522020846605301, "rewards/margins_std": 0.1937374323606491, "rewards/rejected": -0.35229426622390747, "step": 210 }, { "epoch": 0.16, "grad_norm": 5.18218978578797, "learning_rate": 4.942026472182297e-07, "logits/chosen": -1.133894681930542, "logits/rejected": -0.9819344282150269, "logps/chosen": -357.5079345703125, "logps/rejected": -290.6125183105469, "loss": 0.5253, "rewards/accuracies": 1.0, "rewards/chosen": -0.12442765384912491, "rewards/margins": 0.420942485332489, "rewards/margins_max": 0.6367592215538025, "rewards/margins_min": 0.20512573421001434, "rewards/margins_std": 0.30521097779273987, "rewards/rejected": -0.5453701615333557, "step": 220 }, { "epoch": 0.17, "grad_norm": 6.451783890738213, "learning_rate": 4.92747271624308e-07, "logits/chosen": -1.1002264022827148, "logits/rejected": -1.1289845705032349, "logps/chosen": -307.14483642578125, "logps/rejected": -330.2859802246094, "loss": 0.494, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.14728474617004395, "rewards/margins": 0.6978201866149902, "rewards/margins_max": 1.030912160873413, "rewards/margins_min": 0.36472827196121216, "rewards/margins_std": 0.47106313705444336, "rewards/rejected": -0.845104992389679, "step": 230 }, { "epoch": 0.18, "grad_norm": 6.092745398297892, "learning_rate": 4.911317274568909e-07, "logits/chosen": -1.1411150693893433, "logits/rejected": -1.1094478368759155, "logps/chosen": -294.82550048828125, "logps/rejected": -408.50970458984375, "loss": 0.4335, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1695319563150406, "rewards/margins": 0.7641543745994568, "rewards/margins_max": 1.1791099309921265, "rewards/margins_min": 0.34919896721839905, "rewards/margins_std": 0.586835503578186, "rewards/rejected": -0.933686375617981, "step": 240 }, { "epoch": 0.18, "grad_norm": 13.168192652840903, "learning_rate": 4.89357080678133e-07, "logits/chosen": -1.0950664281845093, "logits/rejected": -1.240697979927063, "logps/chosen": -269.51092529296875, "logps/rejected": -296.14837646484375, "loss": 0.4457, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.32782456278800964, "rewards/margins": 0.6738765835762024, "rewards/margins_max": 0.9242515563964844, "rewards/margins_min": 0.423501580953598, "rewards/margins_std": 0.35408374667167664, "rewards/rejected": -1.0017011165618896, "step": 250 }, { "epoch": 0.19, "grad_norm": 5.63734344760071, "learning_rate": 4.874245022286637e-07, "logits/chosen": -1.1380219459533691, "logits/rejected": -0.8845139741897583, "logps/chosen": -245.44686889648438, "logps/rejected": -377.0203552246094, "loss": 0.4311, "rewards/accuracies": 1.0, "rewards/chosen": -0.39118385314941406, "rewards/margins": 0.8953431844711304, "rewards/margins_max": 1.3878755569458008, "rewards/margins_min": 0.40281087160110474, "rewards/margins_std": 0.6965457797050476, "rewards/rejected": -1.2865270376205444, "step": 260 }, { "epoch": 0.2, "grad_norm": 5.2265046259602705, "learning_rate": 4.853352672549815e-07, "logits/chosen": -0.9493010640144348, "logits/rejected": -0.9017621874809265, "logps/chosen": -434.3206481933594, "logps/rejected": -370.5262451171875, "loss": 0.4015, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5151541829109192, "rewards/margins": 0.754838764667511, "rewards/margins_max": 1.1408073902130127, "rewards/margins_min": 0.3688700795173645, "rewards/margins_std": 0.5458420515060425, "rewards/rejected": -1.2699930667877197, "step": 270 }, { "epoch": 0.21, "grad_norm": 9.133504257567045, "learning_rate": 4.830907542680918e-07, "logits/chosen": -1.0836373567581177, "logits/rejected": -0.9045012593269348, "logps/chosen": -264.9966125488281, "logps/rejected": -428.46539306640625, "loss": 0.3691, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.5287370681762695, "rewards/margins": 2.0548110008239746, "rewards/margins_max": 3.620879650115967, "rewards/margins_min": 0.48874226212501526, "rewards/margins_std": 2.2147555351257324, "rewards/rejected": -2.583548069000244, "step": 280 }, { "epoch": 0.21, "grad_norm": 5.917073426239516, "learning_rate": 4.806924442339425e-07, "logits/chosen": -1.0086328983306885, "logits/rejected": -0.8821426630020142, "logps/chosen": -305.4242248535156, "logps/rejected": -435.61737060546875, "loss": 0.3813, "rewards/accuracies": 1.0, "rewards/chosen": -0.40681153535842896, "rewards/margins": 1.1146458387374878, "rewards/margins_max": 1.7330601215362549, "rewards/margins_min": 0.4962318539619446, "rewards/margins_std": 0.8745697140693665, "rewards/rejected": -1.5214574337005615, "step": 290 }, { "epoch": 0.22, "grad_norm": 7.285903481113855, "learning_rate": 4.781419195962598e-07, "logits/chosen": -0.997855544090271, "logits/rejected": -0.9541902542114258, "logps/chosen": -299.9017639160156, "logps/rejected": -388.34246826171875, "loss": 0.3597, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.6117764711380005, "rewards/margins": 0.7978827953338623, "rewards/margins_max": 1.1113024950027466, "rewards/margins_min": 0.4844631552696228, "rewards/margins_std": 0.44324231147766113, "rewards/rejected": -1.4096593856811523, "step": 300 }, { "epoch": 0.22, "eval_logits/chosen": -1.0628585815429688, "eval_logits/rejected": -1.051159143447876, "eval_logps/chosen": -439.3020324707031, "eval_logps/rejected": -442.590576171875, "eval_loss": 0.6787940859794617, "eval_rewards/accuracies": 0.5714285969734192, "eval_rewards/chosen": -0.9346656203269958, "eval_rewards/margins": 0.12943138182163239, "eval_rewards/margins_max": 1.008405089378357, "eval_rewards/margins_min": -0.7319620251655579, "eval_rewards/margins_std": 0.5778602361679077, "eval_rewards/rejected": -1.0640968084335327, "eval_runtime": 418.5023, "eval_samples_per_second": 9.558, "eval_steps_per_second": 0.151, "step": 300 }, { "epoch": 0.23, "grad_norm": 10.782754556563047, "learning_rate": 4.754408632324253e-07, "logits/chosen": -1.1973422765731812, "logits/rejected": -0.9350277781486511, "logps/chosen": -318.24627685546875, "logps/rejected": -525.6881103515625, "loss": 0.3331, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.7586840987205505, "rewards/margins": 2.185839891433716, "rewards/margins_max": 2.856729745864868, "rewards/margins_min": 1.5149496793746948, "rewards/margins_std": 0.9487816691398621, "rewards/rejected": -2.944523811340332, "step": 310 }, { "epoch": 0.24, "grad_norm": 5.923061735298404, "learning_rate": 4.725910573430866e-07, "logits/chosen": -1.0679926872253418, "logits/rejected": -0.945013165473938, "logps/chosen": -365.65472412109375, "logps/rejected": -405.1241760253906, "loss": 0.3627, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8854155540466309, "rewards/margins": 0.7383102178573608, "rewards/margins_max": 1.1103546619415283, "rewards/margins_min": 0.3662659227848053, "rewards/margins_std": 0.5261501669883728, "rewards/rejected": -1.6237256526947021, "step": 320 }, { "epoch": 0.24, "grad_norm": 7.098229956454526, "learning_rate": 4.6959438227623293e-07, "logits/chosen": -1.1373931169509888, "logits/rejected": -0.862761378288269, "logps/chosen": -276.69671630859375, "logps/rejected": -535.3623046875, "loss": 0.2945, "rewards/accuracies": 1.0, "rewards/chosen": -0.7917782068252563, "rewards/margins": 2.616485595703125, "rewards/margins_max": 4.648871421813965, "rewards/margins_min": 0.5841000080108643, "rewards/margins_std": 2.874227523803711, "rewards/rejected": -3.40826416015625, "step": 330 }, { "epoch": 0.25, "grad_norm": 8.024599779277368, "learning_rate": 4.664528152865105e-07, "logits/chosen": -0.7721256613731384, "logits/rejected": -0.8172466158866882, "logps/chosen": -349.3388671875, "logps/rejected": -486.68597412109375, "loss": 0.2734, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.1078470945358276, "rewards/margins": 1.6050605773925781, "rewards/margins_max": 2.6409249305725098, "rewards/margins_min": 0.5691961646080017, "rewards/margins_std": 1.4649332761764526, "rewards/rejected": -2.712907552719116, "step": 340 }, { "epoch": 0.26, "grad_norm": 7.882556555729322, "learning_rate": 4.6316842923059816e-07, "logits/chosen": -1.0482970476150513, "logits/rejected": -0.8200104832649231, "logps/chosen": -331.43133544921875, "logps/rejected": -785.7190551757812, "loss": 0.3029, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9469194412231445, "rewards/margins": 4.725480556488037, "rewards/margins_max": 8.024388313293457, "rewards/margins_min": 1.4265724420547485, "rewards/margins_std": 4.665360450744629, "rewards/rejected": -5.672399997711182, "step": 350 }, { "epoch": 0.26, "grad_norm": 11.928720576155937, "learning_rate": 4.5974339119950334e-07, "logits/chosen": -0.9947048425674438, "logits/rejected": -0.8432388305664062, "logps/chosen": -433.1314392089844, "logps/rejected": -583.6080932617188, "loss": 0.2952, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.5369694232940674, "rewards/margins": 1.8583523035049438, "rewards/margins_max": 2.645268678665161, "rewards/margins_min": 1.0714359283447266, "rewards/margins_std": 1.1128677129745483, "rewards/rejected": -3.3953216075897217, "step": 360 }, { "epoch": 0.27, "grad_norm": 5.540434948793406, "learning_rate": 4.5617996108867997e-07, "logits/chosen": -0.8581298589706421, "logits/rejected": -0.3961424231529236, "logps/chosen": -412.405517578125, "logps/rejected": -812.7819213867188, "loss": 0.2262, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.617913007736206, "rewards/margins": 4.383803844451904, "rewards/margins_max": 7.294039249420166, "rewards/margins_min": 1.4735687971115112, "rewards/margins_std": 4.115694522857666, "rewards/rejected": -6.001717567443848, "step": 370 }, { "epoch": 0.28, "grad_norm": 17.370609516247765, "learning_rate": 4.5248049010691304e-07, "logits/chosen": -1.0891549587249756, "logits/rejected": -0.69083172082901, "logps/chosen": -347.2943420410156, "logps/rejected": -703.4866943359375, "loss": 0.2504, "rewards/accuracies": 1.0, "rewards/chosen": -1.3099722862243652, "rewards/margins": 3.3620052337646484, "rewards/margins_max": 5.631108283996582, "rewards/margins_min": 1.092902421951294, "rewards/margins_std": 3.208995819091797, "rewards/rejected": -4.6719770431518555, "step": 380 }, { "epoch": 0.29, "grad_norm": 9.292751258662012, "learning_rate": 4.486474192249533e-07, "logits/chosen": -1.0247005224227905, "logits/rejected": -0.6028069853782654, "logps/chosen": -442.56671142578125, "logps/rejected": -660.4315185546875, "loss": 0.2063, "rewards/accuracies": 1.0, "rewards/chosen": -1.4350707530975342, "rewards/margins": 2.895498514175415, "rewards/margins_max": 3.9468486309051514, "rewards/margins_min": 1.8441476821899414, "rewards/margins_std": 1.4868338108062744, "rewards/rejected": -4.330569267272949, "step": 390 }, { "epoch": 0.29, "grad_norm": 5.8794814274178755, "learning_rate": 4.4468327756492504e-07, "logits/chosen": -0.7380314469337463, "logits/rejected": -0.5135469436645508, "logps/chosen": -366.635986328125, "logps/rejected": -607.7274780273438, "loss": 0.2059, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.5760023593902588, "rewards/margins": 2.5897469520568848, "rewards/margins_max": 3.876375913619995, "rewards/margins_min": 1.3031187057495117, "rewards/margins_std": 1.8195674419403076, "rewards/rejected": -4.165749549865723, "step": 400 }, { "epoch": 0.29, "eval_logits/chosen": -0.8807379603385925, "eval_logits/rejected": -0.8695055842399597, "eval_logps/chosen": -542.6320190429688, "eval_logps/rejected": -566.7861938476562, "eval_loss": 0.7172051072120667, "eval_rewards/accuracies": 0.5972222089767456, "eval_rewards/chosen": -1.9679654836654663, "eval_rewards/margins": 0.3380873501300812, "eval_rewards/margins_max": 2.344252109527588, "eval_rewards/margins_min": -1.388581395149231, "eval_rewards/margins_std": 1.2205023765563965, "eval_rewards/rejected": -2.3060529232025146, "eval_runtime": 415.548, "eval_samples_per_second": 9.626, "eval_steps_per_second": 0.152, "step": 400 }, { "epoch": 0.3, "grad_norm": 11.887878225278437, "learning_rate": 4.405906807315705e-07, "logits/chosen": -0.7631363868713379, "logits/rejected": -0.14442148804664612, "logps/chosen": -412.6502990722656, "logps/rejected": -617.9203491210938, "loss": 0.1867, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8327445983886719, "rewards/margins": 2.406796932220459, "rewards/margins_max": 3.8745861053466797, "rewards/margins_min": 0.9390074014663696, "rewards/margins_std": 2.075767755508423, "rewards/rejected": -4.239541530609131, "step": 410 }, { "epoch": 0.31, "grad_norm": 7.6667134274072195, "learning_rate": 4.363723290864314e-07, "logits/chosen": -0.8663452863693237, "logits/rejected": -0.10104439407587051, "logps/chosen": -507.49078369140625, "logps/rejected": -824.9513549804688, "loss": 0.221, "rewards/accuracies": 1.0, "rewards/chosen": -2.1396355628967285, "rewards/margins": 4.1518402099609375, "rewards/margins_max": 6.126175403594971, "rewards/margins_min": 2.1775054931640625, "rewards/margins_std": 2.792131185531616, "rewards/rejected": -6.291476249694824, "step": 420 }, { "epoch": 0.32, "grad_norm": 15.491732727187143, "learning_rate": 4.3203100596610723e-07, "logits/chosen": -0.5918745398521423, "logits/rejected": -0.1715858429670334, "logps/chosen": -453.0254821777344, "logps/rejected": -597.4471435546875, "loss": 0.1938, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7686259746551514, "rewards/margins": 2.167701005935669, "rewards/margins_max": 3.2129874229431152, "rewards/margins_min": 1.1224141120910645, "rewards/margins_std": 1.4782588481903076, "rewards/rejected": -3.9363269805908203, "step": 430 }, { "epoch": 0.32, "grad_norm": 8.708872027507127, "learning_rate": 4.2756957584576436e-07, "logits/chosen": -0.584081768989563, "logits/rejected": 0.096702441573143, "logps/chosen": -451.47509765625, "logps/rejected": -978.1886596679688, "loss": 0.2022, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9495502710342407, "rewards/margins": 5.0804595947265625, "rewards/margins_max": 8.66343879699707, "rewards/margins_min": 1.4974806308746338, "rewards/margins_std": 5.0670976638793945, "rewards/rejected": -7.0300092697143555, "step": 440 }, { "epoch": 0.33, "grad_norm": 7.538469505929578, "learning_rate": 4.22990982449109e-07, "logits/chosen": -0.6104982495307922, "logits/rejected": -0.21484926342964172, "logps/chosen": -472.439453125, "logps/rejected": -757.8245849609375, "loss": 0.1417, "rewards/accuracies": 1.0, "rewards/chosen": -2.049053192138672, "rewards/margins": 3.2169277667999268, "rewards/margins_max": 4.436863422393799, "rewards/margins_min": 1.9969921112060547, "rewards/margins_std": 1.7252495288848877, "rewards/rejected": -5.2659807205200195, "step": 450 }, { "epoch": 0.34, "grad_norm": 10.825278124877386, "learning_rate": 4.1829824680607104e-07, "logits/chosen": -0.419607937335968, "logits/rejected": 0.11389993131160736, "logps/chosen": -435.0726623535156, "logps/rejected": -784.734130859375, "loss": 0.166, "rewards/accuracies": 1.0, "rewards/chosen": -1.990121841430664, "rewards/margins": 3.3705692291259766, "rewards/margins_max": 5.105216026306152, "rewards/margins_min": 1.6359226703643799, "rewards/margins_std": 2.4531607627868652, "rewards/rejected": -5.360690593719482, "step": 460 }, { "epoch": 0.35, "grad_norm": 9.234625136932591, "learning_rate": 4.134944652594794e-07, "logits/chosen": -0.5118550062179565, "logits/rejected": 0.10812608152627945, "logps/chosen": -453.38848876953125, "logps/rejected": -1031.4366455078125, "loss": 0.1243, "rewards/accuracies": 1.0, "rewards/chosen": -2.346630096435547, "rewards/margins": 5.6441521644592285, "rewards/margins_max": 8.810213088989258, "rewards/margins_min": 2.4780914783477783, "rewards/margins_std": 4.4774861335754395, "rewards/rejected": -7.990782260894775, "step": 470 }, { "epoch": 0.35, "grad_norm": 6.8345938121765775, "learning_rate": 4.085828074220451e-07, "logits/chosen": -0.4821593165397644, "logits/rejected": 0.33621591329574585, "logps/chosen": -612.152587890625, "logps/rejected": -944.8914794921875, "loss": 0.142, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.5179717540740967, "rewards/margins": 4.587340354919434, "rewards/margins_max": 7.297093868255615, "rewards/margins_min": 1.8775880336761475, "rewards/margins_std": 3.8321690559387207, "rewards/rejected": -7.105312347412109, "step": 480 }, { "epoch": 0.36, "grad_norm": 11.122230946658236, "learning_rate": 4.035665140849994e-07, "logits/chosen": -0.2719888985157013, "logits/rejected": 0.40051668882369995, "logps/chosen": -519.849365234375, "logps/rejected": -941.7233276367188, "loss": 0.1233, "rewards/accuracies": 1.0, "rewards/chosen": -2.556196689605713, "rewards/margins": 4.605846881866455, "rewards/margins_max": 6.096743106842041, "rewards/margins_min": 3.1149520874023438, "rewards/margins_std": 2.1084442138671875, "rewards/rejected": -7.162044525146484, "step": 490 }, { "epoch": 0.37, "grad_norm": 4.543083572509446, "learning_rate": 3.984488950797678e-07, "logits/chosen": -0.19994431734085083, "logits/rejected": 0.6510161757469177, "logps/chosen": -450.4979553222656, "logps/rejected": -926.5679931640625, "loss": 0.1354, "rewards/accuracies": 1.0, "rewards/chosen": -2.4891421794891357, "rewards/margins": 4.708044052124023, "rewards/margins_max": 7.425878047943115, "rewards/margins_min": 1.9902098178863525, "rewards/margins_std": 3.8435981273651123, "rewards/rejected": -7.197185516357422, "step": 500 }, { "epoch": 0.37, "eval_logits/chosen": -0.25537678599357605, "eval_logits/rejected": -0.16171453893184662, "eval_logps/chosen": -661.367431640625, "eval_logps/rejected": -714.6080322265625, "eval_loss": 0.8081530928611755, "eval_rewards/accuracies": 0.6190476417541504, "eval_rewards/chosen": -3.155320167541504, "eval_rewards/margins": 0.6289510130882263, "eval_rewards/margins_max": 4.081821918487549, "eval_rewards/margins_min": -2.2017109394073486, "eval_rewards/margins_std": 2.03205943107605, "eval_rewards/rejected": -3.784270763397217, "eval_runtime": 416.2564, "eval_samples_per_second": 9.609, "eval_steps_per_second": 0.151, "step": 500 }, { "epoch": 0.38, "grad_norm": 11.627659490001143, "learning_rate": 3.9323332709408904e-07, "logits/chosen": -0.09876732528209686, "logits/rejected": 1.3991271257400513, "logps/chosen": -600.6998291015625, "logps/rejected": -968.8531494140625, "loss": 0.1308, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.3052432537078857, "rewards/margins": 4.294064998626709, "rewards/margins_max": 6.973275184631348, "rewards/margins_min": 1.614854097366333, "rewards/margins_std": 3.788975954055786, "rewards/rejected": -7.599307060241699, "step": 510 }, { "epoch": 0.38, "grad_norm": 8.873005540995397, "learning_rate": 3.879232514440227e-07, "logits/chosen": -0.3379233479499817, "logits/rejected": 0.6603206992149353, "logps/chosen": -618.7060546875, "logps/rejected": -1049.278076171875, "loss": 0.1475, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.1586403846740723, "rewards/margins": 4.834142208099365, "rewards/margins_max": 6.6787214279174805, "rewards/margins_min": 2.989562511444092, "rewards/margins_std": 2.6086299419403076, "rewards/rejected": -7.992783546447754, "step": 520 }, { "epoch": 0.39, "grad_norm": 17.879342011641224, "learning_rate": 3.825221718033129e-07, "logits/chosen": 0.0034618079662323, "logits/rejected": 0.864820122718811, "logps/chosen": -471.9354553222656, "logps/rejected": -985.2346801757812, "loss": 0.1082, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.823118209838867, "rewards/margins": 5.269505023956299, "rewards/margins_max": 8.90275764465332, "rewards/margins_min": 1.6362518072128296, "rewards/margins_std": 5.138195991516113, "rewards/rejected": -8.092622756958008, "step": 530 }, { "epoch": 0.4, "grad_norm": 6.390466873902363, "learning_rate": 3.7703365189160746e-07, "logits/chosen": -0.07338769733905792, "logits/rejected": 1.4749701023101807, "logps/chosen": -539.89697265625, "logps/rejected": -1210.6910400390625, "loss": 0.089, "rewards/accuracies": 1.0, "rewards/chosen": -2.919481039047241, "rewards/margins": 7.211228370666504, "rewards/margins_max": 11.77415943145752, "rewards/margins_min": 2.6482949256896973, "rewards/margins_std": 6.452960968017578, "rewards/rejected": -10.130708694458008, "step": 540 }, { "epoch": 0.4, "grad_norm": 12.554873275869042, "learning_rate": 3.714613131230587e-07, "logits/chosen": -0.22135767340660095, "logits/rejected": 1.1000282764434814, "logps/chosen": -720.9986572265625, "logps/rejected": -1223.421630859375, "loss": 0.1223, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7738468647003174, "rewards/margins": 5.606228828430176, "rewards/margins_max": 8.233736038208008, "rewards/margins_min": 2.9787204265594482, "rewards/margins_std": 3.715857744216919, "rewards/rejected": -9.380073547363281, "step": 550 }, { "epoch": 0.41, "grad_norm": 40.923616793220184, "learning_rate": 3.6580883221685533e-07, "logits/chosen": -0.0870949998497963, "logits/rejected": 1.078148603439331, "logps/chosen": -505.99774169921875, "logps/rejected": -1176.008544921875, "loss": 0.0862, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.822312831878662, "rewards/margins": 5.737250328063965, "rewards/margins_max": 8.857365608215332, "rewards/margins_min": 2.6171350479125977, "rewards/margins_std": 4.412509918212891, "rewards/rejected": -8.559562683105469, "step": 560 }, { "epoch": 0.42, "grad_norm": 2.377000403316867, "learning_rate": 3.6007993877126386e-07, "logits/chosen": 0.25743845105171204, "logits/rejected": 2.0459682941436768, "logps/chosen": -640.0938110351562, "logps/rejected": -1272.0159912109375, "loss": 0.1269, "rewards/accuracies": 1.0, "rewards/chosen": -3.934041976928711, "rewards/margins": 6.4811530113220215, "rewards/margins_max": 10.410442352294922, "rewards/margins_min": 2.5518646240234375, "rewards/margins_std": 5.556853294372559, "rewards/rejected": -10.415196418762207, "step": 570 }, { "epoch": 0.43, "grad_norm": 6.765929979770598, "learning_rate": 3.5427841280277937e-07, "logits/chosen": 0.19738076627254486, "logits/rejected": 1.5706841945648193, "logps/chosen": -643.2400512695312, "logps/rejected": -1103.7618408203125, "loss": 0.1024, "rewards/accuracies": 1.0, "rewards/chosen": -3.439357280731201, "rewards/margins": 4.518318176269531, "rewards/margins_max": 6.311240196228027, "rewards/margins_min": 2.725395441055298, "rewards/margins_std": 2.5355746746063232, "rewards/rejected": -7.957674980163574, "step": 580 }, { "epoch": 0.43, "grad_norm": 2.3572788749229394, "learning_rate": 3.484080822520096e-07, "logits/chosen": 0.4655560553073883, "logits/rejected": 1.286608099937439, "logps/chosen": -555.6957397460938, "logps/rejected": -1019.0916748046875, "loss": 0.1491, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.497023820877075, "rewards/margins": 4.476337432861328, "rewards/margins_max": 6.756206512451172, "rewards/margins_min": 2.1964690685272217, "rewards/margins_std": 3.2242209911346436, "rewards/rejected": -7.973361968994141, "step": 590 }, { "epoch": 0.44, "grad_norm": 8.25918903118385, "learning_rate": 3.4247282045793797e-07, "logits/chosen": 0.2085554599761963, "logits/rejected": 1.3560742139816284, "logps/chosen": -595.1603393554688, "logps/rejected": -1199.1165771484375, "loss": 0.1327, "rewards/accuracies": 1.0, "rewards/chosen": -3.877821445465088, "rewards/margins": 6.124663352966309, "rewards/margins_max": 9.755376815795898, "rewards/margins_min": 2.493950843811035, "rewards/margins_std": 5.134603500366211, "rewards/rejected": -10.002485275268555, "step": 600 }, { "epoch": 0.44, "eval_logits/chosen": 0.017259376123547554, "eval_logits/rejected": 0.1599506437778473, "eval_logps/chosen": -731.00927734375, "eval_logps/rejected": -798.1055908203125, "eval_loss": 0.8436357378959656, "eval_rewards/accuracies": 0.6190476417541504, "eval_rewards/chosen": -3.851738452911377, "eval_rewards/margins": 0.7675079107284546, "eval_rewards/margins_max": 4.83132266998291, "eval_rewards/margins_min": -2.431659460067749, "eval_rewards/margins_std": 2.352627992630005, "eval_rewards/rejected": -4.619246482849121, "eval_runtime": 415.8421, "eval_samples_per_second": 9.619, "eval_steps_per_second": 0.151, "step": 600 }, { "epoch": 0.45, "grad_norm": 13.982869383101132, "learning_rate": 3.3647654360223144e-07, "logits/chosen": -0.18186531960964203, "logits/rejected": 1.947683572769165, "logps/chosen": -636.12548828125, "logps/rejected": -1468.92333984375, "loss": 0.08, "rewards/accuracies": 1.0, "rewards/chosen": -3.338965654373169, "rewards/margins": 8.887590408325195, "rewards/margins_max": 12.813148498535156, "rewards/margins_min": 4.962031364440918, "rewards/margins_std": 5.551577568054199, "rewards/rejected": -12.226556777954102, "step": 610 }, { "epoch": 0.46, "grad_norm": 24.68214704261548, "learning_rate": 3.30423208125281e-07, "logits/chosen": -0.13235849142074585, "logits/rejected": 1.7915821075439453, "logps/chosen": -697.5199584960938, "logps/rejected": -1485.5936279296875, "loss": 0.0765, "rewards/accuracies": 1.0, "rewards/chosen": -3.4117112159729004, "rewards/margins": 8.921293258666992, "rewards/margins_max": 12.249357223510742, "rewards/margins_min": 5.593228340148926, "rewards/margins_std": 4.7065935134887695, "rewards/rejected": -12.333003044128418, "step": 620 }, { "epoch": 0.46, "grad_norm": 10.905617995495655, "learning_rate": 3.2431680811567833e-07, "logits/chosen": -0.12053610384464264, "logits/rejected": 1.8949730396270752, "logps/chosen": -630.9464111328125, "logps/rejected": -1220.925048828125, "loss": 0.1229, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.5450587272644043, "rewards/margins": 6.405971527099609, "rewards/margins_max": 10.655710220336914, "rewards/margins_min": 2.1562342643737793, "rewards/margins_std": 6.010036945343018, "rewards/rejected": -9.951030731201172, "step": 630 }, { "epoch": 0.47, "grad_norm": 10.94150157360822, "learning_rate": 3.1816137267485136e-07, "logits/chosen": 0.027946263551712036, "logits/rejected": 1.485925555229187, "logps/chosen": -646.646728515625, "logps/rejected": -1238.3758544921875, "loss": 0.1477, "rewards/accuracies": 1.0, "rewards/chosen": -3.6477348804473877, "rewards/margins": 6.220660209655762, "rewards/margins_max": 9.265599250793457, "rewards/margins_min": 3.1757211685180664, "rewards/margins_std": 4.306193828582764, "rewards/rejected": -9.86839485168457, "step": 640 }, { "epoch": 0.48, "grad_norm": 17.595942677326722, "learning_rate": 3.1196096325859815e-07, "logits/chosen": -0.05433236435055733, "logits/rejected": 2.2038755416870117, "logps/chosen": -578.5730590820312, "logps/rejected": -1498.58203125, "loss": 0.1156, "rewards/accuracies": 1.0, "rewards/chosen": -3.005871534347534, "rewards/margins": 9.676332473754883, "rewards/margins_max": 15.599041938781738, "rewards/margins_min": 3.753622531890869, "rewards/margins_std": 8.3759765625, "rewards/rejected": -12.68220329284668, "step": 650 }, { "epoch": 0.49, "grad_norm": 7.356684331269297, "learning_rate": 3.057196709972727e-07, "logits/chosen": 0.11046739667654037, "logits/rejected": 2.175269365310669, "logps/chosen": -674.2919921875, "logps/rejected": -1267.6500244140625, "loss": 0.0959, "rewards/accuracies": 1.0, "rewards/chosen": -3.1146275997161865, "rewards/margins": 7.452083587646484, "rewards/margins_max": 10.800088882446289, "rewards/margins_min": 4.104078769683838, "rewards/margins_std": 4.734793663024902, "rewards/rejected": -10.56671142578125, "step": 660 }, { "epoch": 0.49, "grad_norm": 7.038311259187171, "learning_rate": 2.9944161399639086e-07, "logits/chosen": 0.21353694796562195, "logits/rejected": 1.7908731698989868, "logps/chosen": -616.1519165039062, "logps/rejected": -1157.595947265625, "loss": 0.0791, "rewards/accuracies": 1.0, "rewards/chosen": -3.5106310844421387, "rewards/margins": 5.999436855316162, "rewards/margins_max": 8.261363983154297, "rewards/margins_min": 3.7375106811523438, "rewards/margins_std": 3.198847532272339, "rewards/rejected": -9.510068893432617, "step": 670 }, { "epoch": 0.5, "grad_norm": 3.3985205014158293, "learning_rate": 2.9313093461943824e-07, "logits/chosen": 0.07152876257896423, "logits/rejected": 1.9080642461776733, "logps/chosen": -658.859619140625, "logps/rejected": -1418.1920166015625, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": -3.6242880821228027, "rewards/margins": 8.017306327819824, "rewards/margins_max": 11.628385543823242, "rewards/margins_min": 4.40622615814209, "rewards/margins_std": 5.106837272644043, "rewards/rejected": -11.641593933105469, "step": 680 }, { "epoch": 0.51, "grad_norm": 12.970507914933444, "learning_rate": 2.8679179675467104e-07, "logits/chosen": 0.5070677995681763, "logits/rejected": 2.8454136848449707, "logps/chosen": -661.779296875, "logps/rejected": -1588.948974609375, "loss": 0.0704, "rewards/accuracies": 1.0, "rewards/chosen": -4.405068397521973, "rewards/margins": 9.361727714538574, "rewards/margins_max": 15.706764221191406, "rewards/margins_min": 3.016690731048584, "rewards/margins_std": 8.973237037658691, "rewards/rejected": -13.766797065734863, "step": 690 }, { "epoch": 0.52, "grad_norm": 25.37176614242638, "learning_rate": 2.80428383067716e-07, "logits/chosen": -0.056868601590394974, "logits/rejected": 2.1195578575134277, "logps/chosen": -643.5035400390625, "logps/rejected": -1405.5491943359375, "loss": 0.0777, "rewards/accuracies": 1.0, "rewards/chosen": -3.8768184185028076, "rewards/margins": 7.676672458648682, "rewards/margins_max": 11.38581657409668, "rewards/margins_min": 3.967529296875, "rewards/margins_std": 5.245521545410156, "rewards/rejected": -11.553489685058594, "step": 700 }, { "epoch": 0.52, "eval_logits/chosen": 0.4162614345550537, "eval_logits/rejected": 0.6300503015518188, "eval_logps/chosen": -840.1605224609375, "eval_logps/rejected": -929.0051879882812, "eval_loss": 0.9893194437026978, "eval_rewards/accuracies": 0.6190476417541504, "eval_rewards/chosen": -4.943249225616455, "eval_rewards/margins": 0.9849926233291626, "eval_rewards/margins_max": 6.353243827819824, "eval_rewards/margins_min": -3.295872688293457, "eval_rewards/margins_std": 3.1250360012054443, "eval_rewards/rejected": -5.9282426834106445, "eval_runtime": 421.7747, "eval_samples_per_second": 9.484, "eval_steps_per_second": 0.149, "step": 700 }, { "epoch": 0.52, "grad_norm": 8.759540837366416, "learning_rate": 2.7404489224177973e-07, "logits/chosen": 0.6560094356536865, "logits/rejected": 3.2553603649139404, "logps/chosen": -783.5775756835938, "logps/rejected": -1650.650146484375, "loss": 0.1101, "rewards/accuracies": 1.0, "rewards/chosen": -4.931197166442871, "rewards/margins": 8.854988098144531, "rewards/margins_max": 12.33712100982666, "rewards/margins_min": 5.372857093811035, "rewards/margins_std": 4.924478054046631, "rewards/rejected": -13.786186218261719, "step": 710 }, { "epoch": 0.53, "grad_norm": 53.838974553307395, "learning_rate": 2.676455362072894e-07, "logits/chosen": 0.9320627450942993, "logits/rejected": 3.438016414642334, "logps/chosen": -699.7535400390625, "logps/rejected": -1655.8385009765625, "loss": 0.0852, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.045767307281494, "rewards/margins": 9.726736068725586, "rewards/margins_max": 13.513631820678711, "rewards/margins_min": 5.939839839935303, "rewards/margins_std": 5.355479717254639, "rewards/rejected": -14.772501945495605, "step": 720 }, { "epoch": 0.54, "grad_norm": 0.8391615669250567, "learning_rate": 2.612345373627937e-07, "logits/chosen": 0.2621687650680542, "logits/rejected": 1.9230273962020874, "logps/chosen": -639.4342041015625, "logps/rejected": -1445.03271484375, "loss": 0.1804, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.050388336181641, "rewards/margins": 8.062755584716797, "rewards/margins_max": 11.662395477294922, "rewards/margins_min": 4.4631171226501465, "rewards/margins_std": 5.090658664703369, "rewards/rejected": -12.113143920898438, "step": 730 }, { "epoch": 0.54, "grad_norm": 18.77671464276547, "learning_rate": 2.54816125788955e-07, "logits/chosen": 0.5534690022468567, "logits/rejected": 2.526615858078003, "logps/chosen": -709.9898681640625, "logps/rejected": -1459.970947265625, "loss": 0.1361, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.665217399597168, "rewards/margins": 7.5482282638549805, "rewards/margins_max": 12.083941459655762, "rewards/margins_min": 3.0125153064727783, "rewards/margins_std": 6.414466857910156, "rewards/rejected": -12.213445663452148, "step": 740 }, { "epoch": 0.55, "grad_norm": 6.37822813578148, "learning_rate": 2.4839453645747467e-07, "logits/chosen": 0.2104567587375641, "logits/rejected": 1.8120098114013672, "logps/chosen": -643.4108276367188, "logps/rejected": -1417.44921875, "loss": 0.1312, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7755866050720215, "rewards/margins": 8.031917572021484, "rewards/margins_max": 12.62381362915039, "rewards/margins_min": 3.440018892288208, "rewards/margins_std": 6.493924140930176, "rewards/rejected": -11.807502746582031, "step": 750 }, { "epoch": 0.56, "grad_norm": 16.8248388008373, "learning_rate": 2.4197400643678987e-07, "logits/chosen": 0.24539189040660858, "logits/rejected": 1.6847679615020752, "logps/chosen": -639.7948608398438, "logps/rejected": -1011.7283935546875, "loss": 0.0821, "rewards/accuracies": 1.0, "rewards/chosen": -3.634861707687378, "rewards/margins": 4.383325576782227, "rewards/margins_max": 7.218289852142334, "rewards/margins_min": 1.5483614206314087, "rewards/margins_std": 4.009244918823242, "rewards/rejected": -8.018186569213867, "step": 760 }, { "epoch": 0.57, "grad_norm": 7.954736320138308, "learning_rate": 2.3555877209638726e-07, "logits/chosen": 0.0611066035926342, "logits/rejected": 1.33302640914917, "logps/chosen": -672.7412719726562, "logps/rejected": -1782.3125, "loss": 0.0906, "rewards/accuracies": 1.0, "rewards/chosen": -3.4519970417022705, "rewards/margins": 11.562549591064453, "rewards/margins_max": 20.07329559326172, "rewards/margins_min": 3.0518016815185547, "rewards/margins_std": 12.036015510559082, "rewards/rejected": -15.014546394348145, "step": 770 }, { "epoch": 0.57, "grad_norm": 11.05108228058454, "learning_rate": 2.2915306631157817e-07, "logits/chosen": 0.2885664105415344, "logits/rejected": 2.206385612487793, "logps/chosen": -648.3999633789062, "logps/rejected": -1299.401123046875, "loss": 0.1085, "rewards/accuracies": 1.0, "rewards/chosen": -3.7839324474334717, "rewards/margins": 7.008673191070557, "rewards/margins_max": 9.869766235351562, "rewards/margins_min": 4.147579669952393, "rewards/margins_std": 4.046196937561035, "rewards/rejected": -10.792604446411133, "step": 780 }, { "epoch": 0.58, "grad_norm": 32.49887802957626, "learning_rate": 2.2276111567057887e-07, "logits/chosen": 0.22940261662006378, "logits/rejected": 1.6958719491958618, "logps/chosen": -593.3724365234375, "logps/rejected": -1174.9674072265625, "loss": 0.1111, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.6294262409210205, "rewards/margins": 5.80316162109375, "rewards/margins_max": 8.796818733215332, "rewards/margins_min": 2.8095040321350098, "rewards/margins_std": 4.233671188354492, "rewards/rejected": -9.432588577270508, "step": 790 }, { "epoch": 0.59, "grad_norm": 7.06163362995566, "learning_rate": 2.1638713768573936e-07, "logits/chosen": 0.06335971504449844, "logits/rejected": 1.4285287857055664, "logps/chosen": -595.5140380859375, "logps/rejected": -1295.677490234375, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": -3.581036329269409, "rewards/margins": 7.286231994628906, "rewards/margins_max": 11.22960090637207, "rewards/margins_min": 3.342862606048584, "rewards/margins_std": 5.576765537261963, "rewards/rejected": -10.867268562316895, "step": 800 }, { "epoch": 0.59, "eval_logits/chosen": 0.12438549101352692, "eval_logits/rejected": 0.28890377283096313, "eval_logps/chosen": -732.38525390625, "eval_logps/rejected": -799.7516479492188, "eval_loss": 0.8086485862731934, "eval_rewards/accuracies": 0.6190476417541504, "eval_rewards/chosen": -3.8654978275299072, "eval_rewards/margins": 0.7702099680900574, "eval_rewards/margins_max": 4.502103328704834, "eval_rewards/margins_min": -2.291940450668335, "eval_rewards/margins_std": 2.2426791191101074, "eval_rewards/rejected": -4.635707378387451, "eval_runtime": 417.0386, "eval_samples_per_second": 9.591, "eval_steps_per_second": 0.151, "step": 800 }, { "epoch": 0.6, "grad_norm": 5.3968517354258125, "learning_rate": 2.100353380107609e-07, "logits/chosen": 0.23273587226867676, "logits/rejected": 1.9462811946868896, "logps/chosen": -776.3011474609375, "logps/rejected": -1441.837158203125, "loss": 0.1, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.670359134674072, "rewards/margins": 7.1968560218811035, "rewards/margins_max": 11.221755981445312, "rewards/margins_min": 3.171954393386841, "rewards/margins_std": 5.6920695304870605, "rewards/rejected": -11.86721420288086, "step": 810 }, { "epoch": 0.6, "grad_norm": 15.03932252297939, "learning_rate": 2.0370990766573698e-07, "logits/chosen": -0.10733046382665634, "logits/rejected": 1.8043702840805054, "logps/chosen": -650.6616821289062, "logps/rejected": -1616.010986328125, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": -3.1738812923431396, "rewards/margins": 10.541234970092773, "rewards/margins_max": 15.04127311706543, "rewards/margins_min": 6.041195392608643, "rewards/margins_std": 6.364017009735107, "rewards/rejected": -13.715115547180176, "step": 820 }, { "epoch": 0.61, "grad_norm": 31.097886733723477, "learning_rate": 1.974150202718513e-07, "logits/chosen": 0.08039845526218414, "logits/rejected": 2.343336582183838, "logps/chosen": -534.8485717773438, "logps/rejected": -1418.825439453125, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": -3.1201188564300537, "rewards/margins": 8.804471969604492, "rewards/margins_max": 12.42898178100586, "rewards/margins_min": 5.179962635040283, "rewards/margins_std": 5.12583065032959, "rewards/rejected": -11.924591064453125, "step": 830 }, { "epoch": 0.62, "grad_norm": 16.60986174297272, "learning_rate": 1.9115482929755445e-07, "logits/chosen": 0.24223566055297852, "logits/rejected": 1.6932157278060913, "logps/chosen": -570.802978515625, "logps/rejected": -1331.71533203125, "loss": 0.0856, "rewards/accuracies": 1.0, "rewards/chosen": -3.493419647216797, "rewards/margins": 7.847373962402344, "rewards/margins_max": 11.789865493774414, "rewards/margins_min": 3.9048819541931152, "rewards/margins_std": 5.575525760650635, "rewards/rejected": -11.34079360961914, "step": 840 }, { "epoch": 0.63, "grad_norm": 13.502668975247548, "learning_rate": 1.8493346531803887e-07, "logits/chosen": 0.48027992248535156, "logits/rejected": 2.202148675918579, "logps/chosen": -596.4915161132812, "logps/rejected": -1282.6644287109375, "loss": 0.0983, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.9702115058898926, "rewards/margins": 6.948336124420166, "rewards/margins_max": 9.718558311462402, "rewards/margins_min": 4.178112506866455, "rewards/margins_std": 3.9176864624023438, "rewards/rejected": -10.918546676635742, "step": 850 }, { "epoch": 0.63, "grad_norm": 23.139494234389517, "learning_rate": 1.7875503328981807e-07, "logits/chosen": 0.3601033091545105, "logits/rejected": 2.474608898162842, "logps/chosen": -652.9142456054688, "logps/rejected": -1604.696533203125, "loss": 0.0605, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.9101157188415527, "rewards/margins": 9.86131477355957, "rewards/margins_max": 14.181074142456055, "rewards/margins_min": 5.541555881500244, "rewards/margins_std": 6.1090617179870605, "rewards/rejected": -13.771429061889648, "step": 860 }, { "epoch": 0.64, "grad_norm": 14.349796836286524, "learning_rate": 1.7262360984221006e-07, "logits/chosen": 0.012769157998263836, "logits/rejected": 1.9421314001083374, "logps/chosen": -664.3881225585938, "logps/rejected": -1434.8948974609375, "loss": 0.1274, "rewards/accuracies": 1.0, "rewards/chosen": -3.80120849609375, "rewards/margins": 8.072778701782227, "rewards/margins_max": 11.85603141784668, "rewards/margins_min": 4.289526462554932, "rewards/margins_std": 5.350326061248779, "rewards/rejected": -11.873987197875977, "step": 870 }, { "epoch": 0.65, "grad_norm": 15.485607186999017, "learning_rate": 1.6654324058751175e-07, "logits/chosen": 0.3775918483734131, "logits/rejected": 1.973515510559082, "logps/chosen": -713.2658081054688, "logps/rejected": -1631.826904296875, "loss": 0.0662, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.8164262771606445, "rewards/margins": 9.047931671142578, "rewards/margins_max": 12.981298446655273, "rewards/margins_min": 5.114563941955566, "rewards/margins_std": 5.562621116638184, "rewards/rejected": -13.864356994628906, "step": 880 }, { "epoch": 0.65, "grad_norm": 15.607832046954224, "learning_rate": 1.6051793745163812e-07, "logits/chosen": 0.6472679376602173, "logits/rejected": 2.5574803352355957, "logps/chosen": -689.5281982421875, "logps/rejected": -1642.12109375, "loss": 0.1011, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.642444610595703, "rewards/margins": 9.717334747314453, "rewards/margins_max": 15.204099655151367, "rewards/margins_min": 4.230566501617432, "rewards/margins_std": 7.75946044921875, "rewards/rejected": -14.359777450561523, "step": 890 }, { "epoch": 0.66, "grad_norm": 45.565136882193066, "learning_rate": 1.5455167602698915e-07, "logits/chosen": 0.06020700931549072, "logits/rejected": 2.2921700477600098, "logps/chosen": -727.0872192382812, "logps/rejected": -1482.33837890625, "loss": 0.0997, "rewards/accuracies": 1.0, "rewards/chosen": -4.481110572814941, "rewards/margins": 7.914282321929932, "rewards/margins_max": 10.535211563110352, "rewards/margins_min": 5.293350696563721, "rewards/margins_std": 3.7065558433532715, "rewards/rejected": -12.395392417907715, "step": 900 }, { "epoch": 0.66, "eval_logits/chosen": 0.20550121366977692, "eval_logits/rejected": 0.3917555809020996, "eval_logps/chosen": -789.8953857421875, "eval_logps/rejected": -866.7603149414062, "eval_loss": 0.8639366030693054, "eval_rewards/accuracies": 0.6269841194152832, "eval_rewards/chosen": -4.4405999183654785, "eval_rewards/margins": 0.8651944398880005, "eval_rewards/margins_max": 5.159237861633301, "eval_rewards/margins_min": -2.6377525329589844, "eval_rewards/margins_std": 2.5658202171325684, "eval_rewards/rejected": -5.305793762207031, "eval_runtime": 419.7425, "eval_samples_per_second": 9.53, "eval_steps_per_second": 0.15, "step": 900 }, { "epoch": 0.67, "grad_norm": 7.254262825774854, "learning_rate": 1.4864839294928924e-07, "logits/chosen": 0.2960719168186188, "logits/rejected": 2.519636392593384, "logps/chosen": -667.0858154296875, "logps/rejected": -2054.51953125, "loss": 0.1092, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.099366664886475, "rewards/margins": 13.694231033325195, "rewards/margins_max": 20.22653579711914, "rewards/margins_min": 7.161923408508301, "rewards/margins_std": 9.238077163696289, "rewards/rejected": -17.793596267700195, "step": 910 }, { "epoch": 0.68, "grad_norm": 19.00706315113973, "learning_rate": 1.428119833001315e-07, "logits/chosen": 0.011763498187065125, "logits/rejected": 2.5436980724334717, "logps/chosen": -683.8145751953125, "logps/rejected": -1476.839111328125, "loss": 0.0479, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7376608848571777, "rewards/margins": 9.0157470703125, "rewards/margins_max": 13.42829418182373, "rewards/margins_min": 4.603199481964111, "rewards/margins_std": 6.2402849197387695, "rewards/rejected": -12.75340747833252, "step": 920 }, { "epoch": 0.68, "grad_norm": 2.449628285920275, "learning_rate": 1.370462980369401e-07, "logits/chosen": 0.11705155670642853, "logits/rejected": 1.5357266664505005, "logps/chosen": -766.4974365234375, "logps/rejected": -1289.979248046875, "loss": 0.0636, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.693875312805176, "rewards/margins": 5.769114017486572, "rewards/margins_max": 7.741427421569824, "rewards/margins_min": 3.796800136566162, "rewards/margins_std": 2.7892730236053467, "rewards/rejected": -10.462987899780273, "step": 930 }, { "epoch": 0.69, "grad_norm": 11.213992357762015, "learning_rate": 1.3135514145204606e-07, "logits/chosen": 0.21615874767303467, "logits/rejected": 2.0779476165771484, "logps/chosen": -605.4188232421875, "logps/rejected": -1608.945556640625, "loss": 0.0971, "rewards/accuracies": 1.0, "rewards/chosen": -3.8029799461364746, "rewards/margins": 10.208128929138184, "rewards/margins_max": 17.389694213867188, "rewards/margins_min": 3.0265650749206543, "rewards/margins_std": 10.15626335144043, "rewards/rejected": -14.011110305786133, "step": 940 }, { "epoch": 0.7, "grad_norm": 4.371361045173521, "learning_rate": 1.257422686625539e-07, "logits/chosen": 0.16180220246315002, "logits/rejected": 2.055144786834717, "logps/chosen": -682.2508544921875, "logps/rejected": -1589.9964599609375, "loss": 0.0906, "rewards/accuracies": 1.0, "rewards/chosen": -3.9322009086608887, "rewards/margins": 9.378369331359863, "rewards/margins_max": 14.781808853149414, "rewards/margins_min": 3.9749279022216797, "rewards/margins_std": 7.641619682312012, "rewards/rejected": -13.310567855834961, "step": 950 }, { "epoch": 0.71, "grad_norm": 3.4893980542106102, "learning_rate": 1.2021138313265444e-07, "logits/chosen": 0.11532745510339737, "logits/rejected": 1.866121530532837, "logps/chosen": -634.554931640625, "logps/rejected": -1674.252197265625, "loss": 0.1202, "rewards/accuracies": 1.0, "rewards/chosen": -3.853950023651123, "rewards/margins": 10.867055892944336, "rewards/margins_max": 18.760677337646484, "rewards/margins_min": 2.97343373298645, "rewards/margins_std": 11.163267135620117, "rewards/rejected": -14.7210054397583, "step": 960 }, { "epoch": 0.71, "grad_norm": 14.447489915734623, "learning_rate": 1.1476613423001974e-07, "logits/chosen": 0.17886893451213837, "logits/rejected": 1.89533269405365, "logps/chosen": -677.0606079101562, "logps/rejected": -1261.5345458984375, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": -4.194746971130371, "rewards/margins": 5.769103050231934, "rewards/margins_max": 7.97817325592041, "rewards/margins_min": 3.560032606124878, "rewards/margins_std": 3.1240971088409424, "rewards/rejected": -9.963850021362305, "step": 970 }, { "epoch": 0.72, "grad_norm": 3.9798658979228856, "learning_rate": 1.0941011481789042e-07, "logits/chosen": 0.034214410930871964, "logits/rejected": 2.867272138595581, "logps/chosen": -702.2564697265625, "logps/rejected": -1857.8795166015625, "loss": 0.0935, "rewards/accuracies": 1.0, "rewards/chosen": -4.056910037994385, "rewards/margins": 11.857443809509277, "rewards/margins_max": 18.30853843688965, "rewards/margins_min": 5.406346797943115, "rewards/margins_std": 9.123228073120117, "rewards/rejected": -15.91435432434082, "step": 980 }, { "epoch": 0.73, "grad_norm": 27.849338662173917, "learning_rate": 1.041468588844476e-07, "logits/chosen": 0.4994427263736725, "logits/rejected": 2.539013385772705, "logps/chosen": -599.5453491210938, "logps/rejected": -1590.7774658203125, "loss": 0.0699, "rewards/accuracies": 1.0, "rewards/chosen": -3.911860704421997, "rewards/margins": 10.174264907836914, "rewards/margins_max": 15.016085624694824, "rewards/margins_min": 5.332446098327637, "rewards/margins_std": 6.8473663330078125, "rewards/rejected": -14.086126327514648, "step": 990 }, { "epoch": 0.74, "grad_norm": 4.2150247037639375, "learning_rate": 9.897983921102954e-08, "logits/chosen": -0.2390742003917694, "logits/rejected": 2.2101035118103027, "logps/chosen": -670.8737182617188, "logps/rejected": -1509.370361328125, "loss": 0.0708, "rewards/accuracies": 1.0, "rewards/chosen": -3.90120005607605, "rewards/margins": 8.870689392089844, "rewards/margins_max": 11.843083381652832, "rewards/margins_min": 5.89829683303833, "rewards/margins_std": 4.203598976135254, "rewards/rejected": -12.771888732910156, "step": 1000 }, { "epoch": 0.74, "eval_logits/chosen": 0.21985697746276855, "eval_logits/rejected": 0.4062546491622925, "eval_logps/chosen": -791.2946166992188, "eval_logps/rejected": -865.1302490234375, "eval_loss": 0.8618067502975464, "eval_rewards/accuracies": 0.6230158805847168, "eval_rewards/chosen": -4.454591751098633, "eval_rewards/margins": 0.8349014520645142, "eval_rewards/margins_max": 5.060412406921387, "eval_rewards/margins_min": -2.622389078140259, "eval_rewards/margins_std": 2.52128529548645, "eval_rewards/rejected": -5.289493083953857, "eval_runtime": 419.5466, "eval_samples_per_second": 9.534, "eval_steps_per_second": 0.15, "step": 1000 }, { "epoch": 0.74, "grad_norm": 12.140211164365056, "learning_rate": 9.391246508073433e-08, "logits/chosen": 0.13034725189208984, "logits/rejected": 2.0794267654418945, "logps/chosen": -724.4019775390625, "logps/rejected": -1571.5511474609375, "loss": 0.0777, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.304561614990234, "rewards/margins": 8.91108512878418, "rewards/margins_max": 12.622480392456055, "rewards/margins_min": 5.199688911437988, "rewards/margins_std": 5.248705863952637, "rewards/rejected": -13.215646743774414, "step": 1010 }, { "epoch": 0.75, "grad_norm": 72.48315962813399, "learning_rate": 8.894808002892037e-08, "logits/chosen": 0.19714145362377167, "logits/rejected": 2.8781895637512207, "logps/chosen": -689.0614624023438, "logps/rejected": -1635.4539794921875, "loss": 0.0641, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.8019371032714844, "rewards/margins": 10.48505687713623, "rewards/margins_max": 15.840913772583008, "rewards/margins_min": 5.129199981689453, "rewards/margins_std": 7.5743255615234375, "rewards/rejected": -14.286993026733398, "step": 1020 }, { "epoch": 0.76, "grad_norm": 20.88616124929115, "learning_rate": 8.408995963708756e-08, "logits/chosen": -0.0833059698343277, "logits/rejected": 2.3186755180358887, "logps/chosen": -681.8640747070312, "logps/rejected": -1602.0863037109375, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -3.856149673461914, "rewards/margins": 9.128388404846191, "rewards/margins_max": 12.01569652557373, "rewards/margins_min": 6.241078853607178, "rewards/margins_std": 4.0832719802856445, "rewards/rejected": -12.984537124633789, "step": 1030 }, { "epoch": 0.77, "grad_norm": 9.834472583209813, "learning_rate": 7.934130937159508e-08, "logits/chosen": 0.17558620870113373, "logits/rejected": 2.297236442565918, "logps/chosen": -637.3060302734375, "logps/rejected": -1326.9390869140625, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": -3.932767152786255, "rewards/margins": 7.218419075012207, "rewards/margins_max": 9.979570388793945, "rewards/margins_min": 4.457267761230469, "rewards/margins_std": 3.904857635498047, "rewards/rejected": -11.151185989379883, "step": 1040 }, { "epoch": 0.77, "grad_norm": 5.026095263611361, "learning_rate": 7.470526246864364e-08, "logits/chosen": 0.39160841703414917, "logits/rejected": 2.559542179107666, "logps/chosen": -693.7269287109375, "logps/rejected": -1849.744873046875, "loss": 0.0552, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.237768650054932, "rewards/margins": 12.45046329498291, "rewards/margins_max": 19.93360710144043, "rewards/margins_min": 4.967319488525391, "rewards/margins_std": 10.582763671875, "rewards/rejected": -16.688232421875, "step": 1050 }, { "epoch": 0.78, "grad_norm": 0.6591285293800628, "learning_rate": 7.018487786691512e-08, "logits/chosen": 0.43399763107299805, "logits/rejected": 2.060253381729126, "logps/chosen": -745.4591674804688, "logps/rejected": -1831.240478515625, "loss": 0.0678, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.024683475494385, "rewards/margins": 10.875140190124512, "rewards/margins_max": 17.002622604370117, "rewards/margins_min": 4.747661113739014, "rewards/margins_std": 8.66556453704834, "rewards/rejected": -15.899823188781738, "step": 1060 }, { "epoch": 0.79, "grad_norm": 4.119017563303306, "learning_rate": 6.578313818923559e-08, "logits/chosen": -0.07052882760763168, "logits/rejected": 1.8699405193328857, "logps/chosen": -909.0846557617188, "logps/rejected": -1548.6923828125, "loss": 0.0634, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.350946426391602, "rewards/margins": 7.820859432220459, "rewards/margins_max": 11.624895095825195, "rewards/margins_min": 4.016822338104248, "rewards/margins_std": 5.379720211029053, "rewards/rejected": -13.171804428100586, "step": 1070 }, { "epoch": 0.79, "grad_norm": 16.860241482971446, "learning_rate": 6.15029477745925e-08, "logits/chosen": 0.48959070444107056, "logits/rejected": 2.1462438106536865, "logps/chosen": -734.9025268554688, "logps/rejected": -1803.1939697265625, "loss": 0.0725, "rewards/accuracies": 1.0, "rewards/chosen": -5.121659755706787, "rewards/margins": 10.34645938873291, "rewards/margins_max": 14.924234390258789, "rewards/margins_min": 5.768682479858398, "rewards/margins_std": 6.473954200744629, "rewards/rejected": -15.468118667602539, "step": 1080 }, { "epoch": 0.8, "grad_norm": 18.379765722708388, "learning_rate": 5.734713076180486e-08, "logits/chosen": 0.46901997923851013, "logits/rejected": 3.454606294631958, "logps/chosen": -741.1581420898438, "logps/rejected": -1905.183349609375, "loss": 0.0713, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.039034843444824, "rewards/margins": 12.11182975769043, "rewards/margins_max": 19.285795211791992, "rewards/margins_min": 4.937865257263184, "rewards/margins_std": 10.145517349243164, "rewards/rejected": -17.15086555480957, "step": 1090 }, { "epoch": 0.81, "grad_norm": 4.317359176747138, "learning_rate": 5.3318429226110875e-08, "logits/chosen": 0.19755136966705322, "logits/rejected": 2.050144672393799, "logps/chosen": -604.0868530273438, "logps/rejected": -1733.5550537109375, "loss": 0.141, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.046292304992676, "rewards/margins": 11.252501487731934, "rewards/margins_max": 16.934438705444336, "rewards/margins_min": 5.57056188583374, "rewards/margins_std": 8.03547477722168, "rewards/rejected": -15.298794746398926, "step": 1100 }, { "epoch": 0.81, "eval_logits/chosen": 0.3016913831233978, "eval_logits/rejected": 0.5082818865776062, "eval_logps/chosen": -832.3104858398438, "eval_logps/rejected": -915.954833984375, "eval_loss": 0.9049465656280518, "eval_rewards/accuracies": 0.6190476417541504, "eval_rewards/chosen": -4.864750385284424, "eval_rewards/margins": 0.9329892992973328, "eval_rewards/margins_max": 5.632690906524658, "eval_rewards/margins_min": -2.8439128398895264, "eval_rewards/margins_std": 2.7856106758117676, "eval_rewards/rejected": -5.7977399826049805, "eval_runtime": 414.0109, "eval_samples_per_second": 9.662, "eval_steps_per_second": 0.152, "step": 1100 }, { "epoch": 0.82, "grad_norm": 13.4435984156611, "learning_rate": 4.9419501369902026e-08, "logits/chosen": 0.08746049553155899, "logits/rejected": 2.6451172828674316, "logps/chosen": -771.4244384765625, "logps/rejected": -2024.484619140625, "loss": 0.1408, "rewards/accuracies": 1.0, "rewards/chosen": -4.441189289093018, "rewards/margins": 13.507779121398926, "rewards/margins_max": 19.457698822021484, "rewards/margins_min": 7.557857513427734, "rewards/margins_std": 8.414458274841309, "rewards/rejected": -17.9489688873291, "step": 1110 }, { "epoch": 0.82, "grad_norm": 1.4128692999239585, "learning_rate": 4.5652919768798896e-08, "logits/chosen": 0.4677937924861908, "logits/rejected": 2.3705403804779053, "logps/chosen": -793.5311279296875, "logps/rejected": -1775.6380615234375, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": -5.224045276641846, "rewards/margins": 10.134596824645996, "rewards/margins_max": 15.679702758789062, "rewards/margins_min": 4.589491844177246, "rewards/margins_std": 7.8419623374938965, "rewards/rejected": -15.358640670776367, "step": 1120 }, { "epoch": 0.83, "grad_norm": 16.039453526164788, "learning_rate": 4.2021169674223536e-08, "logits/chosen": 0.2930324077606201, "logits/rejected": 2.399545431137085, "logps/chosen": -655.0755615234375, "logps/rejected": -1648.030029296875, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": -3.887598752975464, "rewards/margins": 10.443506240844727, "rewards/margins_max": 14.695414543151855, "rewards/margins_min": 6.191596984863281, "rewards/margins_std": 6.013107776641846, "rewards/rejected": -14.331106185913086, "step": 1130 }, { "epoch": 0.84, "grad_norm": 26.479285274862587, "learning_rate": 3.852664737359046e-08, "logits/chosen": 0.3496669828891754, "logits/rejected": 1.97479248046875, "logps/chosen": -852.40380859375, "logps/rejected": -1573.5230712890625, "loss": 0.0768, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.772242546081543, "rewards/margins": 7.9194207191467285, "rewards/margins_max": 12.818387985229492, "rewards/margins_min": 3.0204524993896484, "rewards/margins_std": 6.928186893463135, "rewards/rejected": -13.691662788391113, "step": 1140 }, { "epoch": 0.85, "grad_norm": 0.7263792166932626, "learning_rate": 3.5171658609197824e-08, "logits/chosen": 0.1613047868013382, "logits/rejected": 2.029664993286133, "logps/chosen": -742.6275024414062, "logps/rejected": -1609.7635498046875, "loss": 0.1096, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6217732429504395, "rewards/margins": 8.67860221862793, "rewards/margins_max": 13.895421981811523, "rewards/margins_min": 3.4617819786071777, "rewards/margins_std": 7.377697944641113, "rewards/rejected": -13.300374984741211, "step": 1150 }, { "epoch": 0.85, "grad_norm": 3.13150099340305, "learning_rate": 3.195841705686139e-08, "logits/chosen": 0.460742712020874, "logits/rejected": 2.694736957550049, "logps/chosen": -821.4349365234375, "logps/rejected": -1898.295654296875, "loss": 0.0821, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.314339637756348, "rewards/margins": 11.54991626739502, "rewards/margins_max": 18.18251609802246, "rewards/margins_min": 4.917316436767578, "rewards/margins_std": 9.379911422729492, "rewards/rejected": -16.864253997802734, "step": 1160 }, { "epoch": 0.86, "grad_norm": 16.312675595535207, "learning_rate": 2.8889042865294837e-08, "logits/chosen": 0.13087859749794006, "logits/rejected": 2.484839916229248, "logps/chosen": -702.7008056640625, "logps/rejected": -1441.55078125, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": -4.257961750030518, "rewards/margins": 7.842066287994385, "rewards/margins_max": 10.642562866210938, "rewards/margins_min": 5.041568756103516, "rewards/margins_std": 3.960501194000244, "rewards/rejected": -12.100028991699219, "step": 1170 }, { "epoch": 0.87, "grad_norm": 9.055687386628646, "learning_rate": 2.5965561257202036e-08, "logits/chosen": 0.1169591173529625, "logits/rejected": 2.362281560897827, "logps/chosen": -763.2276611328125, "logps/rejected": -1660.2099609375, "loss": 0.0572, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.720892906188965, "rewards/margins": 9.803942680358887, "rewards/margins_max": 15.876733779907227, "rewards/margins_min": 3.731149196624756, "rewards/margins_std": 8.588226318359375, "rewards/rejected": -14.524835586547852, "step": 1180 }, { "epoch": 0.88, "grad_norm": 22.841895074273324, "learning_rate": 2.318990119300218e-08, "logits/chosen": 0.10627205669879913, "logits/rejected": 1.2642805576324463, "logps/chosen": -798.917724609375, "logps/rejected": -2165.9775390625, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": -5.4411396980285645, "rewards/margins": 13.126449584960938, "rewards/margins_max": 20.357501983642578, "rewards/margins_min": 5.8953962326049805, "rewards/margins_std": 10.226253509521484, "rewards/rejected": -18.567590713500977, "step": 1190 }, { "epoch": 0.88, "grad_norm": 17.973583296727792, "learning_rate": 2.0563894098070216e-08, "logits/chosen": 0.15934190154075623, "logits/rejected": 2.1497673988342285, "logps/chosen": -712.0560302734375, "logps/rejected": -1505.4547119140625, "loss": 0.0775, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.412214756011963, "rewards/margins": 8.303590774536133, "rewards/margins_max": 12.088435173034668, "rewards/margins_min": 4.5187482833862305, "rewards/margins_std": 5.352576732635498, "rewards/rejected": -12.715806007385254, "step": 1200 }, { "epoch": 0.88, "eval_logits/chosen": 0.30742567777633667, "eval_logits/rejected": 0.5172090530395508, "eval_logps/chosen": -836.2312622070312, "eval_logps/rejected": -922.0319213867188, "eval_loss": 0.9049317836761475, "eval_rewards/accuracies": 0.6210317611694336, "eval_rewards/chosen": -4.903958320617676, "eval_rewards/margins": 0.9545530080795288, "eval_rewards/margins_max": 5.713037014007568, "eval_rewards/margins_min": -2.831618309020996, "eval_rewards/margins_std": 2.813220262527466, "eval_rewards/rejected": -5.858510971069336, "eval_runtime": 422.5993, "eval_samples_per_second": 9.465, "eval_steps_per_second": 0.149, "step": 1200 }, { "epoch": 0.89, "grad_norm": 2.7225416780438763, "learning_rate": 1.8089272654333353e-08, "logits/chosen": 0.28706851601600647, "logits/rejected": 1.9062206745147705, "logps/chosen": -866.8541259765625, "logps/rejected": -1701.005615234375, "loss": 0.0693, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.099188804626465, "rewards/margins": 9.302727699279785, "rewards/margins_max": 13.995088577270508, "rewards/margins_min": 4.61036491394043, "rewards/margins_std": 6.6360015869140625, "rewards/rejected": -14.40191650390625, "step": 1210 }, { "epoch": 0.9, "grad_norm": 7.492427847668467, "learning_rate": 1.5767669657019005e-08, "logits/chosen": 0.21484322845935822, "logits/rejected": 2.9490137100219727, "logps/chosen": -665.4578857421875, "logps/rejected": -1718.431640625, "loss": 0.0694, "rewards/accuracies": 1.0, "rewards/chosen": -4.03716516494751, "rewards/margins": 11.106006622314453, "rewards/margins_max": 14.720375061035156, "rewards/margins_min": 7.491639137268066, "rewards/margins_std": 5.11148738861084, "rewards/rejected": -15.143171310424805, "step": 1220 }, { "epoch": 0.91, "grad_norm": 14.252457056430137, "learning_rate": 1.3600616937310267e-08, "logits/chosen": 0.3399500250816345, "logits/rejected": 2.5051798820495605, "logps/chosen": -776.6029663085938, "logps/rejected": -1890.706298828125, "loss": 0.0533, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.925044059753418, "rewards/margins": 11.241573333740234, "rewards/margins_max": 16.437541961669922, "rewards/margins_min": 6.045604228973389, "rewards/margins_std": 7.348209381103516, "rewards/rejected": -16.166616439819336, "step": 1230 }, { "epoch": 0.91, "grad_norm": 3.254929425883996, "learning_rate": 1.1589544351619047e-08, "logits/chosen": 0.8039329648017883, "logits/rejected": 3.354154109954834, "logps/chosen": -724.2069091796875, "logps/rejected": -2016.739501953125, "loss": 0.0701, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.160962104797363, "rewards/margins": 12.87867259979248, "rewards/margins_max": 20.457225799560547, "rewards/margins_min": 5.300119400024414, "rewards/margins_std": 10.717691421508789, "rewards/rejected": -18.03963279724121, "step": 1240 }, { "epoch": 0.92, "grad_norm": 9.743725835120221, "learning_rate": 9.735778838143749e-09, "logits/chosen": 0.17006321251392365, "logits/rejected": 3.252281904220581, "logps/chosen": -771.3798828125, "logps/rejected": -2618.41943359375, "loss": 0.0852, "rewards/accuracies": 1.0, "rewards/chosen": -4.744952201843262, "rewards/margins": 18.770408630371094, "rewards/margins_max": 27.798681259155273, "rewards/margins_min": 9.742134094238281, "rewards/margins_std": 12.767908096313477, "rewards/rejected": -23.51535987854004, "step": 1250 }, { "epoch": 0.93, "grad_norm": 9.000437498002796, "learning_rate": 8.040543541333655e-09, "logits/chosen": 0.2970607578754425, "logits/rejected": 3.4422898292541504, "logps/chosen": -716.0152587890625, "logps/rejected": -1930.673095703125, "loss": 0.0554, "rewards/accuracies": 1.0, "rewards/chosen": -4.7243733406066895, "rewards/margins": 12.035941123962402, "rewards/margins_max": 17.656423568725586, "rewards/margins_min": 6.415456295013428, "rewards/margins_std": 7.9485650062561035, "rewards/rejected": -16.760313034057617, "step": 1260 }, { "epoch": 0.93, "grad_norm": 16.466144409333417, "learning_rate": 6.504957004838746e-09, "logits/chosen": -0.05619863420724869, "logits/rejected": 1.9224863052368164, "logps/chosen": -841.8850708007812, "logps/rejected": -1936.7113037109375, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": -4.742644309997559, "rewards/margins": 11.516191482543945, "rewards/margins_max": 15.864044189453125, "rewards/margins_min": 7.168337821960449, "rewards/margins_std": 6.148792266845703, "rewards/rejected": -16.258834838867188, "step": 1270 }, { "epoch": 0.94, "grad_norm": 2.654070322101592, "learning_rate": 5.130032433476483e-09, "logits/chosen": 0.3038169741630554, "logits/rejected": 2.8313422203063965, "logps/chosen": -728.2089233398438, "logps/rejected": -1743.801513671875, "loss": 0.069, "rewards/accuracies": 1.0, "rewards/chosen": -4.6006269454956055, "rewards/margins": 11.062047004699707, "rewards/margins_max": 16.25905418395996, "rewards/margins_min": 5.8650407791137695, "rewards/margins_std": 7.349676609039307, "rewards/rejected": -15.662673950195312, "step": 1280 }, { "epoch": 0.95, "grad_norm": 4.94775999947406, "learning_rate": 3.916677024702858e-09, "logits/chosen": 0.1287023425102234, "logits/rejected": 2.0298779010772705, "logps/chosen": -667.8201904296875, "logps/rejected": -1396.031005859375, "loss": 0.0567, "rewards/accuracies": 1.0, "rewards/chosen": -4.1791486740112305, "rewards/margins": 7.460590362548828, "rewards/margins_max": 10.305280685424805, "rewards/margins_min": 4.615899562835693, "rewards/margins_std": 4.023000240325928, "rewards/rejected": -11.639739990234375, "step": 1290 }, { "epoch": 0.96, "grad_norm": 62.41949761633163, "learning_rate": 2.865691370028761e-09, "logits/chosen": 0.3163800835609436, "logits/rejected": 2.587982416152954, "logps/chosen": -711.7886962890625, "logps/rejected": -1568.990234375, "loss": 0.0464, "rewards/accuracies": 1.0, "rewards/chosen": -4.575112342834473, "rewards/margins": 9.028793334960938, "rewards/margins_max": 13.794398307800293, "rewards/margins_min": 4.263186454772949, "rewards/margins_std": 6.739584922790527, "rewards/rejected": -13.603904724121094, "step": 1300 }, { "epoch": 0.96, "eval_logits/chosen": 0.2898733615875244, "eval_logits/rejected": 0.49572646617889404, "eval_logps/chosen": -832.4283447265625, "eval_logps/rejected": -916.66357421875, "eval_loss": 0.9016607403755188, "eval_rewards/accuracies": 0.6230158805847168, "eval_rewards/chosen": -4.8659281730651855, "eval_rewards/margins": 0.9388992786407471, "eval_rewards/margins_max": 5.651630401611328, "eval_rewards/margins_min": -2.8163363933563232, "eval_rewards/margins_std": 2.78544020652771, "eval_rewards/rejected": -5.8048272132873535, "eval_runtime": 417.6061, "eval_samples_per_second": 9.578, "eval_steps_per_second": 0.151, "step": 1300 }, { "epoch": 0.96, "grad_norm": 1.1392819952008515, "learning_rate": 1.977768926776896e-09, "logits/chosen": 0.29715052247047424, "logits/rejected": 2.052577018737793, "logps/chosen": -763.3764038085938, "logps/rejected": -1313.391845703125, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": -4.99444055557251, "rewards/margins": 5.8611674308776855, "rewards/margins_max": 7.457464694976807, "rewards/margins_min": 4.264869213104248, "rewards/margins_std": 2.2575066089630127, "rewards/rejected": -10.855607986450195, "step": 1310 }, { "epoch": 0.97, "grad_norm": 2.4664515698911433, "learning_rate": 1.2534955605274233e-09, "logits/chosen": 0.4122096002101898, "logits/rejected": 3.4274659156799316, "logps/chosen": -771.2412719726562, "logps/rejected": -1840.775390625, "loss": 0.0786, "rewards/accuracies": 1.0, "rewards/chosen": -5.00337028503418, "rewards/margins": 10.93709659576416, "rewards/margins_max": 16.795883178710938, "rewards/margins_min": 5.078312873840332, "rewards/margins_std": 8.285572052001953, "rewards/rejected": -15.940465927124023, "step": 1320 }, { "epoch": 0.98, "grad_norm": 5.80559652045183, "learning_rate": 6.933491585542351e-10, "logits/chosen": 0.37182289361953735, "logits/rejected": 3.1600046157836914, "logps/chosen": -680.2762451171875, "logps/rejected": -1665.175537109375, "loss": 0.1695, "rewards/accuracies": 1.0, "rewards/chosen": -4.444371223449707, "rewards/margins": 10.164952278137207, "rewards/margins_max": 14.587198257446289, "rewards/margins_min": 5.742705821990967, "rewards/margins_std": 6.254001140594482, "rewards/rejected": -14.60932445526123, "step": 1330 }, { "epoch": 0.99, "grad_norm": 8.610945717452623, "learning_rate": 2.9769931450737694e-10, "logits/chosen": 0.1386619508266449, "logits/rejected": 2.0141379833221436, "logps/chosen": -799.5162353515625, "logps/rejected": -1834.666015625, "loss": 0.0767, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.182085990905762, "rewards/margins": 10.368020057678223, "rewards/margins_max": 15.239529609680176, "rewards/margins_min": 5.496510028839111, "rewards/margins_std": 6.889355659484863, "rewards/rejected": -15.550105094909668, "step": 1340 }, { "epoch": 0.99, "grad_norm": 33.03425737131523, "learning_rate": 6.680708454906425e-11, "logits/chosen": 0.2811238169670105, "logits/rejected": 1.941292405128479, "logps/chosen": -751.3411254882812, "logps/rejected": -1699.15625, "loss": 0.0687, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.085223197937012, "rewards/margins": 9.496126174926758, "rewards/margins_max": 13.1726713180542, "rewards/margins_min": 5.819581508636475, "rewards/margins_std": 5.199418544769287, "rewards/rejected": -14.58134937286377, "step": 1350 }, { "epoch": 1.0, "step": 1359, "total_flos": 0.0, "train_loss": 0.21785820982226384, "train_runtime": 12082.0351, "train_samples_per_second": 1.8, "train_steps_per_second": 0.112 } ], "logging_steps": 10, "max_steps": 1359, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }