{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.985279685966634, "eval_steps": 100, "global_step": 1270, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.734375, "learning_rate": 3.9370078740157486e-08, "logits/chosen": -2.356706142425537, "logits/rejected": -2.3367161750793457, "logps/chosen": -287.937255859375, "logps/rejected": -266.50421142578125, "loss": 0.0001, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.04, "grad_norm": 106.5, "learning_rate": 3.937007874015748e-07, "logits/chosen": -2.3996241092681885, "logits/rejected": -2.353182554244995, "logps/chosen": -273.30889892578125, "logps/rejected": -240.43850708007812, "loss": 0.118, "rewards/accuracies": 0.46388885378837585, "rewards/chosen": 0.00026795046869665384, "rewards/margins": 0.00038285815389826894, "rewards/rejected": -0.00011490769247757271, "step": 10 }, { "epoch": 0.08, "grad_norm": 129.0, "learning_rate": 7.874015748031496e-07, "logits/chosen": -2.38779878616333, "logits/rejected": -2.3378489017486572, "logps/chosen": -266.8452453613281, "logps/rejected": -258.0478515625, "loss": 0.1346, "rewards/accuracies": 0.5525000095367432, "rewards/chosen": 0.0013033099239692092, "rewards/margins": 0.0008336402243003249, "rewards/rejected": 0.0004696696996688843, "step": 20 }, { "epoch": 0.12, "grad_norm": 108.0, "learning_rate": 1.1811023622047246e-06, "logits/chosen": -2.43515682220459, "logits/rejected": -2.382305145263672, "logps/chosen": -299.69366455078125, "logps/rejected": -271.0403747558594, "loss": 0.1585, "rewards/accuracies": 0.4999999403953552, "rewards/chosen": 0.002506708027794957, "rewards/margins": 0.0005399176734499633, "rewards/rejected": 0.0019667900633066893, "step": 30 }, { "epoch": 0.16, "grad_norm": 95.0, "learning_rate": 1.5748031496062992e-06, "logits/chosen": -2.3607640266418457, "logits/rejected": -2.313088893890381, "logps/chosen": -288.7026672363281, "logps/rejected": -253.4452667236328, "loss": 0.1404, "rewards/accuracies": 0.5774999856948853, "rewards/chosen": 0.002137306611984968, "rewards/margins": 0.0012014020467177033, "rewards/rejected": 0.0009359045652672648, "step": 40 }, { "epoch": 0.2, "grad_norm": 109.0, "learning_rate": 1.968503937007874e-06, "logits/chosen": -2.407731533050537, "logits/rejected": -2.3856372833251953, "logps/chosen": -267.39801025390625, "logps/rejected": -264.8341064453125, "loss": 0.1726, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 0.0022850066889077425, "rewards/margins": 0.0014212832320481539, "rewards/rejected": 0.0008637232822366059, "step": 50 }, { "epoch": 0.24, "grad_norm": 135.0, "learning_rate": 2.362204724409449e-06, "logits/chosen": -2.3825833797454834, "logits/rejected": -2.3412365913391113, "logps/chosen": -272.9526672363281, "logps/rejected": -239.8430633544922, "loss": 0.1314, "rewards/accuracies": 0.6150000095367432, "rewards/chosen": 0.002347386907786131, "rewards/margins": 0.001956633059307933, "rewards/rejected": 0.00039075379027053714, "step": 60 }, { "epoch": 0.27, "grad_norm": 89.5, "learning_rate": 2.755905511811024e-06, "logits/chosen": -2.382422924041748, "logits/rejected": -2.3432180881500244, "logps/chosen": -273.59881591796875, "logps/rejected": -257.91790771484375, "loss": 0.1777, "rewards/accuracies": 0.6025000214576721, "rewards/chosen": 0.002635209821164608, "rewards/margins": 0.0014770927373319864, "rewards/rejected": 0.0011581169674172997, "step": 70 }, { "epoch": 0.31, "grad_norm": 125.0, "learning_rate": 3.1496062992125985e-06, "logits/chosen": -2.394813060760498, "logits/rejected": -2.3611693382263184, "logps/chosen": -279.48638916015625, "logps/rejected": -260.6809997558594, "loss": 0.1944, "rewards/accuracies": 0.6100000739097595, "rewards/chosen": 0.0013221392873674631, "rewards/margins": 0.0018236342584714293, "rewards/rejected": -0.0005014949128963053, "step": 80 }, { "epoch": 0.35, "grad_norm": 146.0, "learning_rate": 3.5433070866141735e-06, "logits/chosen": -2.3927130699157715, "logits/rejected": -2.3439621925354004, "logps/chosen": -264.44195556640625, "logps/rejected": -236.5146942138672, "loss": 0.2032, "rewards/accuracies": 0.5475000143051147, "rewards/chosen": 0.001684651942923665, "rewards/margins": 0.0011215232079848647, "rewards/rejected": 0.0005631285603158176, "step": 90 }, { "epoch": 0.39, "grad_norm": 145.0, "learning_rate": 3.937007874015748e-06, "logits/chosen": -2.4192750453948975, "logits/rejected": -2.3637657165527344, "logps/chosen": -280.88238525390625, "logps/rejected": -253.8561553955078, "loss": 0.2536, "rewards/accuracies": 0.5900000333786011, "rewards/chosen": 0.003346907440572977, "rewards/margins": 0.002142687328159809, "rewards/rejected": 0.001204220112413168, "step": 100 }, { "epoch": 0.39, "eval_logits/chosen": -2.412179470062256, "eval_logits/rejected": -2.3735485076904297, "eval_logps/chosen": -267.4339904785156, "eval_logps/rejected": -242.3385467529297, "eval_loss": 0.27917271852493286, "eval_rewards/accuracies": 0.6041666865348816, "eval_rewards/chosen": 0.002446565078571439, "eval_rewards/margins": 0.0018827051389962435, "eval_rewards/rejected": 0.0005638597067445517, "eval_runtime": 124.6972, "eval_samples_per_second": 16.039, "eval_steps_per_second": 0.337, "step": 100 }, { "epoch": 0.43, "grad_norm": 196.0, "learning_rate": 4.330708661417324e-06, "logits/chosen": -2.383650064468384, "logits/rejected": -2.345914125442505, "logps/chosen": -298.79864501953125, "logps/rejected": -272.7557373046875, "loss": 0.2924, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.004891454242169857, "rewards/margins": 0.002777325687929988, "rewards/rejected": 0.0021141283214092255, "step": 110 }, { "epoch": 0.47, "grad_norm": 138.0, "learning_rate": 4.724409448818898e-06, "logits/chosen": -2.3592543601989746, "logits/rejected": -2.29665470123291, "logps/chosen": -285.70538330078125, "logps/rejected": -253.1526336669922, "loss": 0.3272, "rewards/accuracies": 0.6274999380111694, "rewards/chosen": 0.005152740981429815, "rewards/margins": 0.0025280567351728678, "rewards/rejected": 0.002624684479087591, "step": 120 }, { "epoch": 0.51, "grad_norm": 227.0, "learning_rate": 4.999915012051437e-06, "logits/chosen": -2.3975679874420166, "logits/rejected": -2.3674397468566895, "logps/chosen": -261.0414733886719, "logps/rejected": -249.3390350341797, "loss": 0.3621, "rewards/accuracies": 0.5925000309944153, "rewards/chosen": 0.004668754059821367, "rewards/margins": 0.0027659868355840445, "rewards/rejected": 0.0019027665257453918, "step": 130 }, { "epoch": 0.55, "grad_norm": 226.0, "learning_rate": 4.9984042759305375e-06, "logits/chosen": -2.4002552032470703, "logits/rejected": -2.34761381149292, "logps/chosen": -273.1309509277344, "logps/rejected": -247.2257843017578, "loss": 0.387, "rewards/accuracies": 0.5449999570846558, "rewards/chosen": 0.0018628574907779694, "rewards/margins": 0.0016136768972501159, "rewards/rejected": 0.00024918062263168395, "step": 140 }, { "epoch": 0.59, "grad_norm": 196.0, "learning_rate": 4.9950062323425556e-06, "logits/chosen": -2.3986167907714844, "logits/rejected": -2.358851194381714, "logps/chosen": -271.7406921386719, "logps/rejected": -249.7233428955078, "loss": 0.3955, "rewards/accuracies": 0.6075000166893005, "rewards/chosen": 0.00026552577037364244, "rewards/margins": 0.0028056656010448933, "rewards/rejected": -0.002540139714255929, "step": 150 }, { "epoch": 0.63, "grad_norm": 190.0, "learning_rate": 4.989723448187132e-06, "logits/chosen": -2.3923754692077637, "logits/rejected": -2.3684566020965576, "logps/chosen": -285.33184814453125, "logps/rejected": -282.5887756347656, "loss": 0.4099, "rewards/accuracies": 0.5924999713897705, "rewards/chosen": 0.0046120877377688885, "rewards/margins": 0.0034965365193784237, "rewards/rejected": 0.0011155509855598211, "step": 160 }, { "epoch": 0.67, "grad_norm": 222.0, "learning_rate": 4.982559914106645e-06, "logits/chosen": -2.416792392730713, "logits/rejected": -2.3636841773986816, "logps/chosen": -297.1885681152344, "logps/rejected": -281.99481201171875, "loss": 0.5527, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 0.004418404307216406, "rewards/margins": 0.0038651120848953724, "rewards/rejected": 0.0005532926879823208, "step": 170 }, { "epoch": 0.71, "grad_norm": 189.0, "learning_rate": 4.973521041471662e-06, "logits/chosen": -2.443068504333496, "logits/rejected": -2.4043824672698975, "logps/chosen": -284.85546875, "logps/rejected": -246.3272705078125, "loss": 0.467, "rewards/accuracies": 0.5925000309944153, "rewards/chosen": 0.003813292132690549, "rewards/margins": 0.0034597956109791994, "rewards/rejected": 0.00035349628888070583, "step": 180 }, { "epoch": 0.75, "grad_norm": 186.0, "learning_rate": 4.962613658293158e-06, "logits/chosen": -2.364473581314087, "logits/rejected": -2.336862802505493, "logps/chosen": -260.23297119140625, "logps/rejected": -244.9732666015625, "loss": 0.4326, "rewards/accuracies": 0.6074999570846558, "rewards/chosen": 0.0008181848679669201, "rewards/margins": 0.003228846937417984, "rewards/rejected": -0.0024106616619974375, "step": 190 }, { "epoch": 0.79, "grad_norm": 196.0, "learning_rate": 4.949846004064605e-06, "logits/chosen": -2.414769411087036, "logits/rejected": -2.394395351409912, "logps/chosen": -281.43670654296875, "logps/rejected": -265.92974853515625, "loss": 0.5352, "rewards/accuracies": 0.5974999666213989, "rewards/chosen": 0.00015548830560874194, "rewards/margins": 0.003786542685702443, "rewards/rejected": -0.003631054190918803, "step": 200 }, { "epoch": 0.79, "eval_logits/chosen": -2.4014458656311035, "eval_logits/rejected": -2.362912893295288, "eval_logps/chosen": -267.5639953613281, "eval_logps/rejected": -242.58323669433594, "eval_loss": 0.5010271072387695, "eval_rewards/accuracies": 0.574404776096344, "eval_rewards/chosen": 0.0011464261915534735, "eval_rewards/margins": 0.003029454033821821, "eval_rewards/rejected": -0.001883027609437704, "eval_runtime": 123.1297, "eval_samples_per_second": 16.243, "eval_steps_per_second": 0.341, "step": 200 }, { "epoch": 0.82, "grad_norm": 254.0, "learning_rate": 4.935227723537811e-06, "logits/chosen": -2.406309127807617, "logits/rejected": -2.360525131225586, "logps/chosen": -296.66070556640625, "logps/rejected": -266.5440979003906, "loss": 0.5154, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 0.0017778485780581832, "rewards/margins": 0.004943528212606907, "rewards/rejected": -0.0031656797509640455, "step": 210 }, { "epoch": 0.86, "grad_norm": 288.0, "learning_rate": 4.918769859437233e-06, "logits/chosen": -2.3826467990875244, "logits/rejected": -2.3224892616271973, "logps/chosen": -274.14825439453125, "logps/rejected": -252.51400756835938, "loss": 0.5475, "rewards/accuracies": 0.6050000190734863, "rewards/chosen": 0.004731293302029371, "rewards/margins": 0.0043090214021503925, "rewards/rejected": 0.0004222726565785706, "step": 220 }, { "epoch": 0.9, "grad_norm": 182.0, "learning_rate": 4.900484844118235e-06, "logits/chosen": -2.3914456367492676, "logits/rejected": -2.3371217250823975, "logps/chosen": -280.33282470703125, "logps/rejected": -240.6001739501953, "loss": 0.561, "rewards/accuracies": 0.6175000071525574, "rewards/chosen": 0.002294857520610094, "rewards/margins": 0.0033697611652314663, "rewards/rejected": -0.0010749038774520159, "step": 230 }, { "epoch": 0.94, "grad_norm": 196.0, "learning_rate": 4.880386490175634e-06, "logits/chosen": -2.359574794769287, "logits/rejected": -2.327416181564331, "logps/chosen": -290.989013671875, "logps/rejected": -268.49395751953125, "loss": 0.5661, "rewards/accuracies": 0.5975000262260437, "rewards/chosen": -0.00016075666644610465, "rewards/margins": 0.003780897008255124, "rewards/rejected": -0.003941653296351433, "step": 240 }, { "epoch": 0.98, "grad_norm": 232.0, "learning_rate": 4.8584899800095865e-06, "logits/chosen": -2.4217326641082764, "logits/rejected": -2.3503971099853516, "logps/chosen": -288.73858642578125, "logps/rejected": -258.19317626953125, "loss": 0.5762, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 0.00230691721662879, "rewards/margins": 0.004124250262975693, "rewards/rejected": -0.0018173331627622247, "step": 250 }, { "epoch": 1.02, "grad_norm": 162.0, "learning_rate": 4.834811854356729e-06, "logits/chosen": -2.406905174255371, "logits/rejected": -2.3617682456970215, "logps/chosen": -266.1622619628906, "logps/rejected": -245.754150390625, "loss": 0.4313, "rewards/accuracies": 0.6574999690055847, "rewards/chosen": 0.006862832698971033, "rewards/margins": 0.010333456099033356, "rewards/rejected": -0.0034706243313848972, "step": 260 }, { "epoch": 1.06, "grad_norm": 119.0, "learning_rate": 4.809369999795219e-06, "logits/chosen": -2.367124080657959, "logits/rejected": -2.352210760116577, "logps/chosen": -271.3493957519531, "logps/rejected": -274.4390563964844, "loss": 0.2246, "rewards/accuracies": 0.7775000333786011, "rewards/chosen": 0.011581487953662872, "rewards/margins": 0.02090141549706459, "rewards/rejected": -0.009319926612079144, "step": 270 }, { "epoch": 1.1, "grad_norm": 152.0, "learning_rate": 4.7821836352331235e-06, "logits/chosen": -2.4189422130584717, "logits/rejected": -2.362922430038452, "logps/chosen": -276.532470703125, "logps/rejected": -254.9628143310547, "loss": 0.3068, "rewards/accuracies": 0.7899999618530273, "rewards/chosen": 0.010409007780253887, "rewards/margins": 0.019158251583576202, "rewards/rejected": -0.008749241940677166, "step": 280 }, { "epoch": 1.14, "grad_norm": 122.5, "learning_rate": 4.7532732973903525e-06, "logits/chosen": -2.392087936401367, "logits/rejected": -2.3331363201141357, "logps/chosen": -281.2344970703125, "logps/rejected": -266.2068176269531, "loss": 0.2544, "rewards/accuracies": 0.7375000715255737, "rewards/chosen": 0.01263010036200285, "rewards/margins": 0.01806234009563923, "rewards/rejected": -0.0054322415962815285, "step": 290 }, { "epoch": 1.18, "grad_norm": 177.0, "learning_rate": 4.722660825285122e-06, "logits/chosen": -2.413367509841919, "logits/rejected": -2.3771374225616455, "logps/chosen": -278.57904052734375, "logps/rejected": -270.5982360839844, "loss": 0.3676, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.014171945862472057, "rewards/margins": 0.020834611728787422, "rewards/rejected": -0.0066626654006540775, "step": 300 }, { "epoch": 1.18, "eval_logits/chosen": -2.416761636734009, "eval_logits/rejected": -2.3788468837738037, "eval_logps/chosen": -266.88555908203125, "eval_logps/rejected": -242.12106323242188, "eval_loss": 0.8293091654777527, "eval_rewards/accuracies": 0.5982142686843872, "eval_rewards/chosen": 0.00793052464723587, "eval_rewards/margins": 0.005191552918404341, "eval_rewards/rejected": 0.0027389726601541042, "eval_runtime": 123.1487, "eval_samples_per_second": 16.241, "eval_steps_per_second": 0.341, "step": 300 }, { "epoch": 1.22, "grad_norm": 140.0, "learning_rate": 4.690369343736637e-06, "logits/chosen": -2.4030745029449463, "logits/rejected": -2.368807077407837, "logps/chosen": -278.1623229980469, "logps/rejected": -264.98541259765625, "loss": 0.3085, "rewards/accuracies": 0.7649999856948853, "rewards/chosen": 0.01418610941618681, "rewards/margins": 0.01857883669435978, "rewards/rejected": -0.004392724949866533, "step": 310 }, { "epoch": 1.26, "grad_norm": 185.0, "learning_rate": 4.656423245896494e-06, "logits/chosen": -2.4111828804016113, "logits/rejected": -2.3645176887512207, "logps/chosen": -270.58477783203125, "logps/rejected": -256.71099853515625, "loss": 0.2759, "rewards/accuracies": 0.7575000524520874, "rewards/chosen": 0.008453365415334702, "rewards/margins": 0.01515539176762104, "rewards/rejected": -0.0067020258866250515, "step": 320 }, { "epoch": 1.3, "grad_norm": 163.0, "learning_rate": 4.6208481748219645e-06, "logits/chosen": -2.392019271850586, "logits/rejected": -2.367377519607544, "logps/chosen": -271.10015869140625, "logps/rejected": -256.1971435546875, "loss": 0.2666, "rewards/accuracies": 0.7675000429153442, "rewards/chosen": 0.007386817596852779, "rewards/margins": 0.019648974761366844, "rewards/rejected": -0.01226215623319149, "step": 330 }, { "epoch": 1.33, "grad_norm": 142.0, "learning_rate": 4.583671004105096e-06, "logits/chosen": -2.3817129135131836, "logits/rejected": -2.342694044113159, "logps/chosen": -275.9081115722656, "logps/rejected": -251.59848022460938, "loss": 0.2849, "rewards/accuracies": 0.75, "rewards/chosen": 0.0086748655885458, "rewards/margins": 0.01924619823694229, "rewards/rejected": -0.010571330785751343, "step": 340 }, { "epoch": 1.37, "grad_norm": 113.0, "learning_rate": 4.544919817572262e-06, "logits/chosen": -2.3859992027282715, "logits/rejected": -2.325930118560791, "logps/chosen": -272.119873046875, "logps/rejected": -247.908935546875, "loss": 0.2871, "rewards/accuracies": 0.7649999856948853, "rewards/chosen": 0.012000922113656998, "rewards/margins": 0.019705070182681084, "rewards/rejected": -0.007704148534685373, "step": 350 }, { "epoch": 1.41, "grad_norm": 154.0, "learning_rate": 4.504623888069497e-06, "logits/chosen": -2.397146701812744, "logits/rejected": -2.3492813110351562, "logps/chosen": -271.32171630859375, "logps/rejected": -247.8098907470703, "loss": 0.3405, "rewards/accuracies": 0.7600000500679016, "rewards/chosen": 0.013489668257534504, "rewards/margins": 0.017752837389707565, "rewards/rejected": -0.0042631677351891994, "step": 360 }, { "epoch": 1.45, "grad_norm": 139.0, "learning_rate": 4.462813655349637e-06, "logits/chosen": -2.372323751449585, "logits/rejected": -2.3170628547668457, "logps/chosen": -268.582275390625, "logps/rejected": -245.34097290039062, "loss": 0.3015, "rewards/accuracies": 0.7275000214576721, "rewards/chosen": 0.013011058792471886, "rewards/margins": 0.018547596409916878, "rewards/rejected": -0.005536535754799843, "step": 370 }, { "epoch": 1.49, "grad_norm": 195.0, "learning_rate": 4.419520703077975e-06, "logits/chosen": -2.3980116844177246, "logits/rejected": -2.3060975074768066, "logps/chosen": -284.5008544921875, "logps/rejected": -232.28515625, "loss": 0.2953, "rewards/accuracies": 0.7300000190734863, "rewards/chosen": 0.009602969512343407, "rewards/margins": 0.01879434660077095, "rewards/rejected": -0.00919137429445982, "step": 380 }, { "epoch": 1.53, "grad_norm": 184.0, "learning_rate": 4.3747777349737905e-06, "logits/chosen": -2.394030809402466, "logits/rejected": -2.3477187156677246, "logps/chosen": -299.9769592285156, "logps/rejected": -268.77301025390625, "loss": 0.318, "rewards/accuracies": 0.7575000524520874, "rewards/chosen": 0.011008193716406822, "rewards/margins": 0.020197119563817978, "rewards/rejected": -0.009188923053443432, "step": 390 }, { "epoch": 1.57, "grad_norm": 153.0, "learning_rate": 4.328618550105802e-06, "logits/chosen": -2.3696258068084717, "logits/rejected": -2.341409206390381, "logps/chosen": -271.8193664550781, "logps/rejected": -264.6459045410156, "loss": 0.366, "rewards/accuracies": 0.7400000691413879, "rewards/chosen": 0.012812617234885693, "rewards/margins": 0.018871381878852844, "rewards/rejected": -0.006058765109628439, "step": 400 }, { "epoch": 1.57, "eval_logits/chosen": -2.4146392345428467, "eval_logits/rejected": -2.377370595932007, "eval_logps/chosen": -267.025634765625, "eval_logps/rejected": -242.3221435546875, "eval_loss": 0.8238700032234192, "eval_rewards/accuracies": 0.6398809552192688, "eval_rewards/chosen": 0.006530104670673609, "eval_rewards/margins": 0.005802116356790066, "eval_rewards/rejected": 0.0007279877318069339, "eval_runtime": 123.0966, "eval_samples_per_second": 16.247, "eval_steps_per_second": 0.341, "step": 400 }, { "epoch": 1.61, "grad_norm": 91.0, "learning_rate": 4.2810780173601675e-06, "logits/chosen": -2.3998053073883057, "logits/rejected": -2.341407060623169, "logps/chosen": -285.62054443359375, "logps/rejected": -247.3552703857422, "loss": 0.3234, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.008955768309533596, "rewards/margins": 0.020532304421067238, "rewards/rejected": -0.011576534248888493, "step": 410 }, { "epoch": 1.65, "grad_norm": 174.0, "learning_rate": 4.232192049100351e-06, "logits/chosen": -2.411689043045044, "logits/rejected": -2.384003162384033, "logps/chosen": -242.8949737548828, "logps/rejected": -236.754150390625, "loss": 0.292, "rewards/accuracies": 0.747499942779541, "rewards/chosen": 0.005577466916292906, "rewards/margins": 0.016019560396671295, "rewards/rejected": -0.010442093946039677, "step": 420 }, { "epoch": 1.69, "grad_norm": 110.0, "learning_rate": 4.1819975740387406e-06, "logits/chosen": -2.4044318199157715, "logits/rejected": -2.3681979179382324, "logps/chosen": -276.5671691894531, "logps/rejected": -259.7832336425781, "loss": 0.3016, "rewards/accuracies": 0.7475000619888306, "rewards/chosen": 0.011015561409294605, "rewards/margins": 0.02647540345788002, "rewards/rejected": -0.01545984111726284, "step": 430 }, { "epoch": 1.73, "grad_norm": 158.0, "learning_rate": 4.1305325093405045e-06, "logits/chosen": -2.4186065196990967, "logits/rejected": -2.406249523162842, "logps/chosen": -295.4107971191406, "logps/rejected": -280.9339294433594, "loss": 0.4026, "rewards/accuracies": 0.7675000429153442, "rewards/chosen": 0.010993788950145245, "rewards/margins": 0.020805999636650085, "rewards/rejected": -0.00981221068650484, "step": 440 }, { "epoch": 1.77, "grad_norm": 175.0, "learning_rate": 4.077835731980775e-06, "logits/chosen": -2.416654348373413, "logits/rejected": -2.368619203567505, "logps/chosen": -279.9720764160156, "logps/rejected": -245.94345092773438, "loss": 0.3414, "rewards/accuracies": 0.7575000524520874, "rewards/chosen": 0.006698101758956909, "rewards/margins": 0.016826082020998, "rewards/rejected": -0.010127981193363667, "step": 450 }, { "epoch": 1.81, "grad_norm": 175.0, "learning_rate": 4.02394704937677e-06, "logits/chosen": -2.3919434547424316, "logits/rejected": -2.3505940437316895, "logps/chosen": -280.6643981933594, "logps/rejected": -252.3192901611328, "loss": 0.3603, "rewards/accuracies": 0.7575000524520874, "rewards/chosen": 0.00701780105009675, "rewards/margins": 0.01795104146003723, "rewards/rejected": -0.010933240875601768, "step": 460 }, { "epoch": 1.84, "grad_norm": 176.0, "learning_rate": 3.96890716931708e-06, "logits/chosen": -2.381404399871826, "logits/rejected": -2.369319438934326, "logps/chosen": -251.976806640625, "logps/rejected": -239.9644775390625, "loss": 0.3975, "rewards/accuracies": 0.7425000071525574, "rewards/chosen": 0.00673043355345726, "rewards/margins": 0.01468564011156559, "rewards/rejected": -0.00795520469546318, "step": 470 }, { "epoch": 1.88, "grad_norm": 129.0, "learning_rate": 3.912757669210783e-06, "logits/chosen": -2.4172403812408447, "logits/rejected": -2.354468584060669, "logps/chosen": -258.93780517578125, "logps/rejected": -234.1327362060547, "loss": 0.354, "rewards/accuracies": 0.7425000071525574, "rewards/chosen": 0.015040628612041473, "rewards/margins": 0.020659491419792175, "rewards/rejected": -0.005618864204734564, "step": 480 }, { "epoch": 1.92, "grad_norm": 127.5, "learning_rate": 3.855540964679658e-06, "logits/chosen": -2.3677306175231934, "logits/rejected": -2.323366641998291, "logps/chosen": -239.04776000976562, "logps/rejected": -228.19580078125, "loss": 0.2687, "rewards/accuracies": 0.7850000262260437, "rewards/chosen": 0.0076850662007927895, "rewards/margins": 0.01919684186577797, "rewards/rejected": -0.01151177566498518, "step": 490 }, { "epoch": 1.96, "grad_norm": 100.5, "learning_rate": 3.797300277517212e-06, "logits/chosen": -2.412917137145996, "logits/rejected": -2.38498592376709, "logps/chosen": -285.268310546875, "logps/rejected": -264.0386962890625, "loss": 0.292, "rewards/accuracies": 0.7850000262260437, "rewards/chosen": 0.010482062585651875, "rewards/margins": 0.020365219563245773, "rewards/rejected": -0.009883158840239048, "step": 500 }, { "epoch": 1.96, "eval_logits/chosen": -2.4342868328094482, "eval_logits/rejected": -2.3977575302124023, "eval_logps/chosen": -267.17938232421875, "eval_logps/rejected": -242.4461669921875, "eval_loss": 0.8145859837532043, "eval_rewards/accuracies": 0.6398809552192688, "eval_rewards/chosen": 0.004992412868887186, "eval_rewards/margins": 0.005504657980054617, "eval_rewards/rejected": -0.0005122453439980745, "eval_runtime": 123.1687, "eval_samples_per_second": 16.238, "eval_steps_per_second": 0.341, "step": 500 }, { "epoch": 2.0, "grad_norm": 150.0, "learning_rate": 3.7380796030387035e-06, "logits/chosen": -2.4117255210876465, "logits/rejected": -2.3580873012542725, "logps/chosen": -288.262451171875, "logps/rejected": -250.64486694335938, "loss": 0.2919, "rewards/accuracies": 0.7850000262260437, "rewards/chosen": 0.012420935556292534, "rewards/margins": 0.022166112437844276, "rewards/rejected": -0.009745175018906593, "step": 510 }, { "epoch": 2.04, "grad_norm": 72.0, "learning_rate": 3.6779236768468647e-06, "logits/chosen": -2.416080951690674, "logits/rejected": -2.3825385570526123, "logps/chosen": -266.83453369140625, "logps/rejected": -257.94635009765625, "loss": 0.0944, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.017391610890626907, "rewards/margins": 0.030041953548789024, "rewards/rejected": -0.012650340795516968, "step": 520 }, { "epoch": 2.08, "grad_norm": 40.75, "learning_rate": 3.6168779410383905e-06, "logits/chosen": -2.4022631645202637, "logits/rejected": -2.366995334625244, "logps/chosen": -274.7615661621094, "logps/rejected": -253.46835327148438, "loss": 0.0944, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.014151136390864849, "rewards/margins": 0.029283767566084862, "rewards/rejected": -0.015132628381252289, "step": 530 }, { "epoch": 2.12, "grad_norm": 109.0, "learning_rate": 3.554988509876747e-06, "logits/chosen": -2.411635637283325, "logits/rejected": -2.379657030105591, "logps/chosen": -264.20758056640625, "logps/rejected": -248.7351837158203, "loss": 0.1176, "rewards/accuracies": 0.8475000262260437, "rewards/chosen": 0.01760762929916382, "rewards/margins": 0.03199198096990585, "rewards/rejected": -0.014384354464709759, "step": 540 }, { "epoch": 2.16, "grad_norm": 162.0, "learning_rate": 3.4923021349572183e-06, "logits/chosen": -2.4204351902008057, "logits/rejected": -2.342064619064331, "logps/chosen": -293.338623046875, "logps/rejected": -249.83984375, "loss": 0.1199, "rewards/accuracies": 0.8449999690055847, "rewards/chosen": 0.016306212171912193, "rewards/margins": 0.03437874838709831, "rewards/rejected": -0.01807253621518612, "step": 550 }, { "epoch": 2.2, "grad_norm": 156.0, "learning_rate": 3.428866169890511e-06, "logits/chosen": -2.4187042713165283, "logits/rejected": -2.3788833618164062, "logps/chosen": -280.169921875, "logps/rejected": -266.49139404296875, "loss": 0.1396, "rewards/accuracies": 0.8550000190734863, "rewards/chosen": 0.020975306630134583, "rewards/margins": 0.0341680608689785, "rewards/rejected": -0.013192756101489067, "step": 560 }, { "epoch": 2.24, "grad_norm": 52.5, "learning_rate": 3.3647285345315933e-06, "logits/chosen": -2.426948308944702, "logits/rejected": -2.3513236045837402, "logps/chosen": -301.64703369140625, "logps/rejected": -252.04910278320312, "loss": 0.1179, "rewards/accuracies": 0.8324999809265137, "rewards/chosen": 0.02232900820672512, "rewards/margins": 0.03726055473089218, "rewards/rejected": -0.01493154652416706, "step": 570 }, { "epoch": 2.28, "grad_norm": 63.0, "learning_rate": 3.299937678780786e-06, "logits/chosen": -2.3919901847839355, "logits/rejected": -2.376873016357422, "logps/chosen": -270.5113830566406, "logps/rejected": -262.92120361328125, "loss": 0.1103, "rewards/accuracies": 0.8525000810623169, "rewards/chosen": 0.01391147542744875, "rewards/margins": 0.03057609498500824, "rewards/rejected": -0.016664620488882065, "step": 580 }, { "epoch": 2.32, "grad_norm": 38.25, "learning_rate": 3.234542545984464e-06, "logits/chosen": -2.3860366344451904, "logits/rejected": -2.3532588481903076, "logps/chosen": -279.0345764160156, "logps/rejected": -268.3638000488281, "loss": 0.113, "rewards/accuracies": 0.8125, "rewards/chosen": 0.013167209923267365, "rewards/margins": 0.03231758996844292, "rewards/rejected": -0.0191503819078207, "step": 590 }, { "epoch": 2.36, "grad_norm": 95.5, "learning_rate": 3.1685925359629928e-06, "logits/chosen": -2.382845401763916, "logits/rejected": -2.345613479614258, "logps/chosen": -270.888427734375, "logps/rejected": -262.33251953125, "loss": 0.1355, "rewards/accuracies": 0.8575000762939453, "rewards/chosen": 0.01834903098642826, "rewards/margins": 0.034947365522384644, "rewards/rejected": -0.016598336398601532, "step": 600 }, { "epoch": 2.36, "eval_logits/chosen": -2.4177558422088623, "eval_logits/rejected": -2.3796002864837646, "eval_logps/chosen": -267.20611572265625, "eval_logps/rejected": -242.52117919921875, "eval_loss": 0.9650812745094299, "eval_rewards/accuracies": 0.6160714030265808, "eval_rewards/chosen": 0.004725400358438492, "eval_rewards/margins": 0.00598777923732996, "eval_rewards/rejected": -0.0012623785296455026, "eval_runtime": 123.1103, "eval_samples_per_second": 16.246, "eval_steps_per_second": 0.341, "step": 600 }, { "epoch": 2.39, "grad_norm": 79.5, "learning_rate": 3.102137467693858e-06, "logits/chosen": -2.3922505378723145, "logits/rejected": -2.3382246494293213, "logps/chosen": -273.4150390625, "logps/rejected": -258.9840393066406, "loss": 0.252, "rewards/accuracies": 0.8725000619888306, "rewards/chosen": 0.01791740581393242, "rewards/margins": 0.032240770757198334, "rewards/rejected": -0.014323368668556213, "step": 610 }, { "epoch": 2.43, "grad_norm": 120.0, "learning_rate": 3.0352275416781465e-06, "logits/chosen": -2.416335344314575, "logits/rejected": -2.379333019256592, "logps/chosen": -273.5201110839844, "logps/rejected": -258.6203918457031, "loss": 0.1567, "rewards/accuracies": 0.8400000333786011, "rewards/chosen": 0.02479313686490059, "rewards/margins": 0.0343189537525177, "rewards/rejected": -0.00952581875026226, "step": 620 }, { "epoch": 2.47, "grad_norm": 106.0, "learning_rate": 2.96791330201883e-06, "logits/chosen": -2.421025514602661, "logits/rejected": -2.3913843631744385, "logps/chosen": -266.0569763183594, "logps/rejected": -255.9671173095703, "loss": 0.1255, "rewards/accuracies": 0.8274999856948853, "rewards/chosen": 0.021568376570940018, "rewards/margins": 0.03504693880677223, "rewards/rejected": -0.013478565029799938, "step": 630 }, { "epoch": 2.51, "grad_norm": 94.5, "learning_rate": 2.9002455982394946e-06, "logits/chosen": -2.3834731578826904, "logits/rejected": -2.3404629230499268, "logps/chosen": -279.171630859375, "logps/rejected": -251.27841186523438, "loss": 0.1115, "rewards/accuracies": 0.8674999475479126, "rewards/chosen": 0.020785773172974586, "rewards/margins": 0.03259057179093361, "rewards/rejected": -0.011804800480604172, "step": 640 }, { "epoch": 2.55, "grad_norm": 61.0, "learning_rate": 2.832275546872339e-06, "logits/chosen": -2.401367664337158, "logits/rejected": -2.3691532611846924, "logps/chosen": -261.18377685546875, "logps/rejected": -267.6328125, "loss": 0.0953, "rewards/accuracies": 0.8850000500679016, "rewards/chosen": 0.018077706918120384, "rewards/margins": 0.03268102556467056, "rewards/rejected": -0.014603319577872753, "step": 650 }, { "epoch": 2.59, "grad_norm": 130.0, "learning_rate": 2.7640544928444927e-06, "logits/chosen": -2.418788194656372, "logits/rejected": -2.3343942165374756, "logps/chosen": -288.7831726074219, "logps/rejected": -252.0687713623047, "loss": 0.1191, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 0.020471712574362755, "rewards/margins": 0.03600749000906944, "rewards/rejected": -0.015535781159996986, "step": 660 }, { "epoch": 2.63, "grad_norm": 86.0, "learning_rate": 2.695633970691786e-06, "logits/chosen": -2.3701846599578857, "logits/rejected": -2.351933240890503, "logps/chosen": -257.39752197265625, "logps/rejected": -252.69287109375, "loss": 0.0849, "rewards/accuracies": 0.8550000190734863, "rewards/chosen": 0.019604947417974472, "rewards/margins": 0.031910307705402374, "rewards/rejected": -0.012305359356105328, "step": 670 }, { "epoch": 2.67, "grad_norm": 101.5, "learning_rate": 2.6270656656293007e-06, "logits/chosen": -2.394273281097412, "logits/rejected": -2.348475694656372, "logps/chosen": -264.9925537109375, "logps/rejected": -248.74368286132812, "loss": 0.0955, "rewards/accuracies": 0.8574999570846558, "rewards/chosen": 0.020610950887203217, "rewards/margins": 0.03306712210178375, "rewards/rejected": -0.012456170283257961, "step": 680 }, { "epoch": 2.71, "grad_norm": 63.25, "learning_rate": 2.558401374508089e-06, "logits/chosen": -2.402439594268799, "logits/rejected": -2.3409905433654785, "logps/chosen": -276.2132873535156, "logps/rejected": -251.0520782470703, "loss": 0.0996, "rewards/accuracies": 0.8650000691413879, "rewards/chosen": 0.02115057222545147, "rewards/margins": 0.030901487916707993, "rewards/rejected": -0.009750919416546822, "step": 690 }, { "epoch": 2.75, "grad_norm": 57.75, "learning_rate": 2.4896929666875665e-06, "logits/chosen": -2.4019179344177246, "logits/rejected": -2.3663971424102783, "logps/chosen": -274.6024475097656, "logps/rejected": -264.2455139160156, "loss": 0.1327, "rewards/accuracies": 0.8574999570846558, "rewards/chosen": 0.016096513718366623, "rewards/margins": 0.03081604465842247, "rewards/rejected": -0.014719529077410698, "step": 700 }, { "epoch": 2.75, "eval_logits/chosen": -2.4065868854522705, "eval_logits/rejected": -2.369014263153076, "eval_logps/chosen": -267.2229919433594, "eval_logps/rejected": -242.58834838867188, "eval_loss": 0.9984952211380005, "eval_rewards/accuracies": 0.6339285969734192, "eval_rewards/chosen": 0.004556288011372089, "eval_rewards/margins": 0.006490407045930624, "eval_rewards/rejected": -0.001934119500219822, "eval_runtime": 122.9942, "eval_samples_per_second": 16.261, "eval_steps_per_second": 0.341, "step": 700 }, { "epoch": 2.79, "grad_norm": 104.0, "learning_rate": 2.420992344853132e-06, "logits/chosen": -2.4031834602355957, "logits/rejected": -2.380056142807007, "logps/chosen": -276.49700927734375, "logps/rejected": -262.06341552734375, "loss": 0.1394, "rewards/accuracies": 0.8399999737739563, "rewards/chosen": 0.019898083060979843, "rewards/margins": 0.033728718757629395, "rewards/rejected": -0.013830636627972126, "step": 710 }, { "epoch": 2.83, "grad_norm": 111.0, "learning_rate": 2.3523514058086093e-06, "logits/chosen": -2.410182237625122, "logits/rejected": -2.326798915863037, "logps/chosen": -288.55609130859375, "logps/rejected": -250.171630859375, "loss": 0.1191, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 0.01868962123990059, "rewards/margins": 0.02993825078010559, "rewards/rejected": -0.011248626746237278, "step": 720 }, { "epoch": 2.87, "grad_norm": 63.5, "learning_rate": 2.2838220012731365e-06, "logits/chosen": -2.3818917274475098, "logits/rejected": -2.3685965538024902, "logps/chosen": -270.9535217285156, "logps/rejected": -267.3423767089844, "loss": 0.1279, "rewards/accuracies": 0.8825000524520874, "rewards/chosen": 0.023929597809910774, "rewards/margins": 0.041905276477336884, "rewards/rejected": -0.01797567494213581, "step": 730 }, { "epoch": 2.9, "grad_norm": 136.0, "learning_rate": 2.2154558987121054e-06, "logits/chosen": -2.3983840942382812, "logits/rejected": -2.3515264987945557, "logps/chosen": -274.8974609375, "logps/rejected": -253.7775115966797, "loss": 0.1044, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.025471847504377365, "rewards/margins": 0.03200577199459076, "rewards/rejected": -0.006533923093229532, "step": 740 }, { "epoch": 2.94, "grad_norm": 208.0, "learning_rate": 2.147304742231758e-06, "logits/chosen": -2.3778913021087646, "logits/rejected": -2.3485236167907715, "logps/chosen": -254.48635864257812, "logps/rejected": -267.0797119140625, "loss": 0.1637, "rewards/accuracies": 0.8550001382827759, "rewards/chosen": 0.018020575866103172, "rewards/margins": 0.030409198254346848, "rewards/rejected": -0.012388622388243675, "step": 750 }, { "epoch": 2.98, "grad_norm": 62.5, "learning_rate": 2.0794200135669586e-06, "logits/chosen": -2.399770498275757, "logits/rejected": -2.364065647125244, "logps/chosen": -277.4521484375, "logps/rejected": -267.6507568359375, "loss": 0.1466, "rewards/accuracies": 0.8574999570846558, "rewards/chosen": 0.021130980923771858, "rewards/margins": 0.036186493933200836, "rewards/rejected": -0.015055513009428978, "step": 760 }, { "epoch": 3.02, "grad_norm": 43.25, "learning_rate": 2.011852993191625e-06, "logits/chosen": -2.3711681365966797, "logits/rejected": -2.331266403198242, "logps/chosen": -284.1138916015625, "logps/rejected": -270.9895324707031, "loss": 0.0563, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": 0.02099769189953804, "rewards/margins": 0.03560823202133179, "rewards/rejected": -0.014610541984438896, "step": 770 }, { "epoch": 3.06, "grad_norm": 91.5, "learning_rate": 1.944654721581196e-06, "logits/chosen": -2.3276844024658203, "logits/rejected": -2.3009562492370605, "logps/chosen": -260.98626708984375, "logps/rejected": -247.24685668945312, "loss": 0.0353, "rewards/accuracies": 0.9300001263618469, "rewards/chosen": 0.024031776934862137, "rewards/margins": 0.04021488502621651, "rewards/rejected": -0.01618310809135437, "step": 780 }, { "epoch": 3.1, "grad_norm": 29.5, "learning_rate": 1.877875960656394e-06, "logits/chosen": -2.3537003993988037, "logits/rejected": -2.3234972953796387, "logps/chosen": -275.46453857421875, "logps/rejected": -260.1392517089844, "loss": 0.0298, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.021290091797709465, "rewards/margins": 0.032937195152044296, "rewards/rejected": -0.011647104285657406, "step": 790 }, { "epoch": 3.14, "grad_norm": 42.5, "learning_rate": 1.8115671554374067e-06, "logits/chosen": -2.399651050567627, "logits/rejected": -2.3859167098999023, "logps/chosen": -268.92803955078125, "logps/rejected": -275.9405822753906, "loss": 0.0389, "rewards/accuracies": 0.9375, "rewards/chosen": 0.025255614891648293, "rewards/margins": 0.04190623760223389, "rewards/rejected": -0.016650624573230743, "step": 800 }, { "epoch": 3.14, "eval_logits/chosen": -2.3946948051452637, "eval_logits/rejected": -2.3562896251678467, "eval_logps/chosen": -266.8748474121094, "eval_logps/rejected": -242.36962890625, "eval_loss": 0.8932417035102844, "eval_rewards/accuracies": 0.6517857313156128, "eval_rewards/chosen": 0.008037895895540714, "eval_rewards/margins": 0.007784782908856869, "eval_rewards/rejected": 0.00025311243371106684, "eval_runtime": 123.1142, "eval_samples_per_second": 16.245, "eval_steps_per_second": 0.341, "step": 800 }, { "epoch": 3.18, "grad_norm": 19.25, "learning_rate": 1.7457783959374585e-06, "logits/chosen": -2.404486894607544, "logits/rejected": -2.3604061603546143, "logps/chosen": -278.95855712890625, "logps/rejected": -251.1371307373047, "loss": 0.0348, "rewards/accuracies": 0.9275000691413879, "rewards/chosen": 0.025924455374479294, "rewards/margins": 0.037114791572093964, "rewards/rejected": -0.011190338991582394, "step": 810 }, { "epoch": 3.22, "grad_norm": 32.25, "learning_rate": 1.680559379324558e-06, "logits/chosen": -2.390227794647217, "logits/rejected": -2.3385822772979736, "logps/chosen": -292.6279296875, "logps/rejected": -254.8878173828125, "loss": 0.0299, "rewards/accuracies": 0.9099999666213989, "rewards/chosen": 0.02545427717268467, "rewards/margins": 0.03745580464601517, "rewards/rejected": -0.012001526542007923, "step": 820 }, { "epoch": 3.26, "grad_norm": 60.5, "learning_rate": 1.6159593723800013e-06, "logits/chosen": -2.4059481620788574, "logits/rejected": -2.3442025184631348, "logps/chosen": -264.06060791015625, "logps/rejected": -246.68521118164062, "loss": 0.0307, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.0275394506752491, "rewards/margins": 0.036764778196811676, "rewards/rejected": -0.00922533217817545, "step": 830 }, { "epoch": 3.3, "grad_norm": 49.75, "learning_rate": 1.5520271742819883e-06, "logits/chosen": -2.389446973800659, "logits/rejected": -2.3524551391601562, "logps/chosen": -271.63641357421875, "logps/rejected": -255.3404998779297, "loss": 0.0301, "rewards/accuracies": 0.877500057220459, "rewards/chosen": 0.02547125145792961, "rewards/margins": 0.03725877031683922, "rewards/rejected": -0.01178752165287733, "step": 840 }, { "epoch": 3.34, "grad_norm": 24.625, "learning_rate": 1.4888110797424783e-06, "logits/chosen": -2.4469919204711914, "logits/rejected": -2.3791050910949707, "logps/chosen": -315.74066162109375, "logps/rejected": -277.9239501953125, "loss": 0.0354, "rewards/accuracies": 0.9025000333786011, "rewards/chosen": 0.03072303533554077, "rewards/margins": 0.048735830932855606, "rewards/rejected": -0.018012793734669685, "step": 850 }, { "epoch": 3.38, "grad_norm": 54.0, "learning_rate": 1.4263588425251052e-06, "logits/chosen": -2.4028658866882324, "logits/rejected": -2.3509697914123535, "logps/chosen": -289.78485107421875, "logps/rejected": -251.5, "loss": 0.0267, "rewards/accuracies": 0.9375001192092896, "rewards/chosen": 0.028129303827881813, "rewards/margins": 0.04094386473298073, "rewards/rejected": -0.01281456183642149, "step": 860 }, { "epoch": 3.42, "grad_norm": 25.625, "learning_rate": 1.3647176393717509e-06, "logits/chosen": -2.4022791385650635, "logits/rejected": -2.3641726970672607, "logps/chosen": -278.85333251953125, "logps/rejected": -270.0050964355469, "loss": 0.0217, "rewards/accuracies": 0.9199999570846558, "rewards/chosen": 0.025644132867455482, "rewards/margins": 0.03728828951716423, "rewards/rejected": -0.011644158512353897, "step": 870 }, { "epoch": 3.45, "grad_norm": 30.875, "learning_rate": 1.303934034364983e-06, "logits/chosen": -2.3777599334716797, "logits/rejected": -2.3236684799194336, "logps/chosen": -261.53216552734375, "logps/rejected": -239.6273651123047, "loss": 0.0216, "rewards/accuracies": 0.9025000333786011, "rewards/chosen": 0.024678941816091537, "rewards/margins": 0.03661385923624039, "rewards/rejected": -0.011934916488826275, "step": 880 }, { "epoch": 3.49, "grad_norm": 87.0, "learning_rate": 1.2440539437533075e-06, "logits/chosen": -2.352806806564331, "logits/rejected": -2.3354265689849854, "logps/chosen": -269.5535583496094, "logps/rejected": -268.3061218261719, "loss": 0.0273, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": 0.022415757179260254, "rewards/margins": 0.036310791969299316, "rewards/rejected": -0.013895031996071339, "step": 890 }, { "epoch": 3.53, "grad_norm": 33.25, "learning_rate": 1.1851226012658015e-06, "logits/chosen": -2.366988182067871, "logits/rejected": -2.323378562927246, "logps/chosen": -264.4871520996094, "logps/rejected": -251.3701934814453, "loss": 0.029, "rewards/accuracies": 0.9175000190734863, "rewards/chosen": 0.023964881896972656, "rewards/margins": 0.039505355060100555, "rewards/rejected": -0.015540470369160175, "step": 900 }, { "epoch": 3.53, "eval_logits/chosen": -2.411829948425293, "eval_logits/rejected": -2.3752424716949463, "eval_logps/chosen": -266.7797546386719, "eval_logps/rejected": -242.3114013671875, "eval_loss": 0.9391952157020569, "eval_rewards/accuracies": 0.6577380895614624, "eval_rewards/chosen": 0.008988723158836365, "eval_rewards/margins": 0.008153370581567287, "eval_rewards/rejected": 0.0008353526936843991, "eval_runtime": 123.1579, "eval_samples_per_second": 16.239, "eval_steps_per_second": 0.341, "step": 900 }, { "epoch": 3.57, "grad_norm": 16.5, "learning_rate": 1.1271845239423196e-06, "logits/chosen": -2.4022092819213867, "logits/rejected": -2.357339382171631, "logps/chosen": -289.2544250488281, "logps/rejected": -258.82379150390625, "loss": 0.0269, "rewards/accuracies": 0.9175000190734863, "rewards/chosen": 0.022589916363358498, "rewards/margins": 0.03589435666799545, "rewards/rejected": -0.013304440304636955, "step": 910 }, { "epoch": 3.61, "grad_norm": 24.25, "learning_rate": 1.0702834785050893e-06, "logits/chosen": -2.3661084175109863, "logits/rejected": -2.3483455181121826, "logps/chosen": -276.6420593261719, "logps/rejected": -278.82080078125, "loss": 0.033, "rewards/accuracies": 0.9250000715255737, "rewards/chosen": 0.021236615255475044, "rewards/margins": 0.038433950394392014, "rewards/rejected": -0.017197338864207268, "step": 920 }, { "epoch": 3.65, "grad_norm": 40.5, "learning_rate": 1.0144624482971082e-06, "logits/chosen": -2.4388625621795654, "logits/rejected": -2.380392551422119, "logps/chosen": -271.2420349121094, "logps/rejected": -256.2777404785156, "loss": 0.0275, "rewards/accuracies": 0.9350000619888306, "rewards/chosen": 0.022215455770492554, "rewards/margins": 0.03798101097345352, "rewards/rejected": -0.01576555334031582, "step": 930 }, { "epoch": 3.69, "grad_norm": 14.0, "learning_rate": 9.597636008123052e-07, "logits/chosen": -2.4123058319091797, "logits/rejected": -2.3628923892974854, "logps/chosen": -308.3174743652344, "logps/rejected": -278.7733459472656, "loss": 0.0278, "rewards/accuracies": 0.942500114440918, "rewards/chosen": 0.024744439870119095, "rewards/margins": 0.038902923464775085, "rewards/rejected": -0.014158482663333416, "step": 940 }, { "epoch": 3.73, "grad_norm": 20.375, "learning_rate": 9.06228255841991e-07, "logits/chosen": -2.382359504699707, "logits/rejected": -2.3492817878723145, "logps/chosen": -264.2523193359375, "logps/rejected": -256.5533752441406, "loss": 0.0279, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.022538714110851288, "rewards/margins": 0.03617415204644203, "rewards/rejected": -0.013635434210300446, "step": 950 }, { "epoch": 3.77, "grad_norm": 36.25, "learning_rate": 8.538968542616846e-07, "logits/chosen": -2.41325044631958, "logits/rejected": -2.375253677368164, "logps/chosen": -281.48248291015625, "logps/rejected": -265.0972900390625, "loss": 0.0179, "rewards/accuracies": 0.9075000882148743, "rewards/chosen": 0.025317683815956116, "rewards/margins": 0.038605697453022, "rewards/rejected": -0.013288016431033611, "step": 960 }, { "epoch": 3.81, "grad_norm": 29.625, "learning_rate": 8.028089274818624e-07, "logits/chosen": -2.4157140254974365, "logits/rejected": -2.377472162246704, "logps/chosen": -278.72564697265625, "logps/rejected": -257.7362365722656, "loss": 0.034, "rewards/accuracies": 0.9274999499320984, "rewards/chosen": 0.023796474561095238, "rewards/margins": 0.03968465328216553, "rewards/rejected": -0.01588817685842514, "step": 970 }, { "epoch": 3.85, "grad_norm": 24.5, "learning_rate": 7.530030675857252e-07, "logits/chosen": -2.371452569961548, "logits/rejected": -2.33616304397583, "logps/chosen": -278.2137756347656, "logps/rejected": -254.2333984375, "loss": 0.0245, "rewards/accuracies": 0.9350000619888306, "rewards/chosen": 0.028386935591697693, "rewards/margins": 0.044838014990091324, "rewards/rejected": -0.01645107939839363, "step": 980 }, { "epoch": 3.89, "grad_norm": 31.5, "learning_rate": 7.045168981765427e-07, "logits/chosen": -2.4061717987060547, "logits/rejected": -2.367501735687256, "logps/chosen": -277.0457763671875, "logps/rejected": -248.8456268310547, "loss": 0.0248, "rewards/accuracies": 0.9325000047683716, "rewards/chosen": 0.025927498936653137, "rewards/margins": 0.03844950348138809, "rewards/rejected": -0.012522002682089806, "step": 990 }, { "epoch": 3.93, "grad_norm": 12.875, "learning_rate": 6.573870459565907e-07, "logits/chosen": -2.381437301635742, "logits/rejected": -2.3325188159942627, "logps/chosen": -293.3424377441406, "logps/rejected": -262.5635681152344, "loss": 0.0198, "rewards/accuracies": 0.9225000143051147, "rewards/chosen": 0.02644011378288269, "rewards/margins": 0.04168093949556351, "rewards/rejected": -0.015240825712680817, "step": 1000 }, { "epoch": 3.93, "eval_logits/chosen": -2.4145004749298096, "eval_logits/rejected": -2.3780155181884766, "eval_logps/chosen": -266.8046875, "eval_logps/rejected": -242.29171752929688, "eval_loss": 0.820038914680481, "eval_rewards/accuracies": 0.6577380895614624, "eval_rewards/chosen": 0.008739516139030457, "eval_rewards/margins": 0.007707077078521252, "eval_rewards/rejected": 0.0010324395261704922, "eval_runtime": 123.2067, "eval_samples_per_second": 16.233, "eval_steps_per_second": 0.341, "step": 1000 }, { "epoch": 3.96, "grad_norm": 33.75, "learning_rate": 6.116491130591478e-07, "logits/chosen": -2.410226821899414, "logits/rejected": -2.366502285003662, "logps/chosen": -279.39166259765625, "logps/rejected": -251.89364624023438, "loss": 0.0233, "rewards/accuracies": 0.9175001382827759, "rewards/chosen": 0.024083226919174194, "rewards/margins": 0.03586304560303688, "rewards/rejected": -0.011779818683862686, "step": 1010 }, { "epoch": 4.0, "grad_norm": 3.4375, "learning_rate": 5.673376501544641e-07, "logits/chosen": -2.41102933883667, "logits/rejected": -2.36838960647583, "logps/chosen": -274.3736877441406, "logps/rejected": -243.4824981689453, "loss": 0.017, "rewards/accuracies": 0.9175000190734863, "rewards/chosen": 0.023644987493753433, "rewards/margins": 0.0368778295814991, "rewards/rejected": -0.013232842087745667, "step": 1020 }, { "epoch": 4.04, "grad_norm": 20.5, "learning_rate": 5.244861303500026e-07, "logits/chosen": -2.413541793823242, "logits/rejected": -2.362837314605713, "logps/chosen": -272.99359130859375, "logps/rejected": -239.32608032226562, "loss": 0.008, "rewards/accuracies": 0.9325000643730164, "rewards/chosen": 0.021824661642313004, "rewards/margins": 0.0353974774479866, "rewards/rejected": -0.013572819530963898, "step": 1030 }, { "epoch": 4.08, "grad_norm": 3.765625, "learning_rate": 4.831269239046851e-07, "logits/chosen": -2.4089815616607666, "logits/rejected": -2.366555690765381, "logps/chosen": -266.49798583984375, "logps/rejected": -252.52145385742188, "loss": 0.0043, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.025229623541235924, "rewards/margins": 0.03910643607378006, "rewards/rejected": -0.013876812532544136, "step": 1040 }, { "epoch": 4.12, "grad_norm": 18.75, "learning_rate": 4.4329127377623127e-07, "logits/chosen": -2.380474328994751, "logits/rejected": -2.3557381629943848, "logps/chosen": -275.6029968261719, "logps/rejected": -260.39776611328125, "loss": 0.0054, "rewards/accuracies": 0.9550000429153442, "rewards/chosen": 0.025818094611167908, "rewards/margins": 0.038945622742176056, "rewards/rejected": -0.013127523474395275, "step": 1050 }, { "epoch": 4.16, "grad_norm": 14.8125, "learning_rate": 4.050092720200638e-07, "logits/chosen": -2.384019374847412, "logits/rejected": -2.338085889816284, "logps/chosen": -280.4701232910156, "logps/rejected": -250.2019805908203, "loss": 0.0052, "rewards/accuracies": 0.9575001001358032, "rewards/chosen": 0.028378132730722427, "rewards/margins": 0.042529620230197906, "rewards/rejected": -0.014151493087410927, "step": 1060 }, { "epoch": 4.2, "grad_norm": 8.375, "learning_rate": 3.683098370576196e-07, "logits/chosen": -2.406979560852051, "logits/rejected": -2.3714377880096436, "logps/chosen": -287.4407958984375, "logps/rejected": -259.043212890625, "loss": 0.0062, "rewards/accuracies": 0.9449999928474426, "rewards/chosen": 0.024499880149960518, "rewards/margins": 0.03765181452035904, "rewards/rejected": -0.013151939027011395, "step": 1070 }, { "epoch": 4.24, "grad_norm": 11.6875, "learning_rate": 3.3322069183122253e-07, "logits/chosen": -2.415555477142334, "logits/rejected": -2.3626112937927246, "logps/chosen": -270.87249755859375, "logps/rejected": -251.2595977783203, "loss": 0.0049, "rewards/accuracies": 0.9399999380111694, "rewards/chosen": 0.028504956513643265, "rewards/margins": 0.043677303940057755, "rewards/rejected": -0.015172350220382214, "step": 1080 }, { "epoch": 4.28, "grad_norm": 60.75, "learning_rate": 2.997683428620296e-07, "logits/chosen": -2.4119620323181152, "logits/rejected": -2.3438758850097656, "logps/chosen": -288.2101745605469, "logps/rejected": -261.5445251464844, "loss": 0.0101, "rewards/accuracies": 0.9550000429153442, "rewards/chosen": 0.026679161936044693, "rewards/margins": 0.042851291596889496, "rewards/rejected": -0.016172129660844803, "step": 1090 }, { "epoch": 4.32, "grad_norm": 23.25, "learning_rate": 2.6797806022686835e-07, "logits/chosen": -2.3794476985931396, "logits/rejected": -2.350893974304199, "logps/chosen": -262.9915771484375, "logps/rejected": -261.4454650878906, "loss": 0.0059, "rewards/accuracies": 0.9449998736381531, "rewards/chosen": 0.024703029543161392, "rewards/margins": 0.04311930388212204, "rewards/rejected": -0.018416276201605797, "step": 1100 }, { "epoch": 4.32, "eval_logits/chosen": -2.410806894302368, "eval_logits/rejected": -2.3743793964385986, "eval_logps/chosen": -266.8759765625, "eval_logps/rejected": -242.3739471435547, "eval_loss": 0.8903548717498779, "eval_rewards/accuracies": 0.6577380895614624, "eval_rewards/chosen": 0.008026321418583393, "eval_rewards/margins": 0.007816384546458721, "eval_rewards/rejected": 0.00020993576617911458, "eval_runtime": 123.126, "eval_samples_per_second": 16.244, "eval_steps_per_second": 0.341, "step": 1100 }, { "epoch": 4.36, "grad_norm": 33.5, "learning_rate": 2.378738584690926e-07, "logits/chosen": -2.3809187412261963, "logits/rejected": -2.345142126083374, "logps/chosen": -274.513916015625, "logps/rejected": -259.05767822265625, "loss": 0.0073, "rewards/accuracies": 0.9474999308586121, "rewards/chosen": 0.024536920711398125, "rewards/margins": 0.0396781824529171, "rewards/rejected": -0.015141261741518974, "step": 1110 }, { "epoch": 4.4, "grad_norm": 6.09375, "learning_rate": 2.0947847845787073e-07, "logits/chosen": -2.380807638168335, "logits/rejected": -2.3750548362731934, "logps/chosen": -273.5718688964844, "logps/rejected": -271.99774169921875, "loss": 0.0056, "rewards/accuracies": 0.9699999690055847, "rewards/chosen": 0.025182534009218216, "rewards/margins": 0.04230727255344391, "rewards/rejected": -0.017124736681580544, "step": 1120 }, { "epoch": 4.44, "grad_norm": 12.75, "learning_rate": 1.828133702096152e-07, "logits/chosen": -2.4007954597473145, "logits/rejected": -2.3510959148406982, "logps/chosen": -297.80023193359375, "logps/rejected": -267.09271240234375, "loss": 0.0058, "rewards/accuracies": 0.9350000619888306, "rewards/chosen": 0.027246862649917603, "rewards/margins": 0.04566134512424469, "rewards/rejected": -0.01841447874903679, "step": 1130 }, { "epoch": 4.47, "grad_norm": 6.8125, "learning_rate": 1.5789867668453224e-07, "logits/chosen": -2.359222650527954, "logits/rejected": -2.316779613494873, "logps/chosen": -256.35321044921875, "logps/rejected": -244.2194366455078, "loss": 0.005, "rewards/accuracies": 0.940000057220459, "rewards/chosen": 0.02366521954536438, "rewards/margins": 0.03908833488821983, "rewards/rejected": -0.015423113480210304, "step": 1140 }, { "epoch": 4.51, "grad_norm": 10.625, "learning_rate": 1.3475321857052387e-07, "logits/chosen": -2.3972084522247314, "logits/rejected": -2.3675570487976074, "logps/chosen": -278.0527038574219, "logps/rejected": -256.3372497558594, "loss": 0.0053, "rewards/accuracies": 0.9575001001358032, "rewards/chosen": 0.025035608559846878, "rewards/margins": 0.039394162595272064, "rewards/rejected": -0.014358552172780037, "step": 1150 }, { "epoch": 4.55, "grad_norm": 24.5, "learning_rate": 1.1339448006594284e-07, "logits/chosen": -2.3742661476135254, "logits/rejected": -2.3620691299438477, "logps/chosen": -269.88934326171875, "logps/rejected": -263.372314453125, "loss": 0.0056, "rewards/accuracies": 0.9574999809265137, "rewards/chosen": 0.02747185155749321, "rewards/margins": 0.04529104381799698, "rewards/rejected": -0.01781919226050377, "step": 1160 }, { "epoch": 4.59, "grad_norm": 15.125, "learning_rate": 9.383859567194148e-08, "logits/chosen": -2.4092886447906494, "logits/rejected": -2.3820230960845947, "logps/chosen": -292.05755615234375, "logps/rejected": -274.00262451171875, "loss": 0.0076, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": 0.02904806099832058, "rewards/margins": 0.04705999046564102, "rewards/rejected": -0.018011927604675293, "step": 1170 }, { "epoch": 4.63, "grad_norm": 6.125, "learning_rate": 7.610033800438343e-08, "logits/chosen": -2.404353141784668, "logits/rejected": -2.3535656929016113, "logps/chosen": -279.27001953125, "logps/rejected": -261.3420104980469, "loss": 0.0036, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.023024918511509895, "rewards/margins": 0.038382213562726974, "rewards/rejected": -0.015357298776507378, "step": 1180 }, { "epoch": 4.67, "grad_norm": 15.3125, "learning_rate": 6.019310663453654e-08, "logits/chosen": -2.379361867904663, "logits/rejected": -2.348564624786377, "logps/chosen": -272.57965087890625, "logps/rejected": -280.6869201660156, "loss": 0.0045, "rewards/accuracies": 0.9474999308586121, "rewards/chosen": 0.02704106830060482, "rewards/margins": 0.04761399328708649, "rewards/rejected": -0.02057291939854622, "step": 1190 }, { "epoch": 4.71, "grad_norm": 6.5625, "learning_rate": 4.6128917966964394e-08, "logits/chosen": -2.3975164890289307, "logits/rejected": -2.354431390762329, "logps/chosen": -264.81683349609375, "logps/rejected": -240.6919403076172, "loss": 0.0042, "rewards/accuracies": 0.9375, "rewards/chosen": 0.023663988336920738, "rewards/margins": 0.037333834916353226, "rewards/rejected": -0.013669842854142189, "step": 1200 }, { "epoch": 4.71, "eval_logits/chosen": -2.4119250774383545, "eval_logits/rejected": -2.375300407409668, "eval_logps/chosen": -266.87713623046875, "eval_logps/rejected": -242.38916015625, "eval_loss": 0.8778771162033081, "eval_rewards/accuracies": 0.6517857313156128, "eval_rewards/chosen": 0.00801478698849678, "eval_rewards/margins": 0.007957086898386478, "eval_rewards/rejected": 5.7699922763276845e-05, "eval_runtime": 123.014, "eval_samples_per_second": 16.258, "eval_steps_per_second": 0.341, "step": 1200 }, { "epoch": 4.75, "grad_norm": 27.0, "learning_rate": 3.3918396162275214e-08, "logits/chosen": -2.429567337036133, "logits/rejected": -2.401862621307373, "logps/chosen": -265.3700866699219, "logps/rejected": -255.75082397460938, "loss": 0.0077, "rewards/accuracies": 0.9475001096725464, "rewards/chosen": 0.02286478877067566, "rewards/margins": 0.037350136786699295, "rewards/rejected": -0.01448534894734621, "step": 1210 }, { "epoch": 4.79, "grad_norm": 29.625, "learning_rate": 2.3570765111574357e-08, "logits/chosen": -2.420926570892334, "logits/rejected": -2.3869175910949707, "logps/chosen": -275.02593994140625, "logps/rejected": -250.9207000732422, "loss": 0.0076, "rewards/accuracies": 0.9325000643730164, "rewards/chosen": 0.025391753762960434, "rewards/margins": 0.04114841669797897, "rewards/rejected": -0.01575666293501854, "step": 1220 }, { "epoch": 4.83, "grad_norm": 12.8125, "learning_rate": 1.5093841468690473e-08, "logits/chosen": -2.378108501434326, "logits/rejected": -2.3375566005706787, "logps/chosen": -278.16180419921875, "logps/rejected": -249.80557250976562, "loss": 0.0043, "rewards/accuracies": 0.9550000429153442, "rewards/chosen": 0.02722669579088688, "rewards/margins": 0.042906779795885086, "rewards/rejected": -0.015680085867643356, "step": 1230 }, { "epoch": 4.87, "grad_norm": 6.65625, "learning_rate": 8.494028745434368e-09, "logits/chosen": -2.4071974754333496, "logits/rejected": -2.3638107776641846, "logps/chosen": -272.5426940917969, "logps/rejected": -254.20095825195312, "loss": 0.0031, "rewards/accuracies": 0.9424999952316284, "rewards/chosen": 0.024697447195649147, "rewards/margins": 0.0481376014649868, "rewards/rejected": -0.023440156131982803, "step": 1240 }, { "epoch": 4.91, "grad_norm": 13.625, "learning_rate": 3.776312474353394e-09, "logits/chosen": -2.382949113845825, "logits/rejected": -2.3320465087890625, "logps/chosen": -262.14154052734375, "logps/rejected": -247.5254669189453, "loss": 0.0039, "rewards/accuracies": 0.9550000429153442, "rewards/chosen": 0.022636910900473595, "rewards/margins": 0.03510580584406853, "rewards/rejected": -0.012468894943594933, "step": 1250 }, { "epoch": 4.95, "grad_norm": 6.78125, "learning_rate": 9.442564426342949e-10, "logits/chosen": -2.3739681243896484, "logits/rejected": -2.3734383583068848, "logps/chosen": -248.2006072998047, "logps/rejected": -255.94161987304688, "loss": 0.0033, "rewards/accuracies": 0.9275000691413879, "rewards/chosen": 0.025133823975920677, "rewards/margins": 0.04240426793694496, "rewards/rejected": -0.017270449548959732, "step": 1260 }, { "epoch": 4.99, "grad_norm": 13.1875, "learning_rate": 0.0, "logits/chosen": -2.393690586090088, "logits/rejected": -2.3561182022094727, "logps/chosen": -263.6891174316406, "logps/rejected": -243.8829345703125, "loss": 0.0063, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.024886978790163994, "rewards/margins": 0.0397338829934597, "rewards/rejected": -0.014846903271973133, "step": 1270 }, { "epoch": 4.99, "step": 1270, "total_flos": 0.0, "train_loss": 0.164279118501603, "train_runtime": 43545.4617, "train_samples_per_second": 7.02, "train_steps_per_second": 0.029 } ], "logging_steps": 10, "max_steps": 1270, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 10, "trial_name": null, "trial_params": null }