{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9968602825745683, "eval_steps": 100, "global_step": 954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 1.0416666666666667e-07, "logits/chosen": 0.29368966817855835, "logits/rejected": 0.3178113102912903, "logps/chosen": -295.21783447265625, "logps/rejected": -290.84619140625, "loss": 0.6933, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.00023447822604794055, "rewards/margins": 0.00020264319027774036, "rewards/rejected": 3.183506123605184e-05, "step": 10 }, { "epoch": 0.04, "learning_rate": 2.0833333333333333e-07, "logits/chosen": 0.25306791067123413, "logits/rejected": 0.3252382278442383, "logps/chosen": -318.19073486328125, "logps/rejected": -289.6706237792969, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00048495858209207654, "rewards/margins": 0.0009848512709140778, "rewards/rejected": -0.0004998926888220012, "step": 20 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 0.33226653933525085, "logits/rejected": 0.3872108459472656, "logps/chosen": -296.3697204589844, "logps/rejected": -283.0611877441406, "loss": 0.692, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0008094090735539794, "rewards/margins": 0.0026363185606896877, "rewards/rejected": -0.003445727750658989, "step": 30 }, { "epoch": 0.08, "learning_rate": 4.1666666666666667e-07, "logits/chosen": 0.22801101207733154, "logits/rejected": 0.32900214195251465, "logps/chosen": -305.9015197753906, "logps/rejected": -293.1842346191406, "loss": 0.6883, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0027595984283834696, "rewards/margins": 0.006589935161173344, "rewards/rejected": -0.00934953335672617, "step": 40 }, { "epoch": 0.1, "learning_rate": 5.208333333333334e-07, "logits/chosen": 0.27091675996780396, "logits/rejected": 0.31866759061813354, "logps/chosen": -314.2833557128906, "logps/rejected": -307.02532958984375, "loss": 0.683, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0060192132368683815, "rewards/margins": 0.019419629126787186, "rewards/rejected": -0.025438839569687843, "step": 50 }, { "epoch": 0.13, "learning_rate": 6.249999999999999e-07, "logits/chosen": 0.31704145669937134, "logits/rejected": 0.4334793984889984, "logps/chosen": -294.429931640625, "logps/rejected": -272.87994384765625, "loss": 0.6735, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.022902976721525192, "rewards/margins": 0.04408121109008789, "rewards/rejected": -0.06698418408632278, "step": 60 }, { "epoch": 0.15, "learning_rate": 7.291666666666666e-07, "logits/chosen": 0.31964099407196045, "logits/rejected": 0.3377896547317505, "logps/chosen": -304.6803894042969, "logps/rejected": -311.32794189453125, "loss": 0.6602, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.05670947954058647, "rewards/margins": 0.05069820210337639, "rewards/rejected": -0.10740767419338226, "step": 70 }, { "epoch": 0.17, "learning_rate": 8.333333333333333e-07, "logits/chosen": 0.35048729181289673, "logits/rejected": 0.4193252921104431, "logps/chosen": -306.3404541015625, "logps/rejected": -282.4783020019531, "loss": 0.6474, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.10853584855794907, "rewards/margins": 0.13117292523384094, "rewards/rejected": -0.2397087812423706, "step": 80 }, { "epoch": 0.19, "learning_rate": 9.374999999999999e-07, "logits/chosen": 0.32813602685928345, "logits/rejected": 0.4464220404624939, "logps/chosen": -341.1703186035156, "logps/rejected": -299.92340087890625, "loss": 0.6357, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2554694712162018, "rewards/margins": 0.12284588813781738, "rewards/rejected": -0.37831538915634155, "step": 90 }, { "epoch": 0.21, "learning_rate": 9.999463737538052e-07, "logits/chosen": 0.35799938440322876, "logits/rejected": 0.3899138271808624, "logps/chosen": -318.7712097167969, "logps/rejected": -348.5688781738281, "loss": 0.6201, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3124231994152069, "rewards/margins": 0.19829413294792175, "rewards/rejected": -0.5107173323631287, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": 0.3705582916736603, "eval_logits/rejected": 0.4172414541244507, "eval_logps/chosen": -331.11236572265625, "eval_logps/rejected": -378.240478515625, "eval_loss": 0.6252639293670654, "eval_rewards/accuracies": 0.703125, "eval_rewards/chosen": -0.27527713775634766, "eval_rewards/margins": 0.3908771872520447, "eval_rewards/rejected": -0.6661543846130371, "eval_runtime": 64.993, "eval_samples_per_second": 30.773, "eval_steps_per_second": 0.492, "step": 100 }, { "epoch": 0.23, "learning_rate": 9.993432105822034e-07, "logits/chosen": 0.3002661168575287, "logits/rejected": 0.3483879864215851, "logps/chosen": -350.9095458984375, "logps/rejected": -360.30963134765625, "loss": 0.6093, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.37449145317077637, "rewards/margins": 0.3816668689250946, "rewards/rejected": -0.7561584115028381, "step": 110 }, { "epoch": 0.25, "learning_rate": 9.980706626858607e-07, "logits/chosen": 0.2088731825351715, "logits/rejected": 0.2899537980556488, "logps/chosen": -395.3984069824219, "logps/rejected": -433.7286682128906, "loss": 0.5905, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4185329079627991, "rewards/margins": 0.4517739713191986, "rewards/rejected": -0.8703069686889648, "step": 120 }, { "epoch": 0.27, "learning_rate": 9.961304359538434e-07, "logits/chosen": 0.09751267731189728, "logits/rejected": 0.22797170281410217, "logps/chosen": -374.6502380371094, "logps/rejected": -373.14263916015625, "loss": 0.5998, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4483565390110016, "rewards/margins": 0.3120550811290741, "rewards/rejected": -0.7604116201400757, "step": 130 }, { "epoch": 0.29, "learning_rate": 9.935251313189563e-07, "logits/chosen": 0.23218217492103577, "logits/rejected": 0.28826406598091125, "logps/chosen": -363.15338134765625, "logps/rejected": -381.83795166015625, "loss": 0.5847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.49422675371170044, "rewards/margins": 0.38399404287338257, "rewards/rejected": -0.8782208561897278, "step": 140 }, { "epoch": 0.31, "learning_rate": 9.902582412711118e-07, "logits/chosen": 0.29499703645706177, "logits/rejected": 0.49714046716690063, "logps/chosen": -342.9471130371094, "logps/rejected": -372.1763610839844, "loss": 0.5681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6186890602111816, "rewards/margins": 0.5347500443458557, "rewards/rejected": -1.1534390449523926, "step": 150 }, { "epoch": 0.33, "learning_rate": 9.86334145175542e-07, "logits/chosen": 0.35773637890815735, "logits/rejected": 0.4941268861293793, "logps/chosen": -370.3203125, "logps/rejected": -428.2557067871094, "loss": 0.5792, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7682583928108215, "rewards/margins": 0.6092099547386169, "rewards/rejected": -1.377468228340149, "step": 160 }, { "epoch": 0.36, "learning_rate": 9.817581034021272e-07, "logits/chosen": 0.19401590526103973, "logits/rejected": 0.3156794607639313, "logps/chosen": -415.65313720703125, "logps/rejected": -425.68963623046875, "loss": 0.5747, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5680743455886841, "rewards/margins": 0.4286484122276306, "rewards/rejected": -0.9967228174209595, "step": 170 }, { "epoch": 0.38, "learning_rate": 9.765362502737097e-07, "logits/chosen": 0.2572034001350403, "logits/rejected": 0.27328386902809143, "logps/chosen": -358.0533447265625, "logps/rejected": -398.5332946777344, "loss": 0.5634, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.644694447517395, "rewards/margins": 0.5251447558403015, "rewards/rejected": -1.1698391437530518, "step": 180 }, { "epoch": 0.4, "learning_rate": 9.706755858428485e-07, "logits/chosen": 0.3962785303592682, "logits/rejected": 0.4539657235145569, "logps/chosen": -347.5164794921875, "logps/rejected": -378.83184814453125, "loss": 0.5423, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7963652610778809, "rewards/margins": 0.5720622539520264, "rewards/rejected": -1.3684275150299072, "step": 190 }, { "epoch": 0.42, "learning_rate": 9.641839665080363e-07, "logits/chosen": 0.3198946714401245, "logits/rejected": 0.4063253402709961, "logps/chosen": -352.0743713378906, "logps/rejected": -419.7388610839844, "loss": 0.5547, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7651049494743347, "rewards/margins": 0.6531444787979126, "rewards/rejected": -1.418249487876892, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": 0.3908616304397583, "eval_logits/rejected": 0.4261176884174347, "eval_logps/chosen": -373.46612548828125, "eval_logps/rejected": -458.88629150390625, "eval_loss": 0.5549300312995911, "eval_rewards/accuracies": 0.765625, "eval_rewards/chosen": -0.6988146305084229, "eval_rewards/margins": 0.7737974524497986, "eval_rewards/rejected": -1.4726121425628662, "eval_runtime": 65.2313, "eval_samples_per_second": 30.66, "eval_steps_per_second": 0.491, "step": 200 }, { "epoch": 0.44, "learning_rate": 9.570700944819582e-07, "logits/chosen": 0.3505176901817322, "logits/rejected": 0.42375579476356506, "logps/chosen": -397.61199951171875, "logps/rejected": -454.89776611328125, "loss": 0.5265, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7523492574691772, "rewards/margins": 0.7344074845314026, "rewards/rejected": -1.486756682395935, "step": 210 }, { "epoch": 0.46, "learning_rate": 9.493435061259129e-07, "logits/chosen": 0.2726442813873291, "logits/rejected": 0.4434526860713959, "logps/chosen": -410.77667236328125, "logps/rejected": -431.207275390625, "loss": 0.555, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8984044194221497, "rewards/margins": 0.5839776992797852, "rewards/rejected": -1.48238205909729, "step": 220 }, { "epoch": 0.48, "learning_rate": 9.4101455916603e-07, "logits/chosen": 0.28837597370147705, "logits/rejected": 0.35526323318481445, "logps/chosen": -363.0335998535156, "logps/rejected": -436.46612548828125, "loss": 0.5498, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8101651072502136, "rewards/margins": 0.737695574760437, "rewards/rejected": -1.5478605031967163, "step": 230 }, { "epoch": 0.5, "learning_rate": 9.320944188084241e-07, "logits/chosen": 0.23826150596141815, "logits/rejected": 0.285171240568161, "logps/chosen": -432.6297912597656, "logps/rejected": -497.56341552734375, "loss": 0.5392, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7939236164093018, "rewards/margins": 0.7111212611198425, "rewards/rejected": -1.505044937133789, "step": 240 }, { "epoch": 0.52, "learning_rate": 9.225950427718974e-07, "logits/chosen": 0.2762988209724426, "logits/rejected": 0.31130319833755493, "logps/chosen": -399.399169921875, "logps/rejected": -445.702880859375, "loss": 0.5465, "rewards/accuracies": 0.6875, "rewards/chosen": -0.83611661195755, "rewards/margins": 0.7073522806167603, "rewards/rejected": -1.5434690713882446, "step": 250 }, { "epoch": 0.54, "learning_rate": 9.125291652582547e-07, "logits/chosen": 0.1327328383922577, "logits/rejected": 0.3085227310657501, "logps/chosen": -436.08135986328125, "logps/rejected": -457.634765625, "loss": 0.5194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9428914785385132, "rewards/margins": 0.6950392723083496, "rewards/rejected": -1.6379308700561523, "step": 260 }, { "epoch": 0.57, "learning_rate": 9.019102798817195e-07, "logits/chosen": 0.23745720088481903, "logits/rejected": 0.34172096848487854, "logps/chosen": -421.8299865722656, "logps/rejected": -466.4856872558594, "loss": 0.5496, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9325121641159058, "rewards/margins": 0.7680062651634216, "rewards/rejected": -1.7005186080932617, "step": 270 }, { "epoch": 0.59, "learning_rate": 8.90752621580335e-07, "logits/chosen": 0.16251161694526672, "logits/rejected": 0.2581509053707123, "logps/chosen": -418.5828552246094, "logps/rejected": -507.22412109375, "loss": 0.5168, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.113261342048645, "rewards/margins": 0.7757157683372498, "rewards/rejected": -1.88897705078125, "step": 280 }, { "epoch": 0.61, "learning_rate": 8.79071147533597e-07, "logits/chosen": 0.18345972895622253, "logits/rejected": 0.24752414226531982, "logps/chosen": -374.2388916015625, "logps/rejected": -421.7548828125, "loss": 0.5452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7729798555374146, "rewards/margins": 0.6754422187805176, "rewards/rejected": -1.448421835899353, "step": 290 }, { "epoch": 0.63, "learning_rate": 8.668815171119019e-07, "logits/chosen": 0.11917382478713989, "logits/rejected": 0.2862890362739563, "logps/chosen": -396.6165466308594, "logps/rejected": -400.9344787597656, "loss": 0.5343, "rewards/accuracies": 0.75, "rewards/chosen": -0.8496394157409668, "rewards/margins": 0.5671547055244446, "rewards/rejected": -1.4167941808700562, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": 0.24486932158470154, "eval_logits/rejected": 0.2851215898990631, "eval_logps/chosen": -384.0199279785156, "eval_logps/rejected": -476.3627624511719, "eval_loss": 0.5315821766853333, "eval_rewards/accuracies": 0.765625, "eval_rewards/chosen": -0.8043524026870728, "eval_rewards/margins": 0.8430246114730835, "eval_rewards/rejected": -1.6473771333694458, "eval_runtime": 64.8898, "eval_samples_per_second": 30.822, "eval_steps_per_second": 0.493, "step": 300 }, { "epoch": 0.65, "learning_rate": 8.54200070884685e-07, "logits/chosen": 0.21563191711902618, "logits/rejected": 0.1952591836452484, "logps/chosen": -438.91552734375, "logps/rejected": -499.61395263671875, "loss": 0.5567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.173762559890747, "rewards/margins": 0.5995947122573853, "rewards/rejected": -1.7733571529388428, "step": 310 }, { "epoch": 0.67, "learning_rate": 8.410438087153911e-07, "logits/chosen": 0.031750187277793884, "logits/rejected": 0.14312420785427094, "logps/chosen": -457.0936584472656, "logps/rejected": -459.15411376953125, "loss": 0.5366, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0222933292388916, "rewards/margins": 0.4668423533439636, "rewards/rejected": -1.4891356229782104, "step": 320 }, { "epoch": 0.69, "learning_rate": 8.274303669726426e-07, "logits/chosen": 0.07745673507452011, "logits/rejected": 0.07082104682922363, "logps/chosen": -399.32464599609375, "logps/rejected": -523.4277954101562, "loss": 0.532, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9549884796142578, "rewards/margins": 0.8862001299858093, "rewards/rejected": -1.8411887884140015, "step": 330 }, { "epoch": 0.71, "learning_rate": 8.133779948881513e-07, "logits/chosen": 0.10294970124959946, "logits/rejected": 0.09352216869592667, "logps/chosen": -409.33770751953125, "logps/rejected": -510.10089111328125, "loss": 0.5217, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1990526914596558, "rewards/margins": 0.7596696615219116, "rewards/rejected": -1.958722472190857, "step": 340 }, { "epoch": 0.73, "learning_rate": 7.989055300930704e-07, "logits/chosen": 0.16737070679664612, "logits/rejected": 0.21901166439056396, "logps/chosen": -437.2059020996094, "logps/rejected": -538.3087158203125, "loss": 0.5091, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2976617813110352, "rewards/margins": 0.8197441101074219, "rewards/rejected": -2.117405891418457, "step": 350 }, { "epoch": 0.75, "learning_rate": 7.840323733655778e-07, "logits/chosen": 0.1267612874507904, "logits/rejected": 0.20460394024848938, "logps/chosen": -465.1019592285156, "logps/rejected": -518.5805053710938, "loss": 0.5185, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.174440860748291, "rewards/margins": 0.9074214696884155, "rewards/rejected": -2.081862211227417, "step": 360 }, { "epoch": 0.77, "learning_rate": 7.687784626235447e-07, "logits/chosen": 0.18740633130073547, "logits/rejected": 0.27840983867645264, "logps/chosen": -468.21807861328125, "logps/rejected": -493.06646728515625, "loss": 0.5158, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1525232791900635, "rewards/margins": 0.7130603790283203, "rewards/rejected": -1.8655836582183838, "step": 370 }, { "epoch": 0.8, "learning_rate": 7.531642461971514e-07, "logits/chosen": 0.15459200739860535, "logits/rejected": 0.23283176124095917, "logps/chosen": -453.07904052734375, "logps/rejected": -535.761474609375, "loss": 0.5392, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2102587223052979, "rewards/margins": 1.0239769220352173, "rewards/rejected": -2.2342355251312256, "step": 380 }, { "epoch": 0.82, "learning_rate": 7.372106554172801e-07, "logits/chosen": 0.19951777160167694, "logits/rejected": 0.2895793318748474, "logps/chosen": -411.4781799316406, "logps/rejected": -466.41253662109375, "loss": 0.5356, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8807674646377563, "rewards/margins": 0.7255697846412659, "rewards/rejected": -1.606337308883667, "step": 390 }, { "epoch": 0.84, "learning_rate": 7.209390765564318e-07, "logits/chosen": 0.1757899820804596, "logits/rejected": 0.23791635036468506, "logps/chosen": -390.8957214355469, "logps/rejected": -478.49407958984375, "loss": 0.5323, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9380720257759094, "rewards/margins": 0.7229949235916138, "rewards/rejected": -1.661067008972168, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": 0.25136542320251465, "eval_logits/rejected": 0.28342366218566895, "eval_logps/chosen": -394.2620849609375, "eval_logps/rejected": -494.46002197265625, "eval_loss": 0.521114706993103, "eval_rewards/accuracies": 0.78125, "eval_rewards/chosen": -0.90677410364151, "eval_rewards/margins": 0.9215754270553589, "eval_rewards/rejected": -1.8283497095108032, "eval_runtime": 65.2477, "eval_samples_per_second": 30.652, "eval_steps_per_second": 0.49, "step": 400 }, { "epoch": 0.86, "learning_rate": 7.043713221597773e-07, "logits/chosen": 0.1401471644639969, "logits/rejected": 0.17819848656654358, "logps/chosen": -401.8942565917969, "logps/rejected": -474.2579650878906, "loss": 0.5225, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8892021179199219, "rewards/margins": 0.906479001045227, "rewards/rejected": -1.7956812381744385, "step": 410 }, { "epoch": 0.88, "learning_rate": 6.875296018047809e-07, "logits/chosen": 0.1304786652326584, "logits/rejected": 0.15569528937339783, "logps/chosen": -400.5438537597656, "logps/rejected": -468.2953186035156, "loss": 0.5065, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9208317995071411, "rewards/margins": 0.9665653109550476, "rewards/rejected": -1.887397050857544, "step": 420 }, { "epoch": 0.9, "learning_rate": 6.704364923285857e-07, "logits/chosen": 0.09775003790855408, "logits/rejected": 0.1501173973083496, "logps/chosen": -476.73065185546875, "logps/rejected": -533.7389526367188, "loss": 0.5285, "rewards/accuracies": 0.71875, "rewards/chosen": -1.26286780834198, "rewards/margins": 0.8789494633674622, "rewards/rejected": -2.141817569732666, "step": 430 }, { "epoch": 0.92, "learning_rate": 6.531149075630796e-07, "logits/chosen": 0.1202029138803482, "logits/rejected": 0.1756385862827301, "logps/chosen": -400.1629333496094, "logps/rejected": -469.0196838378906, "loss": 0.5074, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0582826137542725, "rewards/margins": 0.9171028137207031, "rewards/rejected": -1.975385308265686, "step": 440 }, { "epoch": 0.94, "learning_rate": 6.355880676182085e-07, "logits/chosen": 0.01758761703968048, "logits/rejected": 0.15642888844013214, "logps/chosen": -443.065185546875, "logps/rejected": -484.41473388671875, "loss": 0.5295, "rewards/accuracies": 0.6875, "rewards/chosen": -1.004433035850525, "rewards/margins": 0.8090255856513977, "rewards/rejected": -1.8134586811065674, "step": 450 }, { "epoch": 0.96, "learning_rate": 6.178794677547137e-07, "logits/chosen": 0.012063628062605858, "logits/rejected": 0.10572747141122818, "logps/chosen": -408.4104919433594, "logps/rejected": -449.46881103515625, "loss": 0.5267, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0622318983078003, "rewards/margins": 0.6900007724761963, "rewards/rejected": -1.752232551574707, "step": 460 }, { "epoch": 0.98, "learning_rate": 6.000128468880222e-07, "logits/chosen": -0.05889149755239487, "logits/rejected": -0.014351313933730125, "logps/chosen": -449.07061767578125, "logps/rejected": -538.423828125, "loss": 0.5042, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1122030019760132, "rewards/margins": 0.9293983578681946, "rewards/rejected": -2.0416014194488525, "step": 470 }, { "epoch": 1.0, "learning_rate": 5.820121557655108e-07, "logits/chosen": -0.045923542231321335, "logits/rejected": 0.03086056187748909, "logps/chosen": -422.5347595214844, "logps/rejected": -529.2613525390625, "loss": 0.4682, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.275895118713379, "rewards/margins": 1.037326455116272, "rewards/rejected": -2.3132214546203613, "step": 480 }, { "epoch": 1.03, "learning_rate": 5.639015248598023e-07, "logits/chosen": -0.05794327333569527, "logits/rejected": -0.09440571069717407, "logps/chosen": -441.08990478515625, "logps/rejected": -601.30419921875, "loss": 0.3493, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.3565930128097534, "rewards/margins": 1.8090267181396484, "rewards/rejected": -3.1656198501586914, "step": 490 }, { "epoch": 1.05, "learning_rate": 5.457052320211339e-07, "logits/chosen": -0.23803594708442688, "logits/rejected": -0.29828980565071106, "logps/chosen": -474.3367614746094, "logps/rejected": -656.8623046875, "loss": 0.352, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.730020523071289, "rewards/margins": 1.8847625255584717, "rewards/rejected": -3.6147830486297607, "step": 500 }, { "epoch": 1.05, "eval_logits/chosen": -0.06535135954618454, "eval_logits/rejected": -0.08458372950553894, "eval_logps/chosen": -498.9117431640625, "eval_logps/rejected": -653.2899169921875, "eval_loss": 0.525809109210968, "eval_rewards/accuracies": 0.796875, "eval_rewards/chosen": -1.9532711505889893, "eval_rewards/margins": 1.4633771181106567, "eval_rewards/rejected": -3.4166483879089355, "eval_runtime": 64.5447, "eval_samples_per_second": 30.986, "eval_steps_per_second": 0.496, "step": 500 }, { "epoch": 1.07, "learning_rate": 5.274476699321637e-07, "logits/chosen": -0.17468394339084625, "logits/rejected": -0.19271844625473022, "logps/chosen": -464.8907165527344, "logps/rejected": -632.8477172851562, "loss": 0.3423, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.695770263671875, "rewards/margins": 1.8958046436309814, "rewards/rejected": -3.5915749073028564, "step": 510 }, { "epoch": 1.09, "learning_rate": 5.091533134088387e-07, "logits/chosen": -0.23788562417030334, "logits/rejected": -0.20414999127388, "logps/chosen": -485.94415283203125, "logps/rejected": -640.9654541015625, "loss": 0.3236, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8264538049697876, "rewards/margins": 1.7633371353149414, "rewards/rejected": -3.5897908210754395, "step": 520 }, { "epoch": 1.11, "learning_rate": 4.908466865911614e-07, "logits/chosen": -0.11999205499887466, "logits/rejected": -0.09423510730266571, "logps/chosen": -487.30810546875, "logps/rejected": -646.9547119140625, "loss": 0.3424, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7824156284332275, "rewards/margins": 1.7626768350601196, "rewards/rejected": -3.5450921058654785, "step": 530 }, { "epoch": 1.13, "learning_rate": 4.7255233006783624e-07, "logits/chosen": -0.16903451085090637, "logits/rejected": -0.06715662032365799, "logps/chosen": -530.5585327148438, "logps/rejected": -645.1633911132812, "loss": 0.3301, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7870140075683594, "rewards/margins": 1.8271121978759766, "rewards/rejected": -3.614126682281494, "step": 540 }, { "epoch": 1.15, "learning_rate": 4.5429476797886617e-07, "logits/chosen": -0.03496643900871277, "logits/rejected": -0.04560618847608566, "logps/chosen": -483.15203857421875, "logps/rejected": -634.9447631835938, "loss": 0.3501, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8454551696777344, "rewards/margins": 1.5336250066757202, "rewards/rejected": -3.379080295562744, "step": 550 }, { "epoch": 1.17, "learning_rate": 4.3609847514019763e-07, "logits/chosen": 0.06872721016407013, "logits/rejected": 0.0859331339597702, "logps/chosen": -506.5009765625, "logps/rejected": -646.7543334960938, "loss": 0.3359, "rewards/accuracies": 0.875, "rewards/chosen": -1.7461020946502686, "rewards/margins": 1.8654359579086304, "rewards/rejected": -3.6115379333496094, "step": 560 }, { "epoch": 1.19, "learning_rate": 4.179878442344892e-07, "logits/chosen": -0.04321649298071861, "logits/rejected": 0.051123034209012985, "logps/chosen": -496.735595703125, "logps/rejected": -647.29150390625, "loss": 0.3464, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.0714824199676514, "rewards/margins": 1.762717843055725, "rewards/rejected": -3.834200620651245, "step": 570 }, { "epoch": 1.21, "learning_rate": 3.9998715311197783e-07, "logits/chosen": -0.004042728338390589, "logits/rejected": 0.04070080816745758, "logps/chosen": -512.9417724609375, "logps/rejected": -696.821044921875, "loss": 0.3365, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.097195863723755, "rewards/margins": 1.831916093826294, "rewards/rejected": -3.9291114807128906, "step": 580 }, { "epoch": 1.23, "learning_rate": 3.821205322452863e-07, "logits/chosen": 0.09215477854013443, "logits/rejected": 0.05737446993589401, "logps/chosen": -525.1434936523438, "logps/rejected": -705.9063110351562, "loss": 0.3158, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.0721964836120605, "rewards/margins": 1.9721500873565674, "rewards/rejected": -4.044346809387207, "step": 590 }, { "epoch": 1.26, "learning_rate": 3.6441193238179146e-07, "logits/chosen": 0.07227401435375214, "logits/rejected": 0.03988388180732727, "logps/chosen": -549.4615478515625, "logps/rejected": -771.7275390625, "loss": 0.3342, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.2555530071258545, "rewards/margins": 2.014453649520874, "rewards/rejected": -4.2700066566467285, "step": 600 }, { "epoch": 1.26, "eval_logits/chosen": 0.1344175487756729, "eval_logits/rejected": 0.11280365288257599, "eval_logps/chosen": -534.8101196289062, "eval_logps/rejected": -684.085693359375, "eval_loss": 0.5267595648765564, "eval_rewards/accuracies": 0.79296875, "eval_rewards/chosen": -2.3122546672821045, "eval_rewards/margins": 1.412351369857788, "eval_rewards/rejected": -3.7246060371398926, "eval_runtime": 64.3013, "eval_samples_per_second": 31.104, "eval_steps_per_second": 0.498, "step": 600 }, { "epoch": 1.28, "learning_rate": 3.4688509243692034e-07, "logits/chosen": 0.021421348676085472, "logits/rejected": 0.08866464346647263, "logps/chosen": -535.4236450195312, "logps/rejected": -694.8360595703125, "loss": 0.344, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.009089946746826, "rewards/margins": 1.8043378591537476, "rewards/rejected": -3.8134284019470215, "step": 610 }, { "epoch": 1.3, "learning_rate": 3.295635076714144e-07, "logits/chosen": -0.06299210339784622, "logits/rejected": -0.04097691923379898, "logps/chosen": -499.0265197753906, "logps/rejected": -653.1188354492188, "loss": 0.3317, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.9393279552459717, "rewards/margins": 1.6952606439590454, "rewards/rejected": -3.6345887184143066, "step": 620 }, { "epoch": 1.32, "learning_rate": 3.12470398195219e-07, "logits/chosen": -0.013870243914425373, "logits/rejected": 0.0671583041548729, "logps/chosen": -512.1448974609375, "logps/rejected": -680.2833251953125, "loss": 0.3283, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9993736743927002, "rewards/margins": 1.9316644668579102, "rewards/rejected": -3.9310379028320312, "step": 630 }, { "epoch": 1.34, "learning_rate": 2.956286778402226e-07, "logits/chosen": -0.11906696856021881, "logits/rejected": -0.17396704852581024, "logps/chosen": -495.10345458984375, "logps/rejected": -677.45556640625, "loss": 0.319, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.2106266021728516, "rewards/margins": 1.9618394374847412, "rewards/rejected": -4.172466278076172, "step": 640 }, { "epoch": 1.36, "learning_rate": 2.7906092344356826e-07, "logits/chosen": -0.07961982488632202, "logits/rejected": -0.12522803246974945, "logps/chosen": -516.12890625, "logps/rejected": -696.549560546875, "loss": 0.3233, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.1828126907348633, "rewards/margins": 1.8567641973495483, "rewards/rejected": -4.039577007293701, "step": 650 }, { "epoch": 1.38, "learning_rate": 2.6278934458271996e-07, "logits/chosen": -0.15696656703948975, "logits/rejected": -0.12818947434425354, "logps/chosen": -568.789794921875, "logps/rejected": -722.9078369140625, "loss": 0.3273, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.1225264072418213, "rewards/margins": 1.9797760248184204, "rewards/rejected": -4.102302551269531, "step": 660 }, { "epoch": 1.4, "learning_rate": 2.468357538028487e-07, "logits/chosen": 0.01013887207955122, "logits/rejected": 0.029213298112154007, "logps/chosen": -541.7221069335938, "logps/rejected": -720.105712890625, "loss": 0.3153, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.205817461013794, "rewards/margins": 2.0757954120635986, "rewards/rejected": -4.281612396240234, "step": 670 }, { "epoch": 1.42, "learning_rate": 2.312215373764551e-07, "logits/chosen": -0.06487278640270233, "logits/rejected": -0.01965305209159851, "logps/chosen": -516.360107421875, "logps/rejected": -694.418212890625, "loss": 0.3262, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.184447765350342, "rewards/margins": 1.861135721206665, "rewards/rejected": -4.045583248138428, "step": 680 }, { "epoch": 1.44, "learning_rate": 2.1596762663442213e-07, "logits/chosen": -0.16129662096500397, "logits/rejected": -0.09581325948238373, "logps/chosen": -546.6666259765625, "logps/rejected": -722.95263671875, "loss": 0.3194, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1821353435516357, "rewards/margins": 2.131826877593994, "rewards/rejected": -4.313961982727051, "step": 690 }, { "epoch": 1.47, "learning_rate": 2.0109446990692963e-07, "logits/chosen": -0.11689990758895874, "logits/rejected": -0.2061731368303299, "logps/chosen": -537.8167724609375, "logps/rejected": -781.1466674804688, "loss": 0.337, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.2769298553466797, "rewards/margins": 2.1021370887756348, "rewards/rejected": -4.3790669441223145, "step": 700 }, { "epoch": 1.47, "eval_logits/chosen": 0.041396014392375946, "eval_logits/rejected": 0.009947247803211212, "eval_logps/chosen": -541.1116333007812, "eval_logps/rejected": -699.990966796875, "eval_loss": 0.5290427207946777, "eval_rewards/accuracies": 0.77734375, "eval_rewards/chosen": -2.375269651412964, "eval_rewards/margins": 1.5083887577056885, "eval_rewards/rejected": -3.883657932281494, "eval_runtime": 64.3756, "eval_samples_per_second": 31.068, "eval_steps_per_second": 0.497, "step": 700 }, { "epoch": 1.49, "learning_rate": 1.8662200511184872e-07, "logits/chosen": -0.05749096721410751, "logits/rejected": -0.05046076700091362, "logps/chosen": -522.4590454101562, "logps/rejected": -701.3648071289062, "loss": 0.3387, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.1689815521240234, "rewards/margins": 2.023318290710449, "rewards/rejected": -4.192299842834473, "step": 710 }, { "epoch": 1.51, "learning_rate": 1.725696330273575e-07, "logits/chosen": -0.12996384501457214, "logits/rejected": -0.20302283763885498, "logps/chosen": -513.8458251953125, "logps/rejected": -704.156982421875, "loss": 0.3363, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9350038766860962, "rewards/margins": 2.0845541954040527, "rewards/rejected": -4.019558429718018, "step": 720 }, { "epoch": 1.53, "learning_rate": 1.589561912846089e-07, "logits/chosen": -0.02087187021970749, "logits/rejected": -0.01297883689403534, "logps/chosen": -513.4478149414062, "logps/rejected": -715.5861206054688, "loss": 0.3311, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.01027512550354, "rewards/margins": 1.9698785543441772, "rewards/rejected": -3.9801535606384277, "step": 730 }, { "epoch": 1.55, "learning_rate": 1.4579992911531496e-07, "logits/chosen": -0.08378951251506805, "logits/rejected": -0.10333013534545898, "logps/chosen": -528.8599853515625, "logps/rejected": -697.0358276367188, "loss": 0.31, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.1352083683013916, "rewards/margins": 2.004941940307617, "rewards/rejected": -4.140150547027588, "step": 740 }, { "epoch": 1.57, "learning_rate": 1.3311848288809813e-07, "logits/chosen": -0.021279722452163696, "logits/rejected": -0.07772192358970642, "logps/chosen": -553.6995239257812, "logps/rejected": -696.1124267578125, "loss": 0.3325, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.2102222442626953, "rewards/margins": 1.8731515407562256, "rewards/rejected": -4.083374500274658, "step": 750 }, { "epoch": 1.59, "learning_rate": 1.209288524664029e-07, "logits/chosen": -0.1553444117307663, "logits/rejected": -0.07370997965335846, "logps/chosen": -559.6234130859375, "logps/rejected": -718.3536376953125, "loss": 0.3318, "rewards/accuracies": 0.84375, "rewards/chosen": -2.2175405025482178, "rewards/margins": 2.087991714477539, "rewards/rejected": -4.305531978607178, "step": 760 }, { "epoch": 1.61, "learning_rate": 1.0924737841966497e-07, "logits/chosen": 0.06289811432361603, "logits/rejected": 0.08451451361179352, "logps/chosen": -502.3817443847656, "logps/rejected": -655.4021606445312, "loss": 0.3065, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.215487241744995, "rewards/margins": 1.8723819255828857, "rewards/rejected": -4.087869644165039, "step": 770 }, { "epoch": 1.63, "learning_rate": 9.808972011828054e-08, "logits/chosen": -0.028034457936882973, "logits/rejected": -0.07584713399410248, "logps/chosen": -507.1363220214844, "logps/rejected": -684.7667236328125, "loss": 0.321, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.1353392601013184, "rewards/margins": 1.854984998703003, "rewards/rejected": -3.9903244972229004, "step": 780 }, { "epoch": 1.65, "learning_rate": 8.747083474174527e-08, "logits/chosen": -0.027467548847198486, "logits/rejected": -0.11992067098617554, "logps/chosen": -513.2483520507812, "logps/rejected": -773.5486450195312, "loss": 0.3106, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.409447431564331, "rewards/margins": 2.4787163734436035, "rewards/rejected": -4.8881635665893555, "step": 790 }, { "epoch": 1.67, "learning_rate": 7.740495722810269e-08, "logits/chosen": -0.22111931443214417, "logits/rejected": -0.16927292943000793, "logps/chosen": -594.322998046875, "logps/rejected": -767.4426879882812, "loss": 0.3398, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4693970680236816, "rewards/margins": 2.0875487327575684, "rewards/rejected": -4.55694580078125, "step": 800 }, { "epoch": 1.67, "eval_logits/chosen": 0.0749908834695816, "eval_logits/rejected": 0.0380852147936821, "eval_logps/chosen": -554.5546264648438, "eval_logps/rejected": -712.9505615234375, "eval_loss": 0.5297122001647949, "eval_rewards/accuracies": 0.7734375, "eval_rewards/chosen": -2.509699821472168, "eval_rewards/margins": 1.5035548210144043, "eval_rewards/rejected": -4.013254642486572, "eval_runtime": 65.3757, "eval_samples_per_second": 30.592, "eval_steps_per_second": 0.489, "step": 800 }, { "epoch": 1.7, "learning_rate": 6.790558119157597e-08, "logits/chosen": -0.03027234971523285, "logits/rejected": -0.026968132704496384, "logps/chosen": -558.7872924804688, "logps/rejected": -744.04736328125, "loss": 0.3134, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.323040246963501, "rewards/margins": 2.1539111137390137, "rewards/rejected": -4.476951599121094, "step": 810 }, { "epoch": 1.72, "learning_rate": 5.898544083397e-08, "logits/chosen": -0.06089891865849495, "logits/rejected": -0.11520856618881226, "logps/chosen": -516.7492065429688, "logps/rejected": -698.2511596679688, "loss": 0.3131, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.2953174114227295, "rewards/margins": 2.0010485649108887, "rewards/rejected": -4.296365737915039, "step": 820 }, { "epoch": 1.74, "learning_rate": 5.065649387408705e-08, "logits/chosen": -0.026049736887216568, "logits/rejected": -0.11464808881282806, "logps/chosen": -556.2135620117188, "logps/rejected": -718.17578125, "loss": 0.3145, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.405728816986084, "rewards/margins": 1.908630132675171, "rewards/rejected": -4.314358711242676, "step": 830 }, { "epoch": 1.76, "learning_rate": 4.292990551804171e-08, "logits/chosen": -0.12712730467319489, "logits/rejected": -0.10675887763500214, "logps/chosen": -529.3737182617188, "logps/rejected": -723.229248046875, "loss": 0.327, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.146822929382324, "rewards/margins": 2.177410125732422, "rewards/rejected": -4.324233055114746, "step": 840 }, { "epoch": 1.78, "learning_rate": 3.581603349196371e-08, "logits/chosen": 0.01494809053838253, "logits/rejected": -0.08690011501312256, "logps/chosen": -540.9631958007812, "logps/rejected": -735.3316650390625, "loss": 0.3262, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.145940065383911, "rewards/margins": 2.0986034870147705, "rewards/rejected": -4.24454402923584, "step": 850 }, { "epoch": 1.8, "learning_rate": 2.9324414157151367e-08, "logits/chosen": -0.10003119707107544, "logits/rejected": -0.13639459013938904, "logps/chosen": -504.34552001953125, "logps/rejected": -681.4578247070312, "loss": 0.3261, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.0967519283294678, "rewards/margins": 1.8127696514129639, "rewards/rejected": -3.9095215797424316, "step": 860 }, { "epoch": 1.82, "learning_rate": 2.3463749726290284e-08, "logits/chosen": -0.005720620043575764, "logits/rejected": -0.09044505655765533, "logps/chosen": -550.0, "logps/rejected": -727.3507690429688, "loss": 0.3221, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.1374850273132324, "rewards/margins": 2.060269355773926, "rewards/rejected": -4.197754859924316, "step": 870 }, { "epoch": 1.84, "learning_rate": 1.824189659787284e-08, "logits/chosen": -0.07426755130290985, "logits/rejected": -0.09251859039068222, "logps/chosen": -551.6723022460938, "logps/rejected": -753.9895629882812, "loss": 0.3162, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.1559104919433594, "rewards/margins": 2.1075100898742676, "rewards/rejected": -4.263420581817627, "step": 880 }, { "epoch": 1.86, "learning_rate": 1.3665854824458035e-08, "logits/chosen": -0.05439913272857666, "logits/rejected": -0.06511974334716797, "logps/chosen": -540.4793090820312, "logps/rejected": -690.4094848632812, "loss": 0.3265, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1754844188690186, "rewards/margins": 1.7633205652236938, "rewards/rejected": -3.938805103302002, "step": 890 }, { "epoch": 1.88, "learning_rate": 9.741758728888217e-09, "logits/chosen": -0.06001782417297363, "logits/rejected": -0.016363339498639107, "logps/chosen": -546.7725830078125, "logps/rejected": -690.0701293945312, "loss": 0.307, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.229912281036377, "rewards/margins": 1.9347511529922485, "rewards/rejected": -4.164663314819336, "step": 900 }, { "epoch": 1.88, "eval_logits/chosen": 0.06628188490867615, "eval_logits/rejected": 0.02886618673801422, "eval_logps/chosen": -549.4910278320312, "eval_logps/rejected": -703.8400268554688, "eval_loss": 0.5260834097862244, "eval_rewards/accuracies": 0.77734375, "eval_rewards/chosen": -2.4590635299682617, "eval_rewards/margins": 1.4630858898162842, "eval_rewards/rejected": -3.922149658203125, "eval_runtime": 65.2404, "eval_samples_per_second": 30.656, "eval_steps_per_second": 0.49, "step": 900 }, { "epoch": 1.9, "learning_rate": 6.474868681043577e-09, "logits/chosen": -0.0006331875920295715, "logits/rejected": 0.06078674644231796, "logps/chosen": -526.890869140625, "logps/rejected": -696.57080078125, "loss": 0.3133, "rewards/accuracies": 0.84375, "rewards/chosen": -2.2198734283447266, "rewards/margins": 1.9986340999603271, "rewards/rejected": -4.218507289886475, "step": 910 }, { "epoch": 1.93, "learning_rate": 3.869564046156459e-09, "logits/chosen": -0.17484715580940247, "logits/rejected": -0.2491791695356369, "logps/chosen": -560.9608154296875, "logps/rejected": -718.45703125, "loss": 0.3088, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.2493414878845215, "rewards/margins": 2.0530190467834473, "rewards/rejected": -4.3023600578308105, "step": 920 }, { "epoch": 1.95, "learning_rate": 1.929337314139412e-09, "logits/chosen": -0.0817941427230835, "logits/rejected": -0.2032664567232132, "logps/chosen": -562.2503662109375, "logps/rejected": -765.2064208984375, "loss": 0.3264, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.2634310722351074, "rewards/margins": 2.2729239463806152, "rewards/rejected": -4.536355018615723, "step": 930 }, { "epoch": 1.97, "learning_rate": 6.567894177967325e-10, "logits/chosen": -0.061837755143642426, "logits/rejected": -0.09424273669719696, "logps/chosen": -570.0264892578125, "logps/rejected": -782.4632568359375, "loss": 0.3155, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.259819984436035, "rewards/margins": 2.1893467903137207, "rewards/rejected": -4.449166297912598, "step": 940 }, { "epoch": 1.99, "learning_rate": 5.3626246194704575e-11, "logits/chosen": 0.019718965515494347, "logits/rejected": -0.05742845684289932, "logps/chosen": -491.19879150390625, "logps/rejected": -698.8577880859375, "loss": 0.3247, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.148132801055908, "rewards/margins": 2.056044816970825, "rewards/rejected": -4.204176902770996, "step": 950 }, { "epoch": 2.0, "step": 954, "total_flos": 0.0, "train_loss": 0.44779854000739333, "train_runtime": 8782.9823, "train_samples_per_second": 13.921, "train_steps_per_second": 0.109 } ], "logging_steps": 10, "max_steps": 954, "num_train_epochs": 2, "save_steps": 1000, "total_flos": 0.0, "trial_name": null, "trial_params": null }