{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999508116084604, "eval_steps": 200, "global_step": 1016, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.901960784313726e-08, "logits/chosen": -2.0737838745117188, "logits/rejected": -2.1456010341644287, "logps/chosen": -95.6572265625, "logps/rejected": -106.55765533447266, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 4.901960784313725e-07, "logits/chosen": -2.165830373764038, "logits/rejected": -2.060776948928833, "logps/chosen": -121.03773498535156, "logps/rejected": -87.5294189453125, "loss": 0.6918, "rewards/accuracies": 0.5, "rewards/chosen": 0.00047249632189050317, "rewards/margins": 0.002704059472307563, "rewards/rejected": -0.002231562975794077, "step": 10 }, { "epoch": 0.02, "learning_rate": 9.80392156862745e-07, "logits/chosen": -2.1349050998687744, "logits/rejected": -2.016066312789917, "logps/chosen": -130.94175720214844, "logps/rejected": -105.7674789428711, "loss": 0.6935, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0016262540593743324, "rewards/margins": -0.0006325626163743436, "rewards/rejected": -0.0009936915012076497, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.4705882352941177e-06, "logits/chosen": -2.2818872928619385, "logits/rejected": -2.1805636882781982, "logps/chosen": -121.03263854980469, "logps/rejected": -104.84712982177734, "loss": 0.6942, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.003329185303300619, "rewards/margins": -0.002142944373190403, "rewards/rejected": -0.0011862408136948943, "step": 30 }, { "epoch": 0.04, "learning_rate": 1.96078431372549e-06, "logits/chosen": -2.2757694721221924, "logits/rejected": -2.156691074371338, "logps/chosen": -126.5389633178711, "logps/rejected": -105.20024108886719, "loss": 0.6919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0010745985200628638, "rewards/margins": 0.002502765040844679, "rewards/rejected": -0.0014281660551205277, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.450980392156863e-06, "logits/chosen": -2.3002381324768066, "logits/rejected": -2.206784725189209, "logps/chosen": -124.18415832519531, "logps/rejected": -98.63652801513672, "loss": 0.6916, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0012203993974253535, "rewards/margins": 0.003048995044082403, "rewards/rejected": -0.0018285956466570497, "step": 50 }, { "epoch": 0.06, "learning_rate": 2.9411764705882355e-06, "logits/chosen": -2.3112730979919434, "logits/rejected": -2.232532501220703, "logps/chosen": -126.93055725097656, "logps/rejected": -109.7610092163086, "loss": 0.6914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01018393412232399, "rewards/margins": 0.0034624538384377956, "rewards/rejected": 0.006721480283886194, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.431372549019608e-06, "logits/chosen": -2.2930376529693604, "logits/rejected": -2.187328815460205, "logps/chosen": -119.68111419677734, "logps/rejected": -93.93470001220703, "loss": 0.6912, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.007645626552402973, "rewards/margins": 0.003846182022243738, "rewards/rejected": 0.0037994447629898787, "step": 70 }, { "epoch": 0.08, "learning_rate": 3.92156862745098e-06, "logits/chosen": -2.144854784011841, "logits/rejected": -2.016728401184082, "logps/chosen": -132.2415313720703, "logps/rejected": -106.7207260131836, "loss": 0.6902, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.015773242339491844, "rewards/margins": 0.006012483034282923, "rewards/rejected": 0.009760759770870209, "step": 80 }, { "epoch": 0.09, "learning_rate": 4.411764705882353e-06, "logits/chosen": -2.095534324645996, "logits/rejected": -1.9636192321777344, "logps/chosen": -106.81976318359375, "logps/rejected": -83.68408966064453, "loss": 0.6887, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.02147563174366951, "rewards/margins": 0.009103062562644482, "rewards/rejected": 0.012372570112347603, "step": 90 }, { "epoch": 0.1, "learning_rate": 4.901960784313726e-06, "logits/chosen": -2.0606656074523926, "logits/rejected": -1.87876296043396, "logps/chosen": -141.498779296875, "logps/rejected": -105.37713623046875, "loss": 0.6811, "rewards/accuracies": 0.75, "rewards/chosen": 0.04738330841064453, "rewards/margins": 0.024673232808709145, "rewards/rejected": 0.022710075601935387, "step": 100 }, { "epoch": 0.11, "learning_rate": 4.9990549169459415e-06, "logits/chosen": -2.2754921913146973, "logits/rejected": -2.135282516479492, "logps/chosen": -124.19816589355469, "logps/rejected": -98.95077514648438, "loss": 0.6777, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.056061066687107086, "rewards/margins": 0.03164363652467728, "rewards/rejected": 0.02441743016242981, "step": 110 }, { "epoch": 0.12, "learning_rate": 4.995216741642263e-06, "logits/chosen": -2.280646324157715, "logits/rejected": -2.206347942352295, "logps/chosen": -115.8212890625, "logps/rejected": -96.82814025878906, "loss": 0.6805, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.07867949455976486, "rewards/margins": 0.02664627507328987, "rewards/rejected": 0.05203322693705559, "step": 120 }, { "epoch": 0.13, "learning_rate": 4.988430936991089e-06, "logits/chosen": -2.2835030555725098, "logits/rejected": -2.140094041824341, "logps/chosen": -127.94944763183594, "logps/rejected": -101.47581481933594, "loss": 0.6635, "rewards/accuracies": 0.75, "rewards/chosen": 0.10174749791622162, "rewards/margins": 0.06262228637933731, "rewards/rejected": 0.039125215262174606, "step": 130 }, { "epoch": 0.14, "learning_rate": 4.978705519144525e-06, "logits/chosen": -2.18971586227417, "logits/rejected": -2.0279135704040527, "logps/chosen": -140.58509826660156, "logps/rejected": -104.29924011230469, "loss": 0.664, "rewards/accuracies": 0.75, "rewards/chosen": 0.07688155025243759, "rewards/margins": 0.061921559274196625, "rewards/rejected": 0.014959996566176414, "step": 140 }, { "epoch": 0.15, "learning_rate": 4.966051976854862e-06, "logits/chosen": -2.333808183670044, "logits/rejected": -2.2144458293914795, "logps/chosen": -111.2592544555664, "logps/rejected": -89.93299865722656, "loss": 0.6526, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.11892716586589813, "rewards/margins": 0.08827908337116241, "rewards/rejected": 0.030648082494735718, "step": 150 }, { "epoch": 0.16, "learning_rate": 4.950485257902782e-06, "logits/chosen": -2.209681749343872, "logits/rejected": -2.1354687213897705, "logps/chosen": -122.34986877441406, "logps/rejected": -97.61170959472656, "loss": 0.6622, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.09915992617607117, "rewards/margins": 0.06800667941570282, "rewards/rejected": 0.031153246760368347, "step": 160 }, { "epoch": 0.17, "learning_rate": 4.932023751439358e-06, "logits/chosen": -2.276695489883423, "logits/rejected": -2.118220329284668, "logps/chosen": -131.77114868164062, "logps/rejected": -103.4157943725586, "loss": 0.6666, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10167907178401947, "rewards/margins": 0.06214705854654312, "rewards/rejected": 0.039532024413347244, "step": 170 }, { "epoch": 0.18, "learning_rate": 4.9106892662627395e-06, "logits/chosen": -2.347627878189087, "logits/rejected": -2.223806858062744, "logps/chosen": -125.82316589355469, "logps/rejected": -102.32342529296875, "loss": 0.6605, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.13379618525505066, "rewards/margins": 0.07378261536359787, "rewards/rejected": 0.06001356244087219, "step": 180 }, { "epoch": 0.19, "learning_rate": 4.886507005055149e-06, "logits/chosen": -2.299999713897705, "logits/rejected": -2.124567985534668, "logps/chosen": -135.9125518798828, "logps/rejected": -102.34548950195312, "loss": 0.6471, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.12664374709129333, "rewards/margins": 0.10224989801645279, "rewards/rejected": 0.024393849074840546, "step": 190 }, { "epoch": 0.2, "learning_rate": 4.859505534610658e-06, "logits/chosen": -2.2595176696777344, "logits/rejected": -2.167226552963257, "logps/chosen": -115.63578033447266, "logps/rejected": -97.40113830566406, "loss": 0.652, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.07887722551822662, "rewards/margins": 0.09930779039859772, "rewards/rejected": -0.020430563017725945, "step": 200 }, { "epoch": 0.2, "eval_logits/chosen": -2.1231298446655273, "eval_logits/rejected": -2.0235936641693115, "eval_logps/chosen": -120.2629165649414, "eval_logps/rejected": -98.29754638671875, "eval_loss": 0.6594940423965454, "eval_rewards/accuracies": 0.6415094137191772, "eval_rewards/chosen": 0.04982735216617584, "eval_rewards/margins": 0.08182442933320999, "eval_rewards/rejected": -0.031997084617614746, "eval_runtime": 417.8564, "eval_samples_per_second": 1.0, "eval_steps_per_second": 0.127, "step": 200 }, { "epoch": 0.21, "learning_rate": 4.829716752088893e-06, "logits/chosen": -2.202777147293091, "logits/rejected": -2.1421408653259277, "logps/chosen": -107.44587707519531, "logps/rejected": -101.29158020019531, "loss": 0.6594, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03508143126964569, "rewards/margins": 0.07955195009708405, "rewards/rejected": -0.04447052255272865, "step": 210 }, { "epoch": 0.22, "learning_rate": 4.797175847334535e-06, "logits/chosen": -2.217074155807495, "logits/rejected": -2.0960960388183594, "logps/chosen": -130.84152221679688, "logps/rejected": -107.61418151855469, "loss": 0.6708, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0030643828213214874, "rewards/margins": 0.05633828788995743, "rewards/rejected": -0.05327390506863594, "step": 220 }, { "epoch": 0.23, "learning_rate": 4.761921261307143e-06, "logits/chosen": -2.2311947345733643, "logits/rejected": -2.0887582302093506, "logps/chosen": -123.64229583740234, "logps/rejected": -103.31034851074219, "loss": 0.655, "rewards/accuracies": 0.6875, "rewards/chosen": -0.009160916320979595, "rewards/margins": 0.09632667899131775, "rewards/rejected": -0.10548758506774902, "step": 230 }, { "epoch": 0.24, "learning_rate": 4.723994640670377e-06, "logits/chosen": -2.2253684997558594, "logits/rejected": -2.0454909801483154, "logps/chosen": -137.86196899414062, "logps/rejected": -104.11041259765625, "loss": 0.6146, "rewards/accuracies": 0.8125, "rewards/chosen": -0.011105505749583244, "rewards/margins": 0.1803070306777954, "rewards/rejected": -0.1914125233888626, "step": 240 }, { "epoch": 0.25, "learning_rate": 4.68344078859431e-06, "logits/chosen": -2.1830849647521973, "logits/rejected": -2.1306838989257812, "logps/chosen": -119.04063415527344, "logps/rejected": -113.45658111572266, "loss": 0.6407, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10984460264444351, "rewards/margins": 0.1269720494747162, "rewards/rejected": -0.2368166148662567, "step": 250 }, { "epoch": 0.26, "learning_rate": 4.6403076118289006e-06, "logits/chosen": -2.151690721511841, "logits/rejected": -2.0012199878692627, "logps/chosen": -135.11477661132812, "logps/rejected": -110.16157531738281, "loss": 0.6217, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.162130668759346, "rewards/margins": 0.17287474870681763, "rewards/rejected": -0.33500543236732483, "step": 260 }, { "epoch": 0.27, "learning_rate": 4.5946460641111776e-06, "logits/chosen": -2.3390426635742188, "logits/rejected": -2.1155288219451904, "logps/chosen": -132.8888397216797, "logps/rejected": -110.8713607788086, "loss": 0.5927, "rewards/accuracies": 0.75, "rewards/chosen": -0.19292452931404114, "rewards/margins": 0.24748054146766663, "rewards/rejected": -0.44040507078170776, "step": 270 }, { "epoch": 0.28, "learning_rate": 4.546510085972983e-06, "logits/chosen": -2.3495311737060547, "logits/rejected": -2.2263104915618896, "logps/chosen": -143.91481018066406, "logps/rejected": -119.75496673583984, "loss": 0.6171, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.20113949477672577, "rewards/margins": 0.19329218566417694, "rewards/rejected": -0.3944316804409027, "step": 280 }, { "epoch": 0.29, "learning_rate": 4.495956541020376e-06, "logits/chosen": -2.3487744331359863, "logits/rejected": -2.1971921920776367, "logps/chosen": -150.26535034179688, "logps/rejected": -133.6148681640625, "loss": 0.6038, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.36308565735816956, "rewards/margins": 0.2375185787677765, "rewards/rejected": -0.6006041765213013, "step": 290 }, { "epoch": 0.3, "learning_rate": 4.443045148759978e-06, "logits/chosen": -2.3486955165863037, "logits/rejected": -2.1962506771087646, "logps/chosen": -159.32064819335938, "logps/rejected": -126.43644714355469, "loss": 0.5853, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.44245219230651855, "rewards/margins": 0.2853606343269348, "rewards/rejected": -0.7278127670288086, "step": 300 }, { "epoch": 0.3, "learning_rate": 4.3878384140516025e-06, "logits/chosen": -2.3736767768859863, "logits/rejected": -2.249598741531372, "logps/chosen": -146.68006896972656, "logps/rejected": -135.25997924804688, "loss": 0.5898, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.41068369150161743, "rewards/margins": 0.26723065972328186, "rewards/rejected": -0.6779143214225769, "step": 310 }, { "epoch": 0.31, "learning_rate": 4.330401553270522e-06, "logits/chosen": -2.298905372619629, "logits/rejected": -2.1949081420898438, "logps/chosen": -146.1105194091797, "logps/rejected": -136.11727905273438, "loss": 0.6003, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.551005482673645, "rewards/margins": 0.2657201588153839, "rewards/rejected": -0.8167255520820618, "step": 320 }, { "epoch": 0.32, "learning_rate": 4.2708024172665795e-06, "logits/chosen": -2.402360439300537, "logits/rejected": -2.2175159454345703, "logps/chosen": -150.5807342529297, "logps/rejected": -125.05631256103516, "loss": 0.5465, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7037911415100098, "rewards/margins": 0.3860488533973694, "rewards/rejected": -1.089840054512024, "step": 330 }, { "epoch": 0.33, "learning_rate": 4.209111411211174e-06, "logits/chosen": -2.413839340209961, "logits/rejected": -2.253542423248291, "logps/chosen": -142.4720001220703, "logps/rejected": -127.50254821777344, "loss": 0.5395, "rewards/accuracies": 0.75, "rewards/chosen": -0.7327712178230286, "rewards/margins": 0.4077509045600891, "rewards/rejected": -1.1405221223831177, "step": 340 }, { "epoch": 0.34, "learning_rate": 4.145401411426788e-06, "logits/chosen": -2.4574408531188965, "logits/rejected": -2.3272013664245605, "logps/chosen": -143.1080322265625, "logps/rejected": -130.01637268066406, "loss": 0.5795, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7288345098495483, "rewards/margins": 0.36709967255592346, "rewards/rejected": -1.0959341526031494, "step": 350 }, { "epoch": 0.35, "learning_rate": 4.079747679297314e-06, "logits/chosen": -2.295055627822876, "logits/rejected": -2.179664134979248, "logps/chosen": -157.7494354248047, "logps/rejected": -137.73941040039062, "loss": 0.5973, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8279803395271301, "rewards/margins": 0.3018009066581726, "rewards/rejected": -1.1297812461853027, "step": 360 }, { "epoch": 0.36, "learning_rate": 4.012227772360889e-06, "logits/chosen": -2.2948107719421387, "logits/rejected": -2.1263232231140137, "logps/chosen": -167.21339416503906, "logps/rejected": -150.82669067382812, "loss": 0.5349, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9286147356033325, "rewards/margins": 0.44435009360313416, "rewards/rejected": -1.3729647397994995, "step": 370 }, { "epoch": 0.37, "learning_rate": 3.942921452690245e-06, "logits/chosen": -2.3513195514678955, "logits/rejected": -2.25927734375, "logps/chosen": -173.61428833007812, "logps/rejected": -163.9345703125, "loss": 0.6164, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3186113834381104, "rewards/margins": 0.24015231430530548, "rewards/rejected": -1.558763861656189, "step": 380 }, { "epoch": 0.38, "learning_rate": 3.871910592668817e-06, "logits/chosen": -2.433640480041504, "logits/rejected": -2.300931215286255, "logps/chosen": -170.8921356201172, "logps/rejected": -165.83462524414062, "loss": 0.503, "rewards/accuracies": 0.8125, "rewards/chosen": -1.420196294784546, "rewards/margins": 0.5389910340309143, "rewards/rejected": -1.9591872692108154, "step": 390 }, { "epoch": 0.39, "learning_rate": 3.799279078273921e-06, "logits/chosen": -2.3466134071350098, "logits/rejected": -2.157196044921875, "logps/chosen": -177.853271484375, "logps/rejected": -158.73776245117188, "loss": 0.4905, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5086350440979004, "rewards/margins": 0.6321147680282593, "rewards/rejected": -2.140749454498291, "step": 400 }, { "epoch": 0.39, "eval_logits/chosen": -2.195096015930176, "eval_logits/rejected": -2.0950398445129395, "eval_logps/chosen": -177.19459533691406, "eval_logps/rejected": -168.9884490966797, "eval_loss": 0.5550708174705505, "eval_rewards/accuracies": 0.6957547068595886, "eval_rewards/chosen": -1.6581227779388428, "eval_rewards/margins": 0.4946018159389496, "eval_rewards/rejected": -2.152724504470825, "eval_runtime": 417.866, "eval_samples_per_second": 1.0, "eval_steps_per_second": 0.127, "step": 400 }, { "epoch": 0.4, "learning_rate": 3.725112709981249e-06, "logits/chosen": -2.20538592338562, "logits/rejected": -2.059528112411499, "logps/chosen": -192.91726684570312, "logps/rejected": -185.90341186523438, "loss": 0.5449, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8029924631118774, "rewards/margins": 0.5775827765464783, "rewards/rejected": -2.38057541847229, "step": 410 }, { "epoch": 0.41, "learning_rate": 3.649499101407737e-06, "logits/chosen": -2.363370895385742, "logits/rejected": -2.2048511505126953, "logps/chosen": -224.8810577392578, "logps/rejected": -214.8511505126953, "loss": 0.5825, "rewards/accuracies": 0.6875, "rewards/chosen": -2.7866973876953125, "rewards/margins": 0.5168424844741821, "rewards/rejected": -3.303539991378784, "step": 420 }, { "epoch": 0.42, "learning_rate": 3.5725275758125564e-06, "logits/chosen": -2.27677059173584, "logits/rejected": -2.1246485710144043, "logps/chosen": -200.77066040039062, "logps/rejected": -190.68533325195312, "loss": 0.4805, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6391139030456543, "rewards/margins": 0.7239618897438049, "rewards/rejected": -3.3630757331848145, "step": 430 }, { "epoch": 0.43, "learning_rate": 3.494289060578478e-06, "logits/chosen": -2.3822944164276123, "logits/rejected": -2.2915594577789307, "logps/chosen": -169.66128540039062, "logps/rejected": -174.19607543945312, "loss": 0.4812, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6458194255828857, "rewards/margins": 0.7625138163566589, "rewards/rejected": -2.4083335399627686, "step": 440 }, { "epoch": 0.44, "learning_rate": 3.414875979798272e-06, "logits/chosen": -2.327730178833008, "logits/rejected": -2.2062036991119385, "logps/chosen": -194.1064453125, "logps/rejected": -193.63107299804688, "loss": 0.4394, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.4300081729888916, "rewards/margins": 0.892192006111145, "rewards/rejected": -2.322200059890747, "step": 450 }, { "epoch": 0.45, "learning_rate": 3.3343821450930196e-06, "logits/chosen": -2.454336166381836, "logits/rejected": -2.2937960624694824, "logps/chosen": -201.33657836914062, "logps/rejected": -188.39266967773438, "loss": 0.4758, "rewards/accuracies": 0.75, "rewards/chosen": -2.0642800331115723, "rewards/margins": 0.8783187866210938, "rewards/rejected": -2.942598819732666, "step": 460 }, { "epoch": 0.46, "learning_rate": 3.252902644791325e-06, "logits/chosen": -2.188424587249756, "logits/rejected": -2.041105031967163, "logps/chosen": -214.7936248779297, "logps/rejected": -222.3824462890625, "loss": 0.4536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.7383971214294434, "rewards/margins": 0.9385073781013489, "rewards/rejected": -3.6769042015075684, "step": 470 }, { "epoch": 0.47, "learning_rate": 3.170533731600339e-06, "logits/chosen": -2.3067922592163086, "logits/rejected": -2.1540229320526123, "logps/chosen": -204.9595184326172, "logps/rejected": -204.6565704345703, "loss": 0.4887, "rewards/accuracies": 0.75, "rewards/chosen": -2.3199477195739746, "rewards/margins": 0.8953973650932312, "rewards/rejected": -3.2153449058532715, "step": 480 }, { "epoch": 0.48, "learning_rate": 3.0873727089012816e-06, "logits/chosen": -2.4507811069488525, "logits/rejected": -2.357919931411743, "logps/chosen": -212.8636932373047, "logps/rejected": -208.5207977294922, "loss": 0.5393, "rewards/accuracies": 0.75, "rewards/chosen": -2.4198460578918457, "rewards/margins": 0.7457529306411743, "rewards/rejected": -3.1655986309051514, "step": 490 }, { "epoch": 0.49, "learning_rate": 3.0035178158038026e-06, "logits/chosen": -2.2298638820648193, "logits/rejected": -2.035947322845459, "logps/chosen": -217.26846313476562, "logps/rejected": -209.74526977539062, "loss": 0.4886, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4076380729675293, "rewards/margins": 0.9488040208816528, "rewards/rejected": -3.3564422130584717, "step": 500 }, { "epoch": 0.5, "learning_rate": 2.919068111094937e-06, "logits/chosen": -2.3187146186828613, "logits/rejected": -2.193861484527588, "logps/chosen": -185.0901641845703, "logps/rejected": -195.90634155273438, "loss": 0.552, "rewards/accuracies": 0.6875, "rewards/chosen": -2.132998466491699, "rewards/margins": 0.827830970287323, "rewards/rejected": -2.960829257965088, "step": 510 }, { "epoch": 0.51, "learning_rate": 2.8341233562197895e-06, "logits/chosen": -2.3116376399993896, "logits/rejected": -2.246950626373291, "logps/chosen": -172.0282440185547, "logps/rejected": -177.47348022460938, "loss": 0.5152, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7447055578231812, "rewards/margins": 0.754051685333252, "rewards/rejected": -2.4987568855285645, "step": 520 }, { "epoch": 0.52, "learning_rate": 2.7487838974321352e-06, "logits/chosen": -2.2577805519104004, "logits/rejected": -2.143658399581909, "logps/chosen": -178.53317260742188, "logps/rejected": -182.5507049560547, "loss": 0.4719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.789046287536621, "rewards/margins": 0.8771921396255493, "rewards/rejected": -2.666238307952881, "step": 530 }, { "epoch": 0.53, "learning_rate": 2.6631505472541997e-06, "logits/chosen": -2.2621216773986816, "logits/rejected": -2.1167335510253906, "logps/chosen": -193.0259246826172, "logps/rejected": -198.51705932617188, "loss": 0.4442, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8945804834365845, "rewards/margins": 1.0066120624542236, "rewards/rejected": -2.9011926651000977, "step": 540 }, { "epoch": 0.54, "learning_rate": 2.5773244653856173e-06, "logits/chosen": -2.2354609966278076, "logits/rejected": -2.1045310497283936, "logps/chosen": -205.27163696289062, "logps/rejected": -207.8455810546875, "loss": 0.4791, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.161865711212158, "rewards/margins": 0.9317368268966675, "rewards/rejected": -3.0936026573181152, "step": 550 }, { "epoch": 0.55, "learning_rate": 2.4914070392022717e-06, "logits/chosen": -2.274534225463867, "logits/rejected": -2.158811569213867, "logps/chosen": -208.08349609375, "logps/rejected": -211.2762451171875, "loss": 0.4978, "rewards/accuracies": 0.75, "rewards/chosen": -2.0876078605651855, "rewards/margins": 0.8888559341430664, "rewards/rejected": -2.976463794708252, "step": 560 }, { "epoch": 0.56, "learning_rate": 2.4054997639861778e-06, "logits/chosen": -2.1874241828918457, "logits/rejected": -2.0126781463623047, "logps/chosen": -207.1369171142578, "logps/rejected": -207.59768676757812, "loss": 0.4732, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0929489135742188, "rewards/margins": 1.14609694480896, "rewards/rejected": -3.2390456199645996, "step": 570 }, { "epoch": 0.57, "learning_rate": 2.3197041230278905e-06, "logits/chosen": -2.3066487312316895, "logits/rejected": -2.18741512298584, "logps/chosen": -204.58642578125, "logps/rejected": -222.44973754882812, "loss": 0.427, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2493700981140137, "rewards/margins": 1.2541824579238892, "rewards/rejected": -3.5035526752471924, "step": 580 }, { "epoch": 0.58, "learning_rate": 2.234121467743082e-06, "logits/chosen": -2.3349661827087402, "logits/rejected": -2.252894163131714, "logps/chosen": -208.3444061279297, "logps/rejected": -211.37295532226562, "loss": 0.5968, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.635042905807495, "rewards/margins": 0.8392454981803894, "rewards/rejected": -3.4742884635925293, "step": 590 }, { "epoch": 0.59, "learning_rate": 2.148852897944905e-06, "logits/chosen": -2.3977198600769043, "logits/rejected": -2.2428812980651855, "logps/chosen": -208.42538452148438, "logps/rejected": -219.8905487060547, "loss": 0.4249, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.645089626312256, "rewards/margins": 1.1129385232925415, "rewards/rejected": -3.758028030395508, "step": 600 }, { "epoch": 0.59, "eval_logits/chosen": -2.17734956741333, "eval_logits/rejected": -2.078249454498291, "eval_logps/chosen": -237.10446166992188, "eval_logps/rejected": -241.38665771484375, "eval_loss": 0.5327094793319702, "eval_rewards/accuracies": 0.724056601524353, "eval_rewards/chosen": -3.455418348312378, "eval_rewards/margins": 0.8692519068717957, "eval_rewards/rejected": -4.324670314788818, "eval_runtime": 423.7708, "eval_samples_per_second": 0.986, "eval_steps_per_second": 0.125, "step": 600 }, { "epoch": 0.6, "learning_rate": 2.063999142413574e-06, "logits/chosen": -2.2915334701538086, "logits/rejected": -2.1494946479797363, "logps/chosen": -247.0482177734375, "logps/rejected": -239.52462768554688, "loss": 0.489, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.5384364128112793, "rewards/margins": 0.9590060114860535, "rewards/rejected": -4.497443199157715, "step": 610 }, { "epoch": 0.61, "learning_rate": 1.9796604399042547e-06, "logits/chosen": -2.3757712841033936, "logits/rejected": -2.2203211784362793, "logps/chosen": -268.9779052734375, "logps/rejected": -274.21771240234375, "loss": 0.4595, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.319927215576172, "rewards/margins": 1.1538012027740479, "rewards/rejected": -5.473728179931641, "step": 620 }, { "epoch": 0.62, "learning_rate": 1.8959364207338216e-06, "logits/chosen": -2.345416784286499, "logits/rejected": -2.180387020111084, "logps/chosen": -245.0520477294922, "logps/rejected": -258.79498291015625, "loss": 0.4835, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.156452655792236, "rewards/margins": 1.1458499431610107, "rewards/rejected": -5.302302360534668, "step": 630 }, { "epoch": 0.63, "learning_rate": 1.8129259890863825e-06, "logits/chosen": -2.3113701343536377, "logits/rejected": -2.200329303741455, "logps/chosen": -258.2785949707031, "logps/rejected": -273.8367614746094, "loss": 0.5494, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.8280506134033203, "rewards/margins": 1.0454813241958618, "rewards/rejected": -4.873531818389893, "step": 640 }, { "epoch": 0.64, "learning_rate": 1.7307272061765738e-06, "logits/chosen": -2.332291841506958, "logits/rejected": -2.238374948501587, "logps/chosen": -245.2693328857422, "logps/rejected": -258.13922119140625, "loss": 0.4966, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.4873454570770264, "rewards/margins": 1.1437709331512451, "rewards/rejected": -4.6311163902282715, "step": 650 }, { "epoch": 0.65, "learning_rate": 1.649437174408685e-06, "logits/chosen": -2.2653393745422363, "logits/rejected": -2.1328892707824707, "logps/chosen": -225.3964385986328, "logps/rejected": -243.22384643554688, "loss": 0.5442, "rewards/accuracies": 0.75, "rewards/chosen": -2.7976624965667725, "rewards/margins": 1.2127068042755127, "rewards/rejected": -4.010369300842285, "step": 660 }, { "epoch": 0.66, "learning_rate": 1.569151922668422e-06, "logits/chosen": -2.367928981781006, "logits/rejected": -2.253605365753174, "logps/chosen": -201.37025451660156, "logps/rejected": -218.9671630859375, "loss": 0.5362, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.52730131149292, "rewards/margins": 1.1274640560150146, "rewards/rejected": -3.6547648906707764, "step": 670 }, { "epoch": 0.67, "learning_rate": 1.4899662928828428e-06, "logits/chosen": -2.309407949447632, "logits/rejected": -2.1246845722198486, "logps/chosen": -186.25376892089844, "logps/rejected": -205.3202667236328, "loss": 0.4534, "rewards/accuracies": 0.8125, "rewards/chosen": -2.198615550994873, "rewards/margins": 1.3432915210723877, "rewards/rejected": -3.5419068336486816, "step": 680 }, { "epoch": 0.68, "learning_rate": 1.4119738279824507e-06, "logits/chosen": -2.233764886856079, "logits/rejected": -2.0911850929260254, "logps/chosen": -194.0051727294922, "logps/rejected": -202.19174194335938, "loss": 0.5816, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.272157907485962, "rewards/margins": 0.9171239137649536, "rewards/rejected": -3.189281463623047, "step": 690 }, { "epoch": 0.69, "learning_rate": 1.3352666613978152e-06, "logits/chosen": -2.1566410064697266, "logits/rejected": -2.075209379196167, "logps/chosen": -198.47071838378906, "logps/rejected": -202.29379272460938, "loss": 0.5355, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.325533151626587, "rewards/margins": 0.8646724820137024, "rewards/rejected": -3.1902058124542236, "step": 700 }, { "epoch": 0.7, "learning_rate": 1.2599354082212523e-06, "logits/chosen": -2.222766876220703, "logits/rejected": -2.0928432941436768, "logps/chosen": -190.0797119140625, "logps/rejected": -198.69985961914062, "loss": 0.5152, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1240830421447754, "rewards/margins": 0.9723888635635376, "rewards/rejected": -3.0964715480804443, "step": 710 }, { "epoch": 0.71, "learning_rate": 1.186069058162127e-06, "logits/chosen": -2.3559908866882324, "logits/rejected": -2.249436140060425, "logps/chosen": -197.04818725585938, "logps/rejected": -201.38693237304688, "loss": 0.5537, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.438502788543701, "rewards/margins": 0.8104500770568848, "rewards/rejected": -3.248952865600586, "step": 720 }, { "epoch": 0.72, "learning_rate": 1.113754870422254e-06, "logits/chosen": -2.40106201171875, "logits/rejected": -2.2911598682403564, "logps/chosen": -198.04409790039062, "logps/rejected": -214.9007568359375, "loss": 0.5454, "rewards/accuracies": 0.75, "rewards/chosen": -2.3586318492889404, "rewards/margins": 0.8204809427261353, "rewards/rejected": -3.179112672805786, "step": 730 }, { "epoch": 0.73, "learning_rate": 1.0430782706155545e-06, "logits/chosen": -2.4164676666259766, "logits/rejected": -2.2673213481903076, "logps/chosen": -204.253173828125, "logps/rejected": -209.43753051757812, "loss": 0.45, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2196271419525146, "rewards/margins": 1.149065613746643, "rewards/rejected": -3.3686928749084473, "step": 740 }, { "epoch": 0.74, "learning_rate": 9.741227498537615e-07, "logits/chosen": -2.457432270050049, "logits/rejected": -2.321898937225342, "logps/chosen": -194.0103302001953, "logps/rejected": -204.46145629882812, "loss": 0.4596, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.4977972507476807, "rewards/margins": 1.0296275615692139, "rewards/rejected": -3.5274243354797363, "step": 750 }, { "epoch": 0.75, "learning_rate": 9.069697661173668e-07, "logits/chosen": -2.3087573051452637, "logits/rejected": -2.210942268371582, "logps/chosen": -205.71200561523438, "logps/rejected": -217.53921508789062, "loss": 0.6425, "rewards/accuracies": 0.6875, "rewards/chosen": -2.667379379272461, "rewards/margins": 0.8116022348403931, "rewards/rejected": -3.4789810180664062, "step": 760 }, { "epoch": 0.76, "learning_rate": 8.416986480283434e-07, "logits/chosen": -2.2393643856048584, "logits/rejected": -2.096648693084717, "logps/chosen": -205.7020721435547, "logps/rejected": -220.5606689453125, "loss": 0.5619, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.711803913116455, "rewards/margins": 1.0723652839660645, "rewards/rejected": -3.7841694355010986, "step": 770 }, { "epoch": 0.77, "learning_rate": 7.783865011382876e-07, "logits/chosen": -2.2971372604370117, "logits/rejected": -2.1558918952941895, "logps/chosen": -213.41958618164062, "logps/rejected": -214.00326538085938, "loss": 0.5043, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.363002300262451, "rewards/margins": 1.088847279548645, "rewards/rejected": -3.4518496990203857, "step": 780 }, { "epoch": 0.78, "learning_rate": 7.171081168427205e-07, "logits/chosen": -2.331343173980713, "logits/rejected": -2.165480613708496, "logps/chosen": -221.4240264892578, "logps/rejected": -226.91650390625, "loss": 0.473, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2688238620758057, "rewards/margins": 1.2109057903289795, "rewards/rejected": -3.479729413986206, "step": 790 }, { "epoch": 0.79, "learning_rate": 6.579358840291064e-07, "logits/chosen": -2.2774598598480225, "logits/rejected": -2.1744918823242188, "logps/chosen": -209.7314453125, "logps/rejected": -224.2617645263672, "loss": 0.5858, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.5639281272888184, "rewards/margins": 0.7768001556396484, "rewards/rejected": -3.340728282928467, "step": 800 }, { "epoch": 0.79, "eval_logits/chosen": -2.159109354019165, "eval_logits/rejected": -2.058640241622925, "eval_logps/chosen": -205.4982452392578, "eval_logps/rejected": -212.2717742919922, "eval_loss": 0.5206710696220398, "eval_rewards/accuracies": 0.7334905862808228, "eval_rewards/chosen": -2.507232666015625, "eval_rewards/margins": 0.9439911842346191, "eval_rewards/rejected": -3.4512243270874023, "eval_runtime": 423.3516, "eval_samples_per_second": 0.987, "eval_steps_per_second": 0.125, "step": 800 }, { "epoch": 0.8, "learning_rate": 6.00939703563006e-07, "logits/chosen": -2.3462395668029785, "logits/rejected": -2.2609333992004395, "logps/chosen": -191.94070434570312, "logps/rejected": -208.1775665283203, "loss": 0.5793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.5009541511535645, "rewards/margins": 0.8846192359924316, "rewards/rejected": -3.385573625564575, "step": 810 }, { "epoch": 0.81, "learning_rate": 5.461869057133412e-07, "logits/chosen": -2.386518955230713, "logits/rejected": -2.2898330688476562, "logps/chosen": -198.1256866455078, "logps/rejected": -216.5961456298828, "loss": 0.4895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2799556255340576, "rewards/margins": 1.0353368520736694, "rewards/rejected": -3.3152928352355957, "step": 820 }, { "epoch": 0.82, "learning_rate": 4.937421706143497e-07, "logits/chosen": -2.2601094245910645, "logits/rejected": -2.1199710369110107, "logps/chosen": -208.54898071289062, "logps/rejected": -210.0823974609375, "loss": 0.5261, "rewards/accuracies": 0.75, "rewards/chosen": -2.4491305351257324, "rewards/margins": 0.9635556936264038, "rewards/rejected": -3.412686586380005, "step": 830 }, { "epoch": 0.83, "learning_rate": 4.43667451858166e-07, "logits/chosen": -2.262453079223633, "logits/rejected": -2.0967936515808105, "logps/chosen": -188.5206298828125, "logps/rejected": -202.87240600585938, "loss": 0.4361, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.2963976860046387, "rewards/margins": 1.180870771408081, "rewards/rejected": -3.4772682189941406, "step": 840 }, { "epoch": 0.84, "learning_rate": 3.9602190330830484e-07, "logits/chosen": -2.2342655658721924, "logits/rejected": -2.130017042160034, "logps/chosen": -200.30343627929688, "logps/rejected": -220.6678466796875, "loss": 0.4885, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.533165454864502, "rewards/margins": 1.0824253559112549, "rewards/rejected": -3.6155905723571777, "step": 850 }, { "epoch": 0.85, "learning_rate": 3.5086180922049295e-07, "logits/chosen": -2.3764655590057373, "logits/rejected": -2.235109329223633, "logps/chosen": -224.97146606445312, "logps/rejected": -221.2453155517578, "loss": 0.4675, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.4995951652526855, "rewards/margins": 1.0894619226455688, "rewards/rejected": -3.589057445526123, "step": 860 }, { "epoch": 0.86, "learning_rate": 3.0824051775340895e-07, "logits/chosen": -2.380509853363037, "logits/rejected": -2.275191068649292, "logps/chosen": -180.95895385742188, "logps/rejected": -195.82479858398438, "loss": 0.547, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3850598335266113, "rewards/margins": 0.8363308906555176, "rewards/rejected": -3.22139048576355, "step": 870 }, { "epoch": 0.87, "learning_rate": 2.6820837794786336e-07, "logits/chosen": -2.2321319580078125, "logits/rejected": -2.1590046882629395, "logps/chosen": -204.28173828125, "logps/rejected": -214.01327514648438, "loss": 0.6701, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.5836868286132812, "rewards/margins": 0.4722086787223816, "rewards/rejected": -3.0558953285217285, "step": 880 }, { "epoch": 0.88, "learning_rate": 2.3081268024887694e-07, "logits/chosen": -2.222381114959717, "logits/rejected": -2.051706075668335, "logps/chosen": -199.63970947265625, "logps/rejected": -208.906005859375, "loss": 0.4139, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3457841873168945, "rewards/margins": 1.2535767555236816, "rewards/rejected": -3.599360942840576, "step": 890 }, { "epoch": 0.89, "learning_rate": 1.9609760064091044e-07, "logits/chosen": -2.3242409229278564, "logits/rejected": -2.2560477256774902, "logps/chosen": -205.27554321289062, "logps/rejected": -201.7273712158203, "loss": 0.5173, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.428300142288208, "rewards/margins": 0.8561038970947266, "rewards/rejected": -3.2844040393829346, "step": 900 }, { "epoch": 0.9, "learning_rate": 1.6410414846224992e-07, "logits/chosen": -2.220360517501831, "logits/rejected": -2.109575033187866, "logps/chosen": -201.75045776367188, "logps/rejected": -214.3638458251953, "loss": 0.4565, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.569467067718506, "rewards/margins": 1.1193673610687256, "rewards/rejected": -3.6888339519500732, "step": 910 }, { "epoch": 0.91, "learning_rate": 1.348701179601819e-07, "logits/chosen": -2.401984691619873, "logits/rejected": -2.2650654315948486, "logps/chosen": -215.353515625, "logps/rejected": -225.1014404296875, "loss": 0.4529, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.38716983795166, "rewards/margins": 1.1682064533233643, "rewards/rejected": -3.5553765296936035, "step": 920 }, { "epoch": 0.91, "learning_rate": 1.0843004364420151e-07, "logits/chosen": -2.2123489379882812, "logits/rejected": -2.100048065185547, "logps/chosen": -215.25479125976562, "logps/rejected": -229.445556640625, "loss": 0.5697, "rewards/accuracies": 0.6875, "rewards/chosen": -2.393723964691162, "rewards/margins": 0.8360812067985535, "rewards/rejected": -3.2298049926757812, "step": 930 }, { "epoch": 0.92, "learning_rate": 8.481515948997931e-08, "logits/chosen": -2.3680367469787598, "logits/rejected": -2.26953387260437, "logps/chosen": -216.99514770507812, "logps/rejected": -212.17263793945312, "loss": 0.6119, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.478694200515747, "rewards/margins": 0.7769169807434082, "rewards/rejected": -3.255610942840576, "step": 940 }, { "epoch": 0.93, "learning_rate": 6.4053362042297e-08, "logits/chosen": -2.2625975608825684, "logits/rejected": -2.104025363922119, "logps/chosen": -205.9175262451172, "logps/rejected": -217.52304077148438, "loss": 0.4583, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.4613893032073975, "rewards/margins": 1.17227303981781, "rewards/rejected": -3.633662462234497, "step": 950 }, { "epoch": 0.94, "learning_rate": 4.616917746052163e-08, "logits/chosen": -2.3516554832458496, "logits/rejected": -2.214130401611328, "logps/chosen": -203.63438415527344, "logps/rejected": -211.27334594726562, "loss": 0.546, "rewards/accuracies": 0.75, "rewards/chosen": -2.5478570461273193, "rewards/margins": 0.990521252155304, "rewards/rejected": -3.5383784770965576, "step": 960 }, { "epoch": 0.95, "learning_rate": 3.118373254556412e-08, "logits/chosen": -2.3878164291381836, "logits/rejected": -2.2502613067626953, "logps/chosen": -198.12130737304688, "logps/rejected": -199.36111450195312, "loss": 0.4535, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.448812484741211, "rewards/margins": 1.0348026752471924, "rewards/rejected": -3.483614683151245, "step": 970 }, { "epoch": 0.96, "learning_rate": 1.9114729782535037e-08, "logits/chosen": -2.4108872413635254, "logits/rejected": -2.297142505645752, "logps/chosen": -194.57632446289062, "logps/rejected": -202.7489776611328, "loss": 0.4981, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.350036859512329, "rewards/margins": 0.9002410173416138, "rewards/rejected": -3.2502777576446533, "step": 980 }, { "epoch": 0.97, "learning_rate": 9.97642642858815e-09, "logits/chosen": -2.28024959564209, "logits/rejected": -2.139801502227783, "logps/chosen": -211.3317413330078, "logps/rejected": -211.1747283935547, "loss": 0.5021, "rewards/accuracies": 0.6875, "rewards/chosen": -2.55255126953125, "rewards/margins": 0.9248201251029968, "rewards/rejected": -3.4773712158203125, "step": 990 }, { "epoch": 0.98, "learning_rate": 3.779617670651436e-09, "logits/chosen": -2.248671054840088, "logits/rejected": -2.1480166912078857, "logps/chosen": -217.71707153320312, "logps/rejected": -225.20040893554688, "loss": 0.6128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.73987078666687, "rewards/margins": 0.9156008958816528, "rewards/rejected": -3.6554713249206543, "step": 1000 }, { "epoch": 0.98, "eval_logits/chosen": -2.1620335578918457, "eval_logits/rejected": -2.062356948852539, "eval_logps/chosen": -206.584716796875, "eval_logps/rejected": -214.93492126464844, "eval_loss": 0.521207869052887, "eval_rewards/accuracies": 0.7405660152435303, "eval_rewards/chosen": -2.5398268699645996, "eval_rewards/margins": 0.9912916421890259, "eval_rewards/rejected": -3.531118631362915, "eval_runtime": 423.4333, "eval_samples_per_second": 0.987, "eval_steps_per_second": 0.125, "step": 1000 }, { "epoch": 0.99, "learning_rate": 5.316238729444201e-10, "logits/chosen": -2.234221935272217, "logits/rejected": -2.0627448558807373, "logps/chosen": -209.44503784179688, "logps/rejected": -219.24606323242188, "loss": 0.4684, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.601386547088623, "rewards/margins": 1.190915584564209, "rewards/rejected": -3.792301893234253, "step": 1010 }, { "epoch": 1.0, "step": 1016, "total_flos": 0.0, "train_loss": 0.5583291621658746, "train_runtime": 16785.8838, "train_samples_per_second": 0.484, "train_steps_per_second": 0.061 } ], "logging_steps": 10, "max_steps": 1016, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }