{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9989534275248562, "eval_steps": 100, "global_step": 1910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.617801047120419e-09, "logits/chosen": -2.913743019104004, "logits/rejected": -2.700042247772217, "logps/chosen": -229.8895263671875, "logps/rejected": -215.29129028320312, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 2.6178010471204188e-08, "logits/chosen": -2.688786506652832, "logits/rejected": -2.711923599243164, "logps/chosen": -293.5041198730469, "logps/rejected": -247.4051055908203, "loss": 0.6936, "rewards/accuracies": 0.4930555522441864, "rewards/chosen": 0.001126001006923616, "rewards/margins": 7.740315049886703e-05, "rewards/rejected": 0.001048597856424749, "step": 10 }, { "epoch": 0.02, "learning_rate": 5.2356020942408376e-08, "logits/chosen": -2.6238837242126465, "logits/rejected": -2.633690118789673, "logps/chosen": -270.5980529785156, "logps/rejected": -243.71517944335938, "loss": 0.6925, "rewards/accuracies": 0.53125, "rewards/chosen": 0.005117292050272226, "rewards/margins": 0.004833548329770565, "rewards/rejected": 0.00028374380781315267, "step": 20 }, { "epoch": 0.03, "learning_rate": 7.853403141361257e-08, "logits/chosen": -2.717092990875244, "logits/rejected": -2.675096035003662, "logps/chosen": -277.98382568359375, "logps/rejected": -252.8087158203125, "loss": 0.6925, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.006060977932065725, "rewards/margins": 0.0060595860704779625, "rewards/rejected": 1.3906974345445633e-06, "step": 30 }, { "epoch": 0.04, "learning_rate": 1.0471204188481675e-07, "logits/chosen": -2.6683716773986816, "logits/rejected": -2.6545655727386475, "logps/chosen": -272.1450500488281, "logps/rejected": -234.2085418701172, "loss": 0.6845, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.016314741224050522, "rewards/margins": 0.025149071589112282, "rewards/rejected": -0.00883432850241661, "step": 40 }, { "epoch": 0.05, "learning_rate": 1.3089005235602092e-07, "logits/chosen": -2.7000627517700195, "logits/rejected": -2.662144184112549, "logps/chosen": -292.8883056640625, "logps/rejected": -269.4085998535156, "loss": 0.6758, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.016980037093162537, "rewards/margins": 0.03487912937998772, "rewards/rejected": -0.01789909228682518, "step": 50 }, { "epoch": 0.06, "learning_rate": 1.5706806282722514e-07, "logits/chosen": -2.6669087409973145, "logits/rejected": -2.668426275253296, "logps/chosen": -284.6460266113281, "logps/rejected": -270.30694580078125, "loss": 0.6629, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.045764319598674774, "rewards/margins": 0.07312445342540741, "rewards/rejected": -0.027360141277313232, "step": 60 }, { "epoch": 0.07, "learning_rate": 1.8324607329842932e-07, "logits/chosen": -2.7055583000183105, "logits/rejected": -2.7362632751464844, "logps/chosen": -311.878173828125, "logps/rejected": -288.1414489746094, "loss": 0.634, "rewards/accuracies": 0.65625, "rewards/chosen": 0.16289404034614563, "rewards/margins": 0.14815162122249603, "rewards/rejected": 0.014742432162165642, "step": 70 }, { "epoch": 0.08, "learning_rate": 2.094240837696335e-07, "logits/chosen": -2.59578013420105, "logits/rejected": -2.5442214012145996, "logps/chosen": -292.73455810546875, "logps/rejected": -259.7320251464844, "loss": 0.5995, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.21888795495033264, "rewards/margins": 0.2754802107810974, "rewards/rejected": -0.05659227445721626, "step": 80 }, { "epoch": 0.09, "learning_rate": 2.356020942408377e-07, "logits/chosen": -2.6324312686920166, "logits/rejected": -2.6091675758361816, "logps/chosen": -285.9070739746094, "logps/rejected": -265.3856506347656, "loss": 0.5926, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.3487551212310791, "rewards/margins": 0.3654690086841583, "rewards/rejected": -0.016713904216885567, "step": 90 }, { "epoch": 0.1, "learning_rate": 2.6178010471204185e-07, "logits/chosen": -2.621962547302246, "logits/rejected": -2.6166701316833496, "logps/chosen": -297.90020751953125, "logps/rejected": -290.8314208984375, "loss": 0.6075, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.25804010033607483, "rewards/margins": 0.30780670046806335, "rewards/rejected": -0.049766603857278824, "step": 100 }, { "epoch": 0.1, "eval_logits/chosen": -2.6650705337524414, "eval_logits/rejected": -2.632366180419922, "eval_logps/chosen": -278.720947265625, "eval_logps/rejected": -261.4175109863281, "eval_loss": 0.5945262312889099, "eval_rewards/accuracies": 0.716269850730896, "eval_rewards/chosen": 0.3241066038608551, "eval_rewards/margins": 0.44465819001197815, "eval_rewards/rejected": -0.12055157870054245, "eval_runtime": 226.7391, "eval_samples_per_second": 8.821, "eval_steps_per_second": 0.278, "step": 100 }, { "epoch": 0.12, "learning_rate": 2.879581151832461e-07, "logits/chosen": -2.634498119354248, "logits/rejected": -2.5716135501861572, "logps/chosen": -256.88555908203125, "logps/rejected": -220.08761596679688, "loss": 0.5846, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.15877611935138702, "rewards/margins": 0.4923480451107025, "rewards/rejected": -0.3335719108581543, "step": 110 }, { "epoch": 0.13, "learning_rate": 3.1413612565445027e-07, "logits/chosen": -2.5683364868164062, "logits/rejected": -2.520733118057251, "logps/chosen": -298.49615478515625, "logps/rejected": -249.5241241455078, "loss": 0.5936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22743339836597443, "rewards/margins": 0.5804780125617981, "rewards/rejected": -0.35304465889930725, "step": 120 }, { "epoch": 0.14, "learning_rate": 3.4031413612565446e-07, "logits/chosen": -2.683486223220825, "logits/rejected": -2.635319471359253, "logps/chosen": -314.8161926269531, "logps/rejected": -270.246337890625, "loss": 0.5675, "rewards/accuracies": 0.75, "rewards/chosen": 0.3807130455970764, "rewards/margins": 0.5722749829292297, "rewards/rejected": -0.1915619671344757, "step": 130 }, { "epoch": 0.15, "learning_rate": 3.6649214659685864e-07, "logits/chosen": -2.6005754470825195, "logits/rejected": -2.608137607574463, "logps/chosen": -232.632080078125, "logps/rejected": -225.4097137451172, "loss": 0.5458, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5643398761749268, "rewards/margins": 0.5881937146186829, "rewards/rejected": -0.023853814229369164, "step": 140 }, { "epoch": 0.16, "learning_rate": 3.926701570680628e-07, "logits/chosen": -2.604186773300171, "logits/rejected": -2.635862112045288, "logps/chosen": -269.14630126953125, "logps/rejected": -283.9921875, "loss": 0.5907, "rewards/accuracies": 0.65625, "rewards/chosen": 0.4123942255973816, "rewards/margins": 0.44325852394104004, "rewards/rejected": -0.030864257365465164, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.18848167539267e-07, "logits/chosen": -2.611471652984619, "logits/rejected": -2.556894302368164, "logps/chosen": -287.0811767578125, "logps/rejected": -288.70196533203125, "loss": 0.5136, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.5156514048576355, "rewards/margins": 0.6663922071456909, "rewards/rejected": -0.1507408320903778, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.450261780104712e-07, "logits/chosen": -2.5958797931671143, "logits/rejected": -2.6182973384857178, "logps/chosen": -270.68731689453125, "logps/rejected": -252.85885620117188, "loss": 0.5416, "rewards/accuracies": 0.6875, "rewards/chosen": 0.099888376891613, "rewards/margins": 0.7057486772537231, "rewards/rejected": -0.6058603525161743, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.712041884816754e-07, "logits/chosen": -2.519507884979248, "logits/rejected": -2.509925127029419, "logps/chosen": -245.8113250732422, "logps/rejected": -238.33609008789062, "loss": 0.5141, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.07970233261585236, "rewards/margins": 0.759169340133667, "rewards/rejected": -0.8388715982437134, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.973821989528796e-07, "logits/chosen": -2.642540693283081, "logits/rejected": -2.58392596244812, "logps/chosen": -276.4290466308594, "logps/rejected": -262.4585876464844, "loss": 0.498, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.18133386969566345, "rewards/margins": 0.8626937866210938, "rewards/rejected": -0.6813598871231079, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.999661831436498e-07, "logits/chosen": -2.561401844024658, "logits/rejected": -2.499178409576416, "logps/chosen": -283.63043212890625, "logps/rejected": -282.01190185546875, "loss": 0.5341, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.15141230821609497, "rewards/margins": 0.9977926015853882, "rewards/rejected": -0.846380352973938, "step": 200 }, { "epoch": 0.21, "eval_logits/chosen": -2.577924966812134, "eval_logits/rejected": -2.5393853187561035, "eval_logps/chosen": -282.6962585449219, "eval_logps/rejected": -270.31524658203125, "eval_loss": 0.5470743179321289, "eval_rewards/accuracies": 0.7638888955116272, "eval_rewards/chosen": -0.07342492789030075, "eval_rewards/margins": 0.9369009137153625, "eval_rewards/rejected": -1.0103257894515991, "eval_runtime": 226.533, "eval_samples_per_second": 8.829, "eval_steps_per_second": 0.278, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.998492971140339e-07, "logits/chosen": -2.567587375640869, "logits/rejected": -2.5276873111724854, "logps/chosen": -301.2264099121094, "logps/rejected": -257.6970520019531, "loss": 0.5422, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.15012404322624207, "rewards/margins": 0.7839210629463196, "rewards/rejected": -0.6337969899177551, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.996489634487865e-07, "logits/chosen": -2.5400662422180176, "logits/rejected": -2.5105788707733154, "logps/chosen": -280.18170166015625, "logps/rejected": -276.0943908691406, "loss": 0.5254, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.6297529339790344, "rewards/margins": 0.9011818766593933, "rewards/rejected": -0.27142906188964844, "step": 220 }, { "epoch": 0.24, "learning_rate": 4.993652490577246e-07, "logits/chosen": -2.5180108547210693, "logits/rejected": -2.5125648975372314, "logps/chosen": -291.6443176269531, "logps/rejected": -256.5786437988281, "loss": 0.592, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1465241014957428, "rewards/margins": 0.8940057754516602, "rewards/rejected": -1.0405299663543701, "step": 230 }, { "epoch": 0.25, "learning_rate": 4.9899824869915e-07, "logits/chosen": -2.5353310108184814, "logits/rejected": -2.487065315246582, "logps/chosen": -273.65447998046875, "logps/rejected": -234.3402557373047, "loss": 0.5545, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11877474933862686, "rewards/margins": 0.7978388667106628, "rewards/rejected": -0.9166136980056763, "step": 240 }, { "epoch": 0.26, "learning_rate": 4.985480849482012e-07, "logits/chosen": -2.557776689529419, "logits/rejected": -2.557042121887207, "logps/chosen": -264.174560546875, "logps/rejected": -287.1108703613281, "loss": 0.5466, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.03126413747668266, "rewards/margins": 0.7518512010574341, "rewards/rejected": -0.7205870151519775, "step": 250 }, { "epoch": 0.27, "learning_rate": 4.980149081559142e-07, "logits/chosen": -2.5667481422424316, "logits/rejected": -2.5552361011505127, "logps/chosen": -282.69219970703125, "logps/rejected": -264.5124206542969, "loss": 0.5502, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.08340673893690109, "rewards/margins": 0.9245700836181641, "rewards/rejected": -0.8411632776260376, "step": 260 }, { "epoch": 0.28, "learning_rate": 4.973988963990065e-07, "logits/chosen": -2.6153595447540283, "logits/rejected": -2.6135799884796143, "logps/chosen": -302.2950439453125, "logps/rejected": -262.1023864746094, "loss": 0.5263, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.25781750679016113, "rewards/margins": 1.005016803741455, "rewards/rejected": -1.2628343105316162, "step": 270 }, { "epoch": 0.29, "learning_rate": 4.967002554204008e-07, "logits/chosen": -2.649857997894287, "logits/rejected": -2.6510024070739746, "logps/chosen": -292.15399169921875, "logps/rejected": -293.506103515625, "loss": 0.5183, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.21941521763801575, "rewards/margins": 0.8853797912597656, "rewards/rejected": -1.104794979095459, "step": 280 }, { "epoch": 0.3, "learning_rate": 4.959192185605087e-07, "logits/chosen": -2.582623243331909, "logits/rejected": -2.5996947288513184, "logps/chosen": -281.49810791015625, "logps/rejected": -257.9612731933594, "loss": 0.5675, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.2765043377876282, "rewards/margins": 0.9062711000442505, "rewards/rejected": -0.6297667026519775, "step": 290 }, { "epoch": 0.31, "learning_rate": 4.950560466792969e-07, "logits/chosen": -2.5451674461364746, "logits/rejected": -2.519487142562866, "logps/chosen": -279.9990539550781, "logps/rejected": -258.7757873535156, "loss": 0.5315, "rewards/accuracies": 0.75, "rewards/chosen": 0.20309853553771973, "rewards/margins": 1.2224222421646118, "rewards/rejected": -1.0193235874176025, "step": 300 }, { "epoch": 0.31, "eval_logits/chosen": -2.5710976123809814, "eval_logits/rejected": -2.533705234527588, "eval_logps/chosen": -280.52740478515625, "eval_logps/rejected": -269.9693908691406, "eval_loss": 0.5258406400680542, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": 0.14346502721309662, "eval_rewards/margins": 1.1192045211791992, "eval_rewards/rejected": -0.975739598274231, "eval_runtime": 226.9103, "eval_samples_per_second": 8.814, "eval_steps_per_second": 0.278, "step": 300 }, { "epoch": 0.32, "learning_rate": 4.941110280691619e-07, "logits/chosen": -2.4874155521392822, "logits/rejected": -2.457726001739502, "logps/chosen": -277.2449035644531, "logps/rejected": -276.86090087890625, "loss": 0.486, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.32208043336868286, "rewards/margins": 1.3255231380462646, "rewards/rejected": -1.0034427642822266, "step": 310 }, { "epoch": 0.33, "learning_rate": 4.930844783586424e-07, "logits/chosen": -2.5562355518341064, "logits/rejected": -2.521014451980591, "logps/chosen": -274.89434814453125, "logps/rejected": -274.78485107421875, "loss": 0.5288, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.12573584914207458, "rewards/margins": 1.0204110145568848, "rewards/rejected": -0.8946751356124878, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.919767404070033e-07, "logits/chosen": -2.5349462032318115, "logits/rejected": -2.5252394676208496, "logps/chosen": -267.4788818359375, "logps/rejected": -234.8790740966797, "loss": 0.5625, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.5486379861831665, "rewards/margins": 0.8442266583442688, "rewards/rejected": -0.29558849334716797, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.907881841897216e-07, "logits/chosen": -2.6037631034851074, "logits/rejected": -2.558772325515747, "logps/chosen": -238.9552764892578, "logps/rejected": -223.467041015625, "loss": 0.5941, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6076385378837585, "rewards/margins": 0.6313496828079224, "rewards/rejected": -0.023711146786808968, "step": 340 }, { "epoch": 0.37, "learning_rate": 4.895192066749189e-07, "logits/chosen": -2.572120189666748, "logits/rejected": -2.6085386276245117, "logps/chosen": -269.9642639160156, "logps/rejected": -285.7270812988281, "loss": 0.5593, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.03363977000117302, "rewards/margins": 0.9270251393318176, "rewards/rejected": -0.8933852910995483, "step": 350 }, { "epoch": 0.38, "learning_rate": 4.881702316907768e-07, "logits/chosen": -2.598543167114258, "logits/rejected": -2.6146891117095947, "logps/chosen": -286.3782958984375, "logps/rejected": -255.3352813720703, "loss": 0.61, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.7369937300682068, "rewards/margins": 0.7843579649925232, "rewards/rejected": -1.5213515758514404, "step": 360 }, { "epoch": 0.39, "learning_rate": 4.86741709783982e-07, "logits/chosen": -2.5297656059265137, "logits/rejected": -2.5421297550201416, "logps/chosen": -241.9557647705078, "logps/rejected": -266.3279724121094, "loss": 0.4914, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.32598692178726196, "rewards/margins": 1.1650376319885254, "rewards/rejected": -1.491024374961853, "step": 370 }, { "epoch": 0.4, "learning_rate": 4.85234118069247e-07, "logits/chosen": -2.6338109970092773, "logits/rejected": -2.5728697776794434, "logps/chosen": -291.5673522949219, "logps/rejected": -269.80145263671875, "loss": 0.5416, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.06620955467224121, "rewards/margins": 1.0793496370315552, "rewards/rejected": -1.013140082359314, "step": 380 }, { "epoch": 0.41, "learning_rate": 4.836479600699578e-07, "logits/chosen": -2.508533239364624, "logits/rejected": -2.498030185699463, "logps/chosen": -271.2484436035156, "logps/rejected": -244.17117309570312, "loss": 0.5335, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.11522499471902847, "rewards/margins": 1.244307279586792, "rewards/rejected": -1.1290823221206665, "step": 390 }, { "epoch": 0.42, "learning_rate": 4.819837655500013e-07, "logits/chosen": -2.610954761505127, "logits/rejected": -2.583712100982666, "logps/chosen": -305.75408935546875, "logps/rejected": -277.2504577636719, "loss": 0.4978, "rewards/accuracies": 0.75, "rewards/chosen": -0.2530876696109772, "rewards/margins": 1.0643428564071655, "rewards/rejected": -1.3174306154251099, "step": 400 }, { "epoch": 0.42, "eval_logits/chosen": -2.601116895675659, "eval_logits/rejected": -2.566725969314575, "eval_logps/chosen": -284.13909912109375, "eval_logps/rejected": -273.0382995605469, "eval_loss": 0.5365874171257019, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": -0.2177048623561859, "eval_rewards/margins": 1.064923882484436, "eval_rewards/rejected": -1.2826287746429443, "eval_runtime": 226.3544, "eval_samples_per_second": 8.836, "eval_steps_per_second": 0.278, "step": 400 }, { "epoch": 0.43, "learning_rate": 4.802420903368285e-07, "logits/chosen": -2.681976795196533, "logits/rejected": -2.616307258605957, "logps/chosen": -314.3067321777344, "logps/rejected": -302.2765197753906, "loss": 0.5174, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12892000377178192, "rewards/margins": 1.0059425830841064, "rewards/rejected": -1.1348625421524048, "step": 410 }, { "epoch": 0.44, "learning_rate": 4.784235161358123e-07, "logits/chosen": -2.568340539932251, "logits/rejected": -2.46895170211792, "logps/chosen": -261.3116455078125, "logps/rejected": -237.0153045654297, "loss": 0.5485, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.15922246873378754, "rewards/margins": 0.9320141077041626, "rewards/rejected": -0.7727915644645691, "step": 420 }, { "epoch": 0.45, "learning_rate": 4.7652865033596314e-07, "logits/chosen": -2.582084894180298, "logits/rejected": -2.4861457347869873, "logps/chosen": -259.3132629394531, "logps/rejected": -245.1965789794922, "loss": 0.5428, "rewards/accuracies": 0.71875, "rewards/chosen": 0.17654812335968018, "rewards/margins": 0.8880146741867065, "rewards/rejected": -0.7114666104316711, "step": 430 }, { "epoch": 0.46, "learning_rate": 4.7455812580706534e-07, "logits/chosen": -2.5245871543884277, "logits/rejected": -2.6036410331726074, "logps/chosen": -256.15679931640625, "logps/rejected": -257.1427307128906, "loss": 0.5171, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.05140058323740959, "rewards/margins": 0.8768993616104126, "rewards/rejected": -0.8254987001419067, "step": 440 }, { "epoch": 0.47, "learning_rate": 4.725126006883046e-07, "logits/chosen": -2.4219858646392822, "logits/rejected": -2.4380393028259277, "logps/chosen": -257.55950927734375, "logps/rejected": -252.17617797851562, "loss": 0.5499, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.1307249814271927, "rewards/margins": 1.2146170139312744, "rewards/rejected": -1.0838921070098877, "step": 450 }, { "epoch": 0.48, "learning_rate": 4.703927581684539e-07, "logits/chosen": -2.5411155223846436, "logits/rejected": -2.4892051219940186, "logps/chosen": -263.80718994140625, "logps/rejected": -269.1478271484375, "loss": 0.5068, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.025493722409009933, "rewards/margins": 1.4079092741012573, "rewards/rejected": -1.382415533065796, "step": 460 }, { "epoch": 0.49, "learning_rate": 4.68199306257695e-07, "logits/chosen": -2.5160670280456543, "logits/rejected": -2.5236003398895264, "logps/chosen": -295.74859619140625, "logps/rejected": -267.48406982421875, "loss": 0.5103, "rewards/accuracies": 0.75, "rewards/chosen": 0.044890422374010086, "rewards/margins": 1.135880947113037, "rewards/rejected": -1.0909905433654785, "step": 470 }, { "epoch": 0.5, "learning_rate": 4.6593297755114776e-07, "logits/chosen": -2.429823160171509, "logits/rejected": -2.3565280437469482, "logps/chosen": -230.04013061523438, "logps/rejected": -235.0593719482422, "loss": 0.5476, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.04353000968694687, "rewards/margins": 1.120490312576294, "rewards/rejected": -1.0769603252410889, "step": 480 }, { "epoch": 0.51, "learning_rate": 4.635945289841902e-07, "logits/chosen": -2.44765043258667, "logits/rejected": -2.4359757900238037, "logps/chosen": -272.905517578125, "logps/rejected": -257.1273193359375, "loss": 0.5412, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.009350990876555443, "rewards/margins": 1.0459551811218262, "rewards/rejected": -1.055306077003479, "step": 490 }, { "epoch": 0.52, "learning_rate": 4.611847415796476e-07, "logits/chosen": -2.437455654144287, "logits/rejected": -2.4236032962799072, "logps/chosen": -273.6705627441406, "logps/rejected": -263.4073181152344, "loss": 0.5134, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4976045489311218, "rewards/margins": 1.1695888042449951, "rewards/rejected": -1.6671931743621826, "step": 500 }, { "epoch": 0.52, "eval_logits/chosen": -2.4835798740386963, "eval_logits/rejected": -2.4488439559936523, "eval_logps/chosen": -286.6747741699219, "eval_logps/rejected": -275.3515625, "eval_loss": 0.5339823961257935, "eval_rewards/accuracies": 0.7460317611694336, "eval_rewards/chosen": -0.47127577662467957, "eval_rewards/margins": 1.0426832437515259, "eval_rewards/rejected": -1.5139589309692383, "eval_runtime": 226.2176, "eval_samples_per_second": 8.841, "eval_steps_per_second": 0.278, "step": 500 }, { "epoch": 0.53, "learning_rate": 4.5870442018693773e-07, "logits/chosen": -2.432690382003784, "logits/rejected": -2.4488320350646973, "logps/chosen": -301.65472412109375, "logps/rejected": -263.7899169921875, "loss": 0.4985, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.31870347261428833, "rewards/margins": 1.2717561721801758, "rewards/rejected": -1.5904595851898193, "step": 510 }, { "epoch": 0.54, "learning_rate": 4.5615439321325735e-07, "logits/chosen": -2.497467517852783, "logits/rejected": -2.43135404586792, "logps/chosen": -257.162109375, "logps/rejected": -260.63873291015625, "loss": 0.5405, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.0005254775169305503, "rewards/margins": 1.253112554550171, "rewards/rejected": -1.2536382675170898, "step": 520 }, { "epoch": 0.55, "learning_rate": 4.535355123469008e-07, "logits/chosen": -2.5136122703552246, "logits/rejected": -2.514291286468506, "logps/chosen": -280.01190185546875, "logps/rejected": -266.7725524902344, "loss": 0.5029, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.18937644362449646, "rewards/margins": 1.3221962451934814, "rewards/rejected": -1.132819652557373, "step": 530 }, { "epoch": 0.57, "learning_rate": 4.5084865227280366e-07, "logits/chosen": -2.5600991249084473, "logits/rejected": -2.5647799968719482, "logps/chosen": -290.2501220703125, "logps/rejected": -308.81463623046875, "loss": 0.5412, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1531665325164795, "rewards/margins": 1.4022881984710693, "rewards/rejected": -1.2491216659545898, "step": 540 }, { "epoch": 0.58, "learning_rate": 4.4809471038040437e-07, "logits/chosen": -2.5655248165130615, "logits/rejected": -2.5592260360717773, "logps/chosen": -265.06158447265625, "logps/rejected": -254.6921844482422, "loss": 0.5304, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4487919807434082, "rewards/margins": 1.197060465812683, "rewards/rejected": -0.7482684850692749, "step": 550 }, { "epoch": 0.59, "learning_rate": 4.4527460646392386e-07, "logits/chosen": -2.462329387664795, "logits/rejected": -2.481189489364624, "logps/chosen": -264.7054443359375, "logps/rejected": -241.17300415039062, "loss": 0.5908, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.01212785579264164, "rewards/margins": 0.978294849395752, "rewards/rejected": -0.9661668539047241, "step": 560 }, { "epoch": 0.6, "learning_rate": 4.4238928241516163e-07, "logits/chosen": -2.4593071937561035, "logits/rejected": -2.425731897354126, "logps/chosen": -276.5869445800781, "logps/rejected": -269.2005310058594, "loss": 0.583, "rewards/accuracies": 0.75, "rewards/chosen": -0.11743898689746857, "rewards/margins": 0.9975796937942505, "rewards/rejected": -1.115018606185913, "step": 570 }, { "epoch": 0.61, "learning_rate": 4.394397019089116e-07, "logits/chosen": -2.379189968109131, "logits/rejected": -2.3689961433410645, "logps/chosen": -265.9002685546875, "logps/rejected": -244.2664031982422, "loss": 0.5335, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.3145141899585724, "rewards/margins": 1.2020901441574097, "rewards/rejected": -0.8875759243965149, "step": 580 }, { "epoch": 0.62, "learning_rate": 4.3642685008110246e-07, "logits/chosen": -2.3947653770446777, "logits/rejected": -2.3050079345703125, "logps/chosen": -269.3788757324219, "logps/rejected": -284.05023193359375, "loss": 0.532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2743293344974518, "rewards/margins": 0.9255713224411011, "rewards/rejected": -0.6512419581413269, "step": 590 }, { "epoch": 0.63, "learning_rate": 4.333517331997704e-07, "logits/chosen": -2.3771307468414307, "logits/rejected": -2.360133647918701, "logps/chosen": -322.1334533691406, "logps/rejected": -320.4483642578125, "loss": 0.5404, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03305041417479515, "rewards/margins": 1.0837600231170654, "rewards/rejected": -1.0507094860076904, "step": 600 }, { "epoch": 0.63, "eval_logits/chosen": -2.4039180278778076, "eval_logits/rejected": -2.363107442855835, "eval_logps/chosen": -282.49615478515625, "eval_logps/rejected": -273.1927795410156, "eval_loss": 0.5188466310501099, "eval_rewards/accuracies": 0.7480158805847168, "eval_rewards/chosen": -0.053411100059747696, "eval_rewards/margins": 1.2446662187576294, "eval_rewards/rejected": -1.2980774641036987, "eval_runtime": 226.4153, "eval_samples_per_second": 8.833, "eval_steps_per_second": 0.278, "step": 600 }, { "epoch": 0.64, "learning_rate": 4.302153783289736e-07, "logits/chosen": -2.3987278938293457, "logits/rejected": -2.349885940551758, "logps/chosen": -275.1676025390625, "logps/rejected": -238.166015625, "loss": 0.5057, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.22538134455680847, "rewards/margins": 1.2502909898757935, "rewards/rejected": -1.4756723642349243, "step": 610 }, { "epoch": 0.65, "learning_rate": 4.2701883298576124e-07, "logits/chosen": -2.4204845428466797, "logits/rejected": -2.3872830867767334, "logps/chosen": -276.31207275390625, "logps/rejected": -246.4015350341797, "loss": 0.5219, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5614226460456848, "rewards/margins": 1.1581157445907593, "rewards/rejected": -1.7195383310317993, "step": 620 }, { "epoch": 0.66, "learning_rate": 4.237631647903115e-07, "logits/chosen": -2.3450286388397217, "logits/rejected": -2.375546932220459, "logps/chosen": -262.30255126953125, "logps/rejected": -268.90081787109375, "loss": 0.4865, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2230757176876068, "rewards/margins": 1.3167085647583008, "rewards/rejected": -1.53978431224823, "step": 630 }, { "epoch": 0.67, "learning_rate": 4.204494611093548e-07, "logits/chosen": -2.313415288925171, "logits/rejected": -2.3189613819122314, "logps/chosen": -279.33447265625, "logps/rejected": -267.25091552734375, "loss": 0.5633, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.33714792132377625, "rewards/margins": 1.0815935134887695, "rewards/rejected": -1.4187414646148682, "step": 640 }, { "epoch": 0.68, "learning_rate": 4.1707882869300235e-07, "logits/chosen": -2.4951534271240234, "logits/rejected": -2.4177632331848145, "logps/chosen": -306.20562744140625, "logps/rejected": -271.62957763671875, "loss": 0.5365, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.07749436795711517, "rewards/margins": 1.0011107921600342, "rewards/rejected": -1.0786052942276, "step": 650 }, { "epoch": 0.69, "learning_rate": 4.136523933051005e-07, "logits/chosen": -2.48036789894104, "logits/rejected": -2.5561745166778564, "logps/chosen": -298.15081787109375, "logps/rejected": -275.85784912109375, "loss": 0.5123, "rewards/accuracies": 0.75, "rewards/chosen": 0.18228770792484283, "rewards/margins": 1.110039472579956, "rewards/rejected": -0.927751898765564, "step": 660 }, { "epoch": 0.7, "learning_rate": 4.101712993472348e-07, "logits/chosen": -2.3927178382873535, "logits/rejected": -2.3628573417663574, "logps/chosen": -279.5555725097656, "logps/rejected": -277.4151916503906, "loss": 0.5174, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.13750191032886505, "rewards/margins": 1.1913979053497314, "rewards/rejected": -1.0538959503173828, "step": 670 }, { "epoch": 0.71, "learning_rate": 4.066367094765091e-07, "logits/chosen": -2.3350799083709717, "logits/rejected": -2.3456828594207764, "logps/chosen": -257.46600341796875, "logps/rejected": -244.01730346679688, "loss": 0.5449, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.08425486087799072, "rewards/margins": 0.739366888999939, "rewards/rejected": -0.6551119089126587, "step": 680 }, { "epoch": 0.72, "learning_rate": 4.0304980421722766e-07, "logits/chosen": -2.3606956005096436, "logits/rejected": -2.308849573135376, "logps/chosen": -256.06671142578125, "logps/rejected": -248.5846710205078, "loss": 0.4933, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.35821112990379333, "rewards/margins": 1.1378281116485596, "rewards/rejected": -0.7796168327331543, "step": 690 }, { "epoch": 0.73, "learning_rate": 3.994117815666095e-07, "logits/chosen": -2.320355176925659, "logits/rejected": -2.2387115955352783, "logps/chosen": -254.97274780273438, "logps/rejected": -257.1867980957031, "loss": 0.5256, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2872292399406433, "rewards/margins": 1.2621638774871826, "rewards/rejected": -1.5493931770324707, "step": 700 }, { "epoch": 0.73, "eval_logits/chosen": -2.3640167713165283, "eval_logits/rejected": -2.3223814964294434, "eval_logps/chosen": -284.4947509765625, "eval_logps/rejected": -275.916259765625, "eval_loss": 0.5269842743873596, "eval_rewards/accuracies": 0.7638888955116272, "eval_rewards/chosen": -0.25327354669570923, "eval_rewards/margins": 1.317151427268982, "eval_rewards/rejected": -1.570425033569336, "eval_runtime": 226.7679, "eval_samples_per_second": 8.82, "eval_steps_per_second": 0.278, "step": 700 }, { "epoch": 0.74, "learning_rate": 3.957238565946671e-07, "logits/chosen": -2.345858335494995, "logits/rejected": -2.282494068145752, "logps/chosen": -307.03070068359375, "logps/rejected": -252.6605987548828, "loss": 0.4995, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.31514400243759155, "rewards/margins": 1.2233600616455078, "rewards/rejected": -1.5385040044784546, "step": 710 }, { "epoch": 0.75, "learning_rate": 3.9198726103838306e-07, "logits/chosen": -2.321413516998291, "logits/rejected": -2.3380115032196045, "logps/chosen": -294.9783020019531, "logps/rejected": -284.97711181640625, "loss": 0.4904, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.4381527900695801, "rewards/margins": 1.4210107326507568, "rewards/rejected": -1.8591632843017578, "step": 720 }, { "epoch": 0.76, "learning_rate": 3.8820324289031946e-07, "logits/chosen": -2.3153445720672607, "logits/rejected": -2.299293041229248, "logps/chosen": -302.4072265625, "logps/rejected": -262.1842346191406, "loss": 0.4979, "rewards/accuracies": 0.75, "rewards/chosen": 0.04457683488726616, "rewards/margins": 1.197031021118164, "rewards/rejected": -1.152454137802124, "step": 730 }, { "epoch": 0.77, "learning_rate": 3.84373065981799e-07, "logits/chosen": -2.2789864540100098, "logits/rejected": -2.287776470184326, "logps/chosen": -310.31085205078125, "logps/rejected": -270.4225158691406, "loss": 0.4977, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2677065134048462, "rewards/margins": 1.452398657798767, "rewards/rejected": -1.1846922636032104, "step": 740 }, { "epoch": 0.78, "learning_rate": 3.8049800956079545e-07, "logits/chosen": -2.279670238494873, "logits/rejected": -2.283494234085083, "logps/chosen": -281.9285583496094, "logps/rejected": -287.0923767089844, "loss": 0.5456, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.13261595368385315, "rewards/margins": 1.1431291103363037, "rewards/rejected": -1.2757450342178345, "step": 750 }, { "epoch": 0.8, "learning_rate": 3.7657936786467525e-07, "logits/chosen": -2.302417516708374, "logits/rejected": -2.3380768299102783, "logps/chosen": -280.0326843261719, "logps/rejected": -265.580810546875, "loss": 0.4944, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34787678718566895, "rewards/margins": 1.2519744634628296, "rewards/rejected": -1.599851369857788, "step": 760 }, { "epoch": 0.81, "learning_rate": 3.7261844968793226e-07, "logits/chosen": -2.3550281524658203, "logits/rejected": -2.328148365020752, "logps/chosen": -295.94879150390625, "logps/rejected": -310.93951416015625, "loss": 0.5443, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6231586933135986, "rewards/margins": 1.0639232397079468, "rewards/rejected": -1.6870819330215454, "step": 770 }, { "epoch": 0.82, "learning_rate": 3.6861657794506187e-07, "logits/chosen": -2.3366761207580566, "logits/rejected": -2.3406996726989746, "logps/chosen": -258.4978942871094, "logps/rejected": -248.54598999023438, "loss": 0.5286, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3897480368614197, "rewards/margins": 0.9254152178764343, "rewards/rejected": -1.315163254737854, "step": 780 }, { "epoch": 0.83, "learning_rate": 3.6457508922871777e-07, "logits/chosen": -2.3761212825775146, "logits/rejected": -2.3622617721557617, "logps/chosen": -301.3812255859375, "logps/rejected": -298.0675048828125, "loss": 0.5199, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.30222287774086, "rewards/margins": 1.3482304811477661, "rewards/rejected": -1.6504533290863037, "step": 790 }, { "epoch": 0.84, "learning_rate": 3.6049533336330084e-07, "logits/chosen": -2.3272039890289307, "logits/rejected": -2.228933572769165, "logps/chosen": -339.2475280761719, "logps/rejected": -294.8114013671875, "loss": 0.4991, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.338949978351593, "rewards/margins": 1.4692856073379517, "rewards/rejected": -1.8082354068756104, "step": 800 }, { "epoch": 0.84, "eval_logits/chosen": -2.4144346714019775, "eval_logits/rejected": -2.3730151653289795, "eval_logps/chosen": -284.3555908203125, "eval_logps/rejected": -275.48785400390625, "eval_loss": 0.5277653336524963, "eval_rewards/accuracies": 0.7638888955116272, "eval_rewards/chosen": -0.23935770988464355, "eval_rewards/margins": 1.2882306575775146, "eval_rewards/rejected": -1.5275882482528687, "eval_runtime": 226.9947, "eval_samples_per_second": 8.811, "eval_steps_per_second": 0.278, "step": 800 }, { "epoch": 0.85, "learning_rate": 3.56378672954129e-07, "logits/chosen": -2.371492385864258, "logits/rejected": -2.309382200241089, "logps/chosen": -280.04901123046875, "logps/rejected": -250.42471313476562, "loss": 0.5256, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.29708132147789, "rewards/margins": 1.1195948123931885, "rewards/rejected": -1.4166762828826904, "step": 810 }, { "epoch": 0.86, "learning_rate": 3.5222648293233803e-07, "logits/chosen": -2.3205182552337646, "logits/rejected": -2.410818576812744, "logps/chosen": -251.86279296875, "logps/rejected": -247.7059783935547, "loss": 0.5668, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.08637113124132156, "rewards/margins": 1.2163774967193604, "rewards/rejected": -1.302748441696167, "step": 820 }, { "epoch": 0.87, "learning_rate": 3.480401500956657e-07, "logits/chosen": -2.4300191402435303, "logits/rejected": -2.364224910736084, "logps/chosen": -248.8407745361328, "logps/rejected": -254.6449432373047, "loss": 0.5004, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11026261001825333, "rewards/margins": 1.2070744037628174, "rewards/rejected": -1.3173370361328125, "step": 830 }, { "epoch": 0.88, "learning_rate": 3.438210726452724e-07, "logits/chosen": -2.3442130088806152, "logits/rejected": -2.423654079437256, "logps/chosen": -306.9721374511719, "logps/rejected": -293.9955749511719, "loss": 0.5428, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.07187880575656891, "rewards/margins": 1.2101354598999023, "rewards/rejected": -1.28201425075531, "step": 840 }, { "epoch": 0.89, "learning_rate": 3.395706597187538e-07, "logits/chosen": -2.494635820388794, "logits/rejected": -2.4887969493865967, "logps/chosen": -300.2822570800781, "logps/rejected": -255.76779174804688, "loss": 0.5266, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.10039399564266205, "rewards/margins": 1.0088200569152832, "rewards/rejected": -1.1092140674591064, "step": 850 }, { "epoch": 0.9, "learning_rate": 3.3529033091949986e-07, "logits/chosen": -2.4344663619995117, "logits/rejected": -2.4320755004882812, "logps/chosen": -280.1997985839844, "logps/rejected": -265.43438720703125, "loss": 0.5136, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.34707534313201904, "rewards/margins": 1.197710394859314, "rewards/rejected": -1.544785737991333, "step": 860 }, { "epoch": 0.91, "learning_rate": 3.309815158425591e-07, "logits/chosen": -2.477915048599243, "logits/rejected": -2.501054048538208, "logps/chosen": -246.0802001953125, "logps/rejected": -259.4007873535156, "loss": 0.5906, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.03818273916840553, "rewards/margins": 1.3956449031829834, "rewards/rejected": -1.3574622869491577, "step": 870 }, { "epoch": 0.92, "learning_rate": 3.2664565359716536e-07, "logits/chosen": -2.5257723331451416, "logits/rejected": -2.4535813331604004, "logps/chosen": -272.57464599609375, "logps/rejected": -262.0669860839844, "loss": 0.5031, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.043708182871341705, "rewards/margins": 1.1104907989501953, "rewards/rejected": -1.1541990041732788, "step": 880 }, { "epoch": 0.93, "learning_rate": 3.222841923260869e-07, "logits/chosen": -2.486614227294922, "logits/rejected": -2.430428981781006, "logps/chosen": -266.1326599121094, "logps/rejected": -275.91021728515625, "loss": 0.5121, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.0877460464835167, "rewards/margins": 1.2036986351013184, "rewards/rejected": -1.2914446592330933, "step": 890 }, { "epoch": 0.94, "learning_rate": 3.1789858872195887e-07, "logits/chosen": -2.4623024463653564, "logits/rejected": -2.3918118476867676, "logps/chosen": -293.54876708984375, "logps/rejected": -294.0716247558594, "loss": 0.5084, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.14543576538562775, "rewards/margins": 1.5042603015899658, "rewards/rejected": -1.358824610710144, "step": 900 }, { "epoch": 0.94, "eval_logits/chosen": -2.525357484817505, "eval_logits/rejected": -2.4875481128692627, "eval_logps/chosen": -279.2980651855469, "eval_logps/rejected": -269.75811767578125, "eval_loss": 0.545681357383728, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": 0.26639631390571594, "eval_rewards/margins": 1.2210099697113037, "eval_rewards/rejected": -0.9546135663986206, "eval_runtime": 226.8439, "eval_samples_per_second": 8.817, "eval_steps_per_second": 0.278, "step": 900 }, { "epoch": 0.95, "learning_rate": 3.1349030754075937e-07, "logits/chosen": -2.4634242057800293, "logits/rejected": -2.4429314136505127, "logps/chosen": -244.9004364013672, "logps/rejected": -252.3045196533203, "loss": 0.5056, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1813797950744629, "rewards/margins": 1.2861802577972412, "rewards/rejected": -1.1048003435134888, "step": 910 }, { "epoch": 0.96, "learning_rate": 3.090608211125931e-07, "logits/chosen": -2.4693336486816406, "logits/rejected": -2.447345018386841, "logps/chosen": -290.149658203125, "logps/rejected": -268.160400390625, "loss": 0.4786, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.1346684992313385, "rewards/margins": 1.4626705646514893, "rewards/rejected": -1.3280022144317627, "step": 920 }, { "epoch": 0.97, "learning_rate": 3.0461160884994487e-07, "logits/chosen": -2.4989802837371826, "logits/rejected": -2.479954957962036, "logps/chosen": -268.21307373046875, "logps/rejected": -252.8021240234375, "loss": 0.5034, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2526644468307495, "rewards/margins": 1.299523949623108, "rewards/rejected": -1.0468595027923584, "step": 930 }, { "epoch": 0.98, "learning_rate": 3.001441567535681e-07, "logits/chosen": -2.4905037879943848, "logits/rejected": -2.4868297576904297, "logps/chosen": -322.7031555175781, "logps/rejected": -302.384033203125, "loss": 0.5202, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1863454431295395, "rewards/margins": 1.4165011644363403, "rewards/rejected": -1.2301557064056396, "step": 940 }, { "epoch": 0.99, "learning_rate": 2.956599569161724e-07, "logits/chosen": -2.523249387741089, "logits/rejected": -2.452022075653076, "logps/chosen": -283.5419921875, "logps/rejected": -276.04901123046875, "loss": 0.5186, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.08425289392471313, "rewards/margins": 1.4224612712860107, "rewards/rejected": -1.5067142248153687, "step": 950 }, { "epoch": 1.0, "learning_rate": 2.91160507024077e-07, "logits/chosen": -2.465339422225952, "logits/rejected": -2.3531413078308105, "logps/chosen": -259.94134521484375, "logps/rejected": -273.2628479003906, "loss": 0.3169, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.44497355818748474, "rewards/margins": 2.54945707321167, "rewards/rejected": -2.1044836044311523, "step": 960 }, { "epoch": 1.02, "learning_rate": 2.866473098569953e-07, "logits/chosen": -2.4656593799591064, "logits/rejected": -2.4711594581604004, "logps/chosen": -312.7133483886719, "logps/rejected": -352.4391174316406, "loss": 0.1041, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.9930880665779114, "rewards/margins": 4.52797794342041, "rewards/rejected": -3.5348899364471436, "step": 970 }, { "epoch": 1.03, "learning_rate": 2.8212187278611905e-07, "logits/chosen": -2.4395172595977783, "logits/rejected": -2.4456565380096436, "logps/chosen": -247.8092803955078, "logps/rejected": -293.00457763671875, "loss": 0.1067, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.7508343458175659, "rewards/margins": 4.162354469299316, "rewards/rejected": -3.411520481109619, "step": 980 }, { "epoch": 1.04, "learning_rate": 2.775857072706684e-07, "logits/chosen": -2.478757619857788, "logits/rejected": -2.481419086456299, "logps/chosen": -301.2716979980469, "logps/rejected": -311.4957275390625, "loss": 0.0912, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.0747854709625244, "rewards/margins": 4.808146953582764, "rewards/rejected": -3.7333617210388184, "step": 990 }, { "epoch": 1.05, "learning_rate": 2.7304032835307667e-07, "logits/chosen": -2.463419198989868, "logits/rejected": -2.459811210632324, "logps/chosen": -263.8146057128906, "logps/rejected": -299.4679260253906, "loss": 0.1011, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5132207274436951, "rewards/margins": 4.391068935394287, "rewards/rejected": -3.8778488636016846, "step": 1000 }, { "epoch": 1.05, "eval_logits/chosen": -2.477447271347046, "eval_logits/rejected": -2.4389193058013916, "eval_logps/chosen": -287.1976318359375, "eval_logps/rejected": -281.5762023925781, "eval_loss": 0.5360726118087769, "eval_rewards/accuracies": 0.7876983880996704, "eval_rewards/chosen": -0.5235590934753418, "eval_rewards/margins": 1.6128613948822021, "eval_rewards/rejected": -2.136420249938965, "eval_runtime": 226.7944, "eval_samples_per_second": 8.819, "eval_steps_per_second": 0.278, "step": 1000 }, { "epoch": 1.06, "learning_rate": 2.6848725415297884e-07, "logits/chosen": -2.4920871257781982, "logits/rejected": -2.4668002128601074, "logps/chosen": -288.52398681640625, "logps/rejected": -294.40106201171875, "loss": 0.0849, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8318729400634766, "rewards/margins": 4.401202201843262, "rewards/rejected": -3.5693297386169434, "step": 1010 }, { "epoch": 1.07, "learning_rate": 2.6392800536017183e-07, "logits/chosen": -2.4347338676452637, "logits/rejected": -2.415900468826294, "logps/chosen": -287.1625061035156, "logps/rejected": -317.97344970703125, "loss": 0.0926, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9968826174736023, "rewards/margins": 4.725643634796143, "rewards/rejected": -3.7287609577178955, "step": 1020 }, { "epoch": 1.08, "learning_rate": 2.59364104726716e-07, "logits/chosen": -2.4137046337127686, "logits/rejected": -2.4201440811157227, "logps/chosen": -259.9551696777344, "logps/rejected": -282.3338928222656, "loss": 0.0926, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6825924515724182, "rewards/margins": 4.598808765411377, "rewards/rejected": -3.9162163734436035, "step": 1030 }, { "epoch": 1.09, "learning_rate": 2.547970765583491e-07, "logits/chosen": -2.399484634399414, "logits/rejected": -2.3622524738311768, "logps/chosen": -254.76718139648438, "logps/rejected": -332.40692138671875, "loss": 0.1048, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.104675054550171, "rewards/margins": 5.040583610534668, "rewards/rejected": -3.935908079147339, "step": 1040 }, { "epoch": 1.1, "learning_rate": 2.502284462053799e-07, "logits/chosen": -2.464451789855957, "logits/rejected": -2.4598162174224854, "logps/chosen": -273.9252014160156, "logps/rejected": -282.4798278808594, "loss": 0.0988, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.222347617149353, "rewards/margins": 4.508743762969971, "rewards/rejected": -3.2863965034484863, "step": 1050 }, { "epoch": 1.11, "learning_rate": 2.4565973955323374e-07, "logits/chosen": -2.4067177772521973, "logits/rejected": -2.351435661315918, "logps/chosen": -315.61480712890625, "logps/rejected": -314.0069885253906, "loss": 0.0855, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4242727756500244, "rewards/margins": 5.2490129470825195, "rewards/rejected": -3.824740171432495, "step": 1060 }, { "epoch": 1.12, "learning_rate": 2.410924825128195e-07, "logits/chosen": -2.426487684249878, "logits/rejected": -2.4229023456573486, "logps/chosen": -251.529052734375, "logps/rejected": -255.94302368164062, "loss": 0.0828, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.053213119506836, "rewards/margins": 4.353865146636963, "rewards/rejected": -3.300652265548706, "step": 1070 }, { "epoch": 1.13, "learning_rate": 2.365282005108875e-07, "logits/chosen": -2.4227538108825684, "logits/rejected": -2.4418461322784424, "logps/chosen": -248.6163330078125, "logps/rejected": -300.9302673339844, "loss": 0.0797, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8265342712402344, "rewards/margins": 4.589941024780273, "rewards/rejected": -3.763406753540039, "step": 1080 }, { "epoch": 1.14, "learning_rate": 2.319684179805491e-07, "logits/chosen": -2.441734790802002, "logits/rejected": -2.451201915740967, "logps/chosen": -295.5838928222656, "logps/rejected": -313.68902587890625, "loss": 0.09, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8838626742362976, "rewards/margins": 4.967753887176514, "rewards/rejected": -4.083890914916992, "step": 1090 }, { "epoch": 1.15, "learning_rate": 2.2741465785212902e-07, "logits/chosen": -2.466045618057251, "logits/rejected": -2.4696879386901855, "logps/chosen": -261.8978576660156, "logps/rejected": -305.57568359375, "loss": 0.0942, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8575581312179565, "rewards/margins": 4.823992729187012, "rewards/rejected": -3.9664344787597656, "step": 1100 }, { "epoch": 1.15, "eval_logits/chosen": -2.4925894737243652, "eval_logits/rejected": -2.451488971710205, "eval_logps/chosen": -286.3182067871094, "eval_logps/rejected": -282.2591552734375, "eval_loss": 0.545380711555481, "eval_rewards/accuracies": 0.7896825671195984, "eval_rewards/chosen": -0.43561825156211853, "eval_rewards/margins": 1.7691000699996948, "eval_rewards/rejected": -2.2047183513641357, "eval_runtime": 226.8575, "eval_samples_per_second": 8.816, "eval_steps_per_second": 0.278, "step": 1100 }, { "epoch": 1.16, "learning_rate": 2.2286844104451843e-07, "logits/chosen": -2.522095203399658, "logits/rejected": -2.4959158897399902, "logps/chosen": -275.20220947265625, "logps/rejected": -306.86651611328125, "loss": 0.0883, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.150061845779419, "rewards/margins": 5.355578899383545, "rewards/rejected": -4.205517768859863, "step": 1110 }, { "epoch": 1.17, "learning_rate": 2.183312859572008e-07, "logits/chosen": -2.442845106124878, "logits/rejected": -2.4609451293945312, "logps/chosen": -285.9617614746094, "logps/rejected": -306.34881591796875, "loss": 0.0817, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.1253329515457153, "rewards/margins": 5.508823394775391, "rewards/rejected": -4.383490562438965, "step": 1120 }, { "epoch": 1.18, "learning_rate": 2.138047079631184e-07, "logits/chosen": -2.4442250728607178, "logits/rejected": -2.4877326488494873, "logps/chosen": -282.99237060546875, "logps/rejected": -302.3456115722656, "loss": 0.0806, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.9299386739730835, "rewards/margins": 4.6444220542907715, "rewards/rejected": -3.7144827842712402, "step": 1130 }, { "epoch": 1.19, "learning_rate": 2.0929021890255068e-07, "logits/chosen": -2.5162599086761475, "logits/rejected": -2.4039034843444824, "logps/chosen": -284.69293212890625, "logps/rejected": -282.2374267578125, "loss": 0.0689, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.1416810750961304, "rewards/margins": 4.925940036773682, "rewards/rejected": -3.784259080886841, "step": 1140 }, { "epoch": 1.2, "learning_rate": 2.0478932657817102e-07, "logits/chosen": -2.459836006164551, "logits/rejected": -2.450810432434082, "logps/chosen": -289.20245361328125, "logps/rejected": -318.3241882324219, "loss": 0.0899, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8015066385269165, "rewards/margins": 5.1224517822265625, "rewards/rejected": -4.320944786071777, "step": 1150 }, { "epoch": 1.21, "learning_rate": 2.0030353425145374e-07, "logits/chosen": -2.488434314727783, "logits/rejected": -2.420006275177002, "logps/chosen": -278.0094299316406, "logps/rejected": -317.5365295410156, "loss": 0.0773, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7895523309707642, "rewards/margins": 5.3374738693237305, "rewards/rejected": -4.547921657562256, "step": 1160 }, { "epoch": 1.22, "learning_rate": 1.9583434014059635e-07, "logits/chosen": -2.4857897758483887, "logits/rejected": -2.485476016998291, "logps/chosen": -303.67156982421875, "logps/rejected": -290.3270568847656, "loss": 0.1065, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8350151777267456, "rewards/margins": 4.640496730804443, "rewards/rejected": -3.8054821491241455, "step": 1170 }, { "epoch": 1.23, "learning_rate": 1.9138323692012733e-07, "logits/chosen": -2.42707896232605, "logits/rejected": -2.4794921875, "logps/chosen": -283.53973388671875, "logps/rejected": -280.3748474121094, "loss": 0.0785, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.4752848744392395, "rewards/margins": 4.622966766357422, "rewards/rejected": -4.147681713104248, "step": 1180 }, { "epoch": 1.25, "learning_rate": 1.8695171122236442e-07, "logits/chosen": -2.5199742317199707, "logits/rejected": -2.46282696723938, "logps/chosen": -278.2845458984375, "logps/rejected": -302.77618408203125, "loss": 0.0664, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7170212864875793, "rewards/margins": 5.141378879547119, "rewards/rejected": -4.424357891082764, "step": 1190 }, { "epoch": 1.26, "learning_rate": 1.8254124314089223e-07, "logits/chosen": -2.4795804023742676, "logits/rejected": -2.462161064147949, "logps/chosen": -293.730224609375, "logps/rejected": -313.1897277832031, "loss": 0.0817, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8153163194656372, "rewards/margins": 4.868264675140381, "rewards/rejected": -4.052948951721191, "step": 1200 }, { "epoch": 1.26, "eval_logits/chosen": -2.486276626586914, "eval_logits/rejected": -2.4441094398498535, "eval_logps/chosen": -289.5495300292969, "eval_logps/rejected": -286.06744384765625, "eval_loss": 0.5530261397361755, "eval_rewards/accuracies": 0.7857142686843872, "eval_rewards/chosen": -0.7587527632713318, "eval_rewards/margins": 1.8267929553985596, "eval_rewards/rejected": -2.585545778274536, "eval_runtime": 226.9182, "eval_samples_per_second": 8.814, "eval_steps_per_second": 0.278, "step": 1200 }, { "epoch": 1.27, "learning_rate": 1.7815330573622205e-07, "logits/chosen": -2.4787325859069824, "logits/rejected": -2.3763909339904785, "logps/chosen": -243.62338256835938, "logps/rejected": -287.4659118652344, "loss": 0.1113, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6325318813323975, "rewards/margins": 4.749757289886475, "rewards/rejected": -4.117225646972656, "step": 1210 }, { "epoch": 1.28, "learning_rate": 1.7378936454380274e-07, "logits/chosen": -2.4470252990722656, "logits/rejected": -2.431623935699463, "logps/chosen": -291.8919982910156, "logps/rejected": -297.94134521484375, "loss": 0.0809, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9836438298225403, "rewards/margins": 5.215204238891602, "rewards/rejected": -4.231560707092285, "step": 1220 }, { "epoch": 1.29, "learning_rate": 1.694508770845427e-07, "logits/chosen": -2.4745402336120605, "logits/rejected": -2.373675584793091, "logps/chosen": -305.07513427734375, "logps/rejected": -305.4457092285156, "loss": 0.097, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.8863117098808289, "rewards/margins": 4.978560924530029, "rewards/rejected": -4.092249393463135, "step": 1230 }, { "epoch": 1.3, "learning_rate": 1.651392923780105e-07, "logits/chosen": -2.4465365409851074, "logits/rejected": -2.441972255706787, "logps/chosen": -255.78231811523438, "logps/rejected": -292.06768798828125, "loss": 0.1047, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.5283718109130859, "rewards/margins": 4.56099271774292, "rewards/rejected": -4.032620906829834, "step": 1240 }, { "epoch": 1.31, "learning_rate": 1.6085605045847367e-07, "logits/chosen": -2.475860118865967, "logits/rejected": -2.4598336219787598, "logps/chosen": -282.69293212890625, "logps/rejected": -297.631103515625, "loss": 0.0903, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8150407075881958, "rewards/margins": 4.919914722442627, "rewards/rejected": -4.104874134063721, "step": 1250 }, { "epoch": 1.32, "learning_rate": 1.5660258189393944e-07, "logits/chosen": -2.408860683441162, "logits/rejected": -2.4077579975128174, "logps/chosen": -238.93527221679688, "logps/rejected": -282.56280517578125, "loss": 0.0792, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.4947360157966614, "rewards/margins": 4.470829963684082, "rewards/rejected": -3.9760937690734863, "step": 1260 }, { "epoch": 1.33, "learning_rate": 1.5238030730835577e-07, "logits/chosen": -2.487494468688965, "logits/rejected": -2.4405553340911865, "logps/chosen": -263.71319580078125, "logps/rejected": -285.8571472167969, "loss": 0.0893, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8358654975891113, "rewards/margins": 4.591804504394531, "rewards/rejected": -3.755938768386841, "step": 1270 }, { "epoch": 1.34, "learning_rate": 1.4819063690713564e-07, "logits/chosen": -2.5427050590515137, "logits/rejected": -2.5181336402893066, "logps/chosen": -281.62969970703125, "logps/rejected": -290.3141174316406, "loss": 0.0821, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.065869927406311, "rewards/margins": 4.983170509338379, "rewards/rejected": -3.9173007011413574, "step": 1280 }, { "epoch": 1.35, "learning_rate": 1.4403497000615883e-07, "logits/chosen": -2.453326940536499, "logits/rejected": -2.500046491622925, "logps/chosen": -280.1790771484375, "logps/rejected": -319.83416748046875, "loss": 0.0745, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.9492340087890625, "rewards/margins": 5.671108245849609, "rewards/rejected": -4.721873760223389, "step": 1290 }, { "epoch": 1.36, "learning_rate": 1.3991469456441272e-07, "logits/chosen": -2.4228312969207764, "logits/rejected": -2.390244245529175, "logps/chosen": -255.469482421875, "logps/rejected": -294.24822998046875, "loss": 0.0697, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8743129968643188, "rewards/margins": 5.1060991287231445, "rewards/rejected": -4.231786251068115, "step": 1300 }, { "epoch": 1.36, "eval_logits/chosen": -2.490978479385376, "eval_logits/rejected": -2.447410821914673, "eval_logps/chosen": -287.88104248046875, "eval_logps/rejected": -284.9020690917969, "eval_loss": 0.5549479722976685, "eval_rewards/accuracies": 0.7797619104385376, "eval_rewards/chosen": -0.5919035077095032, "eval_rewards/margins": 1.8771038055419922, "eval_rewards/rejected": -2.4690072536468506, "eval_runtime": 226.9842, "eval_samples_per_second": 8.811, "eval_steps_per_second": 0.278, "step": 1300 }, { "epoch": 1.37, "learning_rate": 1.358311867204244e-07, "logits/chosen": -2.4280619621276855, "logits/rejected": -2.343656063079834, "logps/chosen": -268.0599060058594, "logps/rejected": -297.6957702636719, "loss": 0.0947, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6798113584518433, "rewards/margins": 5.193746566772461, "rewards/rejected": -4.513935565948486, "step": 1310 }, { "epoch": 1.38, "learning_rate": 1.3178581033264216e-07, "logits/chosen": -2.4262986183166504, "logits/rejected": -2.394023895263672, "logps/chosen": -259.4393310546875, "logps/rejected": -309.8895568847656, "loss": 0.0858, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5569504499435425, "rewards/margins": 4.9072160720825195, "rewards/rejected": -4.3502655029296875, "step": 1320 }, { "epoch": 1.39, "learning_rate": 1.2777991652391757e-07, "logits/chosen": -2.435516357421875, "logits/rejected": -2.4123950004577637, "logps/chosen": -276.7604064941406, "logps/rejected": -326.2718811035156, "loss": 0.106, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8505054712295532, "rewards/margins": 5.1305670738220215, "rewards/rejected": -4.280061721801758, "step": 1330 }, { "epoch": 1.4, "learning_rate": 1.2381484323024178e-07, "logits/chosen": -2.3960816860198975, "logits/rejected": -2.347435474395752, "logps/chosen": -262.6712646484375, "logps/rejected": -293.0451354980469, "loss": 0.0996, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.40400832891464233, "rewards/margins": 4.709560394287109, "rewards/rejected": -4.305551528930664, "step": 1340 }, { "epoch": 1.41, "learning_rate": 1.1989191475388516e-07, "logits/chosen": -2.409029245376587, "logits/rejected": -2.461874485015869, "logps/chosen": -233.5521697998047, "logps/rejected": -285.3503112792969, "loss": 0.076, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.8196706771850586, "rewards/margins": 4.8053669929504395, "rewards/rejected": -3.985696792602539, "step": 1350 }, { "epoch": 1.42, "learning_rate": 1.1601244132109179e-07, "logits/chosen": -2.4862618446350098, "logits/rejected": -2.4886698722839355, "logps/chosen": -297.08038330078125, "logps/rejected": -300.3394775390625, "loss": 0.0949, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9196721315383911, "rewards/margins": 5.271363735198975, "rewards/rejected": -4.351692199707031, "step": 1360 }, { "epoch": 1.43, "learning_rate": 1.1217771864447395e-07, "logits/chosen": -2.574280023574829, "logits/rejected": -2.505779504776001, "logps/chosen": -286.8680419921875, "logps/rejected": -280.51043701171875, "loss": 0.1062, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8500127792358398, "rewards/margins": 4.238048076629639, "rewards/rejected": -3.388035297393799, "step": 1370 }, { "epoch": 1.44, "learning_rate": 1.0838902749025499e-07, "logits/chosen": -2.489686965942383, "logits/rejected": -2.460463047027588, "logps/chosen": -263.9564514160156, "logps/rejected": -331.0151672363281, "loss": 0.0824, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.8998570442199707, "rewards/margins": 5.113956451416016, "rewards/rejected": -4.214098930358887, "step": 1380 }, { "epoch": 1.45, "learning_rate": 1.0464763325050358e-07, "logits/chosen": -2.4643807411193848, "logits/rejected": -2.4676525592803955, "logps/chosen": -271.7199401855469, "logps/rejected": -290.0336608886719, "loss": 0.0902, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.772031307220459, "rewards/margins": 4.822333335876465, "rewards/rejected": -4.050302505493164, "step": 1390 }, { "epoch": 1.47, "learning_rate": 1.0095478552050346e-07, "logits/chosen": -2.510715961456299, "logits/rejected": -2.490159273147583, "logps/chosen": -300.0036315917969, "logps/rejected": -296.74310302734375, "loss": 0.0842, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.568196713924408, "rewards/margins": 5.018073081970215, "rewards/rejected": -4.44987678527832, "step": 1400 }, { "epoch": 1.47, "eval_logits/chosen": -2.510014533996582, "eval_logits/rejected": -2.4669189453125, "eval_logps/chosen": -289.3870544433594, "eval_logps/rejected": -286.6549987792969, "eval_loss": 0.557504415512085, "eval_rewards/accuracies": 0.7916666865348816, "eval_rewards/chosen": -0.7425019145011902, "eval_rewards/margins": 1.9017990827560425, "eval_rewards/rejected": -2.644301176071167, "eval_runtime": 227.1707, "eval_samples_per_second": 8.804, "eval_steps_per_second": 0.277, "step": 1400 }, { "epoch": 1.48, "learning_rate": 9.731171768139806e-08, "logits/chosen": -2.496253728866577, "logits/rejected": -2.4402523040771484, "logps/chosen": -295.94378662109375, "logps/rejected": -300.5093688964844, "loss": 0.0781, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9206317663192749, "rewards/margins": 5.305552959442139, "rewards/rejected": -4.384921073913574, "step": 1410 }, { "epoch": 1.49, "learning_rate": 9.37196464882522e-08, "logits/chosen": -2.4766173362731934, "logits/rejected": -2.46634840965271, "logps/chosen": -282.3688659667969, "logps/rejected": -320.37493896484375, "loss": 0.0759, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.712565541267395, "rewards/margins": 5.404716491699219, "rewards/rejected": -4.692151069641113, "step": 1420 }, { "epoch": 1.5, "learning_rate": 9.017977166366444e-08, "logits/chosen": -2.422487258911133, "logits/rejected": -2.3672680854797363, "logps/chosen": -273.617919921875, "logps/rejected": -306.58599853515625, "loss": 0.101, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.921700656414032, "rewards/margins": 5.180459499359131, "rewards/rejected": -4.258758544921875, "step": 1430 }, { "epoch": 1.51, "learning_rate": 8.669327549707095e-08, "logits/chosen": -2.5244929790496826, "logits/rejected": -2.382864475250244, "logps/chosen": -288.40362548828125, "logps/rejected": -311.01214599609375, "loss": 0.0752, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 1.0363457202911377, "rewards/margins": 5.327676296234131, "rewards/rejected": -4.2913312911987305, "step": 1440 }, { "epoch": 1.52, "learning_rate": 8.326132244986931e-08, "logits/chosen": -2.419132709503174, "logits/rejected": -2.466182231903076, "logps/chosen": -267.1825866699219, "logps/rejected": -295.91259765625, "loss": 0.0863, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6177647113800049, "rewards/margins": 4.658391952514648, "rewards/rejected": -4.040627479553223, "step": 1450 }, { "epoch": 1.53, "learning_rate": 7.988505876649862e-08, "logits/chosen": -2.503843307495117, "logits/rejected": -2.446990728378296, "logps/chosen": -288.59442138671875, "logps/rejected": -294.4771423339844, "loss": 0.1098, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9037265777587891, "rewards/margins": 5.174005031585693, "rewards/rejected": -4.270277976989746, "step": 1460 }, { "epoch": 1.54, "learning_rate": 7.656561209160248e-08, "logits/chosen": -2.491608142852783, "logits/rejected": -2.4506735801696777, "logps/chosen": -278.2688903808594, "logps/rejected": -324.8403625488281, "loss": 0.0789, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.7726919054985046, "rewards/margins": 4.673216342926025, "rewards/rejected": -3.900524139404297, "step": 1470 }, { "epoch": 1.55, "learning_rate": 7.330409109340562e-08, "logits/chosen": -2.4645445346832275, "logits/rejected": -2.4681849479675293, "logps/chosen": -298.295654296875, "logps/rejected": -309.98211669921875, "loss": 0.0759, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 1.0682966709136963, "rewards/margins": 5.494315147399902, "rewards/rejected": -4.426018714904785, "step": 1480 }, { "epoch": 1.56, "learning_rate": 7.010158509342681e-08, "logits/chosen": -2.45269513130188, "logits/rejected": -2.3928894996643066, "logps/chosen": -283.2000427246094, "logps/rejected": -311.6351318359375, "loss": 0.0861, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.9797157049179077, "rewards/margins": 5.27277135848999, "rewards/rejected": -4.293056011199951, "step": 1490 }, { "epoch": 1.57, "learning_rate": 6.695916370265527e-08, "logits/chosen": -2.468552589416504, "logits/rejected": -2.3719723224639893, "logps/chosen": -263.45855712890625, "logps/rejected": -324.563720703125, "loss": 0.075, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7729991674423218, "rewards/margins": 5.5292768478393555, "rewards/rejected": -4.756278038024902, "step": 1500 }, { "epoch": 1.57, "eval_logits/chosen": -2.5133254528045654, "eval_logits/rejected": -2.469863176345825, "eval_logps/chosen": -287.34356689453125, "eval_logps/rejected": -284.7437744140625, "eval_loss": 0.5590240359306335, "eval_rewards/accuracies": 0.795634925365448, "eval_rewards/chosen": -0.5381516218185425, "eval_rewards/margins": 1.915026307106018, "eval_rewards/rejected": -2.4531776905059814, "eval_runtime": 227.1555, "eval_samples_per_second": 8.805, "eval_steps_per_second": 0.277, "step": 1500 }, { "epoch": 1.58, "learning_rate": 6.387787646430853e-08, "logits/chosen": -2.4319896697998047, "logits/rejected": -2.457484483718872, "logps/chosen": -259.0374755859375, "logps/rejected": -290.4769287109375, "loss": 0.075, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7177586555480957, "rewards/margins": 5.074606418609619, "rewards/rejected": -4.356847286224365, "step": 1510 }, { "epoch": 1.59, "learning_rate": 6.0858752503294e-08, "logits/chosen": -2.452031135559082, "logits/rejected": -2.489417791366577, "logps/chosen": -285.9494934082031, "logps/rejected": -307.8658752441406, "loss": 0.0841, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.1370221376419067, "rewards/margins": 5.569462776184082, "rewards/rejected": -4.432440757751465, "step": 1520 }, { "epoch": 1.6, "learning_rate": 5.7902800182489385e-08, "logits/chosen": -2.4313454627990723, "logits/rejected": -2.3867390155792236, "logps/chosen": -257.27667236328125, "logps/rejected": -287.3458557128906, "loss": 0.0863, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.919940173625946, "rewards/margins": 5.269024848937988, "rewards/rejected": -4.349085330963135, "step": 1530 }, { "epoch": 1.61, "learning_rate": 5.5011006765957604e-08, "logits/chosen": -2.504760265350342, "logits/rejected": -2.463238477706909, "logps/chosen": -306.13433837890625, "logps/rejected": -312.7430419921875, "loss": 0.0982, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1153782606124878, "rewards/margins": 5.54492712020874, "rewards/rejected": -4.429548740386963, "step": 1540 }, { "epoch": 1.62, "learning_rate": 5.218433808920883e-08, "logits/chosen": -2.426063299179077, "logits/rejected": -2.4984512329101562, "logps/chosen": -288.5071105957031, "logps/rejected": -303.65179443359375, "loss": 0.0745, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.829882025718689, "rewards/margins": 5.127657890319824, "rewards/rejected": -4.297775745391846, "step": 1550 }, { "epoch": 1.63, "learning_rate": 4.942373823661927e-08, "logits/chosen": -2.5048506259918213, "logits/rejected": -2.459045648574829, "logps/chosen": -242.9965362548828, "logps/rejected": -294.4432678222656, "loss": 0.0902, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.754008948802948, "rewards/margins": 4.974704742431641, "rewards/rejected": -4.220695495605469, "step": 1560 }, { "epoch": 1.64, "learning_rate": 4.6730129226114354e-08, "logits/chosen": -2.423175811767578, "logits/rejected": -2.3990726470947266, "logps/chosen": -301.69903564453125, "logps/rejected": -307.9515075683594, "loss": 0.0956, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.829128623008728, "rewards/margins": 5.344356060028076, "rewards/rejected": -4.515227317810059, "step": 1570 }, { "epoch": 1.65, "learning_rate": 4.41044107012227e-08, "logits/chosen": -2.4997920989990234, "logits/rejected": -2.468302011489868, "logps/chosen": -253.6482696533203, "logps/rejected": -318.84637451171875, "loss": 0.078, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.847277045249939, "rewards/margins": 5.337472438812256, "rewards/rejected": -4.490196228027344, "step": 1580 }, { "epoch": 1.66, "learning_rate": 4.1547459630601966e-08, "logits/chosen": -2.449619770050049, "logits/rejected": -2.3968448638916016, "logps/chosen": -255.52041625976562, "logps/rejected": -294.1830749511719, "loss": 0.0869, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5681460499763489, "rewards/margins": 5.1623101234436035, "rewards/rejected": -4.594163417816162, "step": 1590 }, { "epoch": 1.67, "learning_rate": 3.9060130015138857e-08, "logits/chosen": -2.4341704845428467, "logits/rejected": -2.434159278869629, "logps/chosen": -259.70697021484375, "logps/rejected": -279.59356689453125, "loss": 0.098, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.5663882493972778, "rewards/margins": 5.197344779968262, "rewards/rejected": -4.630956649780273, "step": 1600 }, { "epoch": 1.67, "eval_logits/chosen": -2.50919771194458, "eval_logits/rejected": -2.4652130603790283, "eval_logps/chosen": -289.7226867675781, "eval_logps/rejected": -286.9527893066406, "eval_loss": 0.5582706332206726, "eval_rewards/accuracies": 0.7876983880996704, "eval_rewards/chosen": -0.7760666012763977, "eval_rewards/margins": 1.898013710975647, "eval_rewards/rejected": -2.6740806102752686, "eval_runtime": 226.8017, "eval_samples_per_second": 8.818, "eval_steps_per_second": 0.278, "step": 1600 }, { "epoch": 1.68, "learning_rate": 3.664325260271953e-08, "logits/chosen": -2.4864559173583984, "logits/rejected": -2.449871778488159, "logps/chosen": -276.0063781738281, "logps/rejected": -311.7190246582031, "loss": 0.077, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5649909973144531, "rewards/margins": 5.133740425109863, "rewards/rejected": -4.568748950958252, "step": 1610 }, { "epoch": 1.7, "learning_rate": 3.429763461076676e-08, "logits/chosen": -2.577227830886841, "logits/rejected": -2.46634840965271, "logps/chosen": -282.1453552246094, "logps/rejected": -284.077880859375, "loss": 0.0996, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.769231915473938, "rewards/margins": 4.944138526916504, "rewards/rejected": -4.174906253814697, "step": 1620 }, { "epoch": 1.71, "learning_rate": 3.202405945663555e-08, "logits/chosen": -2.4665629863739014, "logits/rejected": -2.4463419914245605, "logps/chosen": -280.31561279296875, "logps/rejected": -316.2984313964844, "loss": 0.0749, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8701330423355103, "rewards/margins": 5.3133392333984375, "rewards/rejected": -4.443205833435059, "step": 1630 }, { "epoch": 1.72, "learning_rate": 2.9823286495958556e-08, "logits/chosen": -2.423018217086792, "logits/rejected": -2.43149471282959, "logps/chosen": -260.8243103027344, "logps/rejected": -296.49163818359375, "loss": 0.0805, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0365338325500488, "rewards/margins": 5.361039638519287, "rewards/rejected": -4.3245062828063965, "step": 1640 }, { "epoch": 1.73, "learning_rate": 2.769605076902695e-08, "logits/chosen": -2.5106143951416016, "logits/rejected": -2.397507905960083, "logps/chosen": -304.38531494140625, "logps/rejected": -303.55804443359375, "loss": 0.0758, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.0600566864013672, "rewards/margins": 5.407637119293213, "rewards/rejected": -4.347580432891846, "step": 1650 }, { "epoch": 1.74, "learning_rate": 2.5643062755293403e-08, "logits/chosen": -2.4272847175598145, "logits/rejected": -2.438948631286621, "logps/chosen": -248.9642791748047, "logps/rejected": -284.9725036621094, "loss": 0.0765, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.9480940699577332, "rewards/margins": 5.264700889587402, "rewards/rejected": -4.3166069984436035, "step": 1660 }, { "epoch": 1.75, "learning_rate": 2.366500813607733e-08, "logits/chosen": -2.474356174468994, "logits/rejected": -2.4235892295837402, "logps/chosen": -278.48834228515625, "logps/rejected": -289.70465087890625, "loss": 0.1163, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5562735795974731, "rewards/margins": 4.8881306648254395, "rewards/rejected": -4.331856727600098, "step": 1670 }, { "epoch": 1.76, "learning_rate": 2.176254756555329e-08, "logits/chosen": -2.4385287761688232, "logits/rejected": -2.4041240215301514, "logps/chosen": -296.6006164550781, "logps/rejected": -300.4929504394531, "loss": 0.0958, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.120516061782837, "rewards/margins": 5.323664665222168, "rewards/rejected": -4.20314884185791, "step": 1680 }, { "epoch": 1.77, "learning_rate": 1.9936316450097468e-08, "logits/chosen": -2.452484369277954, "logits/rejected": -2.4421699047088623, "logps/chosen": -247.1441650390625, "logps/rejected": -289.236083984375, "loss": 0.0853, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.6771191358566284, "rewards/margins": 4.917300224304199, "rewards/rejected": -4.240180969238281, "step": 1690 }, { "epoch": 1.78, "learning_rate": 1.8186924736067477e-08, "logits/chosen": -2.445409059524536, "logits/rejected": -2.3752622604370117, "logps/chosen": -271.67242431640625, "logps/rejected": -303.90191650390625, "loss": 0.0718, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2321231365203857, "rewards/margins": 5.760961055755615, "rewards/rejected": -4.528838157653809, "step": 1700 }, { "epoch": 1.78, "eval_logits/chosen": -2.5036137104034424, "eval_logits/rejected": -2.4592182636260986, "eval_logps/chosen": -289.4940490722656, "eval_logps/rejected": -286.9160461425781, "eval_loss": 0.5593089461326599, "eval_rewards/accuracies": 0.7876983880996704, "eval_rewards/chosen": -0.7531997561454773, "eval_rewards/margins": 1.9172062873840332, "eval_rewards/rejected": -2.6704065799713135, "eval_runtime": 226.9543, "eval_samples_per_second": 8.812, "eval_steps_per_second": 0.278, "step": 1700 }, { "epoch": 1.79, "learning_rate": 1.651495670608488e-08, "logits/chosen": -2.477602243423462, "logits/rejected": -2.4117965698242188, "logps/chosen": -289.7879943847656, "logps/rejected": -337.7546081542969, "loss": 0.07, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2312443256378174, "rewards/margins": 6.023700714111328, "rewards/rejected": -4.79245662689209, "step": 1710 }, { "epoch": 1.8, "learning_rate": 1.4920970783889737e-08, "logits/chosen": -2.405435800552368, "logits/rejected": -2.430202007293701, "logps/chosen": -248.47152709960938, "logps/rejected": -292.82562255859375, "loss": 0.0894, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.799492359161377, "rewards/margins": 5.147605895996094, "rewards/rejected": -4.348113536834717, "step": 1720 }, { "epoch": 1.81, "learning_rate": 1.340549934783164e-08, "logits/chosen": -2.550469398498535, "logits/rejected": -2.466827154159546, "logps/chosen": -280.4027404785156, "logps/rejected": -333.7227783203125, "loss": 0.0688, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8694186210632324, "rewards/margins": 5.244062900543213, "rewards/rejected": -4.374643802642822, "step": 1730 }, { "epoch": 1.82, "learning_rate": 1.1969048553059608e-08, "logits/chosen": -2.557095527648926, "logits/rejected": -2.475942611694336, "logps/chosen": -257.4891052246094, "logps/rejected": -293.01568603515625, "loss": 0.0831, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.7315679788589478, "rewards/margins": 5.32357120513916, "rewards/rejected": -4.59200382232666, "step": 1740 }, { "epoch": 1.83, "learning_rate": 1.06120981624703e-08, "logits/chosen": -2.4494080543518066, "logits/rejected": -2.4143624305725098, "logps/chosen": -308.0076599121094, "logps/rejected": -289.9423828125, "loss": 0.0902, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.730781078338623, "rewards/margins": 5.309952735900879, "rewards/rejected": -4.579172134399414, "step": 1750 }, { "epoch": 1.84, "learning_rate": 9.335101386471284e-09, "logits/chosen": -2.390369415283203, "logits/rejected": -2.367241382598877, "logps/chosen": -230.51223754882812, "logps/rejected": -268.83966064453125, "loss": 0.0824, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6409198045730591, "rewards/margins": 4.989005088806152, "rewards/rejected": -4.348085403442383, "step": 1760 }, { "epoch": 1.85, "learning_rate": 8.138484731612273e-09, "logits/chosen": -2.4734368324279785, "logits/rejected": -2.4684295654296875, "logps/chosen": -285.767578125, "logps/rejected": -318.2266540527344, "loss": 0.0712, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8488200306892395, "rewards/margins": 5.032792091369629, "rewards/rejected": -4.183971881866455, "step": 1770 }, { "epoch": 1.86, "learning_rate": 7.0226478581355e-09, "logits/chosen": -2.4873061180114746, "logits/rejected": -2.4108550548553467, "logps/chosen": -266.17022705078125, "logps/rejected": -303.34539794921875, "loss": 0.0933, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8401802778244019, "rewards/margins": 5.4867119789123535, "rewards/rejected": -4.646531581878662, "step": 1780 }, { "epoch": 1.87, "learning_rate": 5.987963446492383e-09, "logits/chosen": -2.4779117107391357, "logits/rejected": -2.4737093448638916, "logps/chosen": -272.1192932128906, "logps/rejected": -284.1376037597656, "loss": 0.0672, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.734940230846405, "rewards/margins": 5.037637710571289, "rewards/rejected": -4.302697658538818, "step": 1790 }, { "epoch": 1.88, "learning_rate": 5.0347770728713935e-09, "logits/chosen": -2.459961414337158, "logits/rejected": -2.4179654121398926, "logps/chosen": -284.83447265625, "logps/rejected": -286.37811279296875, "loss": 0.0828, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7448663115501404, "rewards/margins": 4.919360160827637, "rewards/rejected": -4.174493312835693, "step": 1800 }, { "epoch": 1.88, "eval_logits/chosen": -2.5007152557373047, "eval_logits/rejected": -2.4560282230377197, "eval_logps/chosen": -289.94671630859375, "eval_logps/rejected": -287.51776123046875, "eval_loss": 0.560624361038208, "eval_rewards/accuracies": 0.7896825671195984, "eval_rewards/chosen": -0.7984652519226074, "eval_rewards/margins": 1.9321120977401733, "eval_rewards/rejected": -2.730577230453491, "eval_runtime": 226.8904, "eval_samples_per_second": 8.815, "eval_steps_per_second": 0.278, "step": 1800 }, { "epoch": 1.89, "learning_rate": 4.1634070937782424e-09, "logits/chosen": -2.4268250465393066, "logits/rejected": -2.386983871459961, "logps/chosen": -273.4813232421875, "logps/rejected": -300.4519348144531, "loss": 0.1058, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6572695970535278, "rewards/margins": 4.807135105133057, "rewards/rejected": -4.149865627288818, "step": 1810 }, { "epoch": 1.9, "learning_rate": 3.3741445397075797e-09, "logits/chosen": -2.460108518600464, "logits/rejected": -2.470428705215454, "logps/chosen": -288.3950500488281, "logps/rejected": -303.42926025390625, "loss": 0.0897, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5406145453453064, "rewards/margins": 4.988966464996338, "rewards/rejected": -4.448351860046387, "step": 1820 }, { "epoch": 1.92, "learning_rate": 2.667253017941018e-09, "logits/chosen": -2.477778673171997, "logits/rejected": -2.3743224143981934, "logps/chosen": -285.9638671875, "logps/rejected": -330.74774169921875, "loss": 0.0915, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8586639165878296, "rewards/margins": 5.589924335479736, "rewards/rejected": -4.731259822845459, "step": 1830 }, { "epoch": 1.93, "learning_rate": 2.0429686245045097e-09, "logits/chosen": -2.4486091136932373, "logits/rejected": -2.3403143882751465, "logps/chosen": -275.94903564453125, "logps/rejected": -288.6750183105469, "loss": 0.1047, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.558720052242279, "rewards/margins": 5.043437957763672, "rewards/rejected": -4.484718322753906, "step": 1840 }, { "epoch": 1.94, "learning_rate": 1.5014998653141708e-09, "logits/chosen": -2.3887779712677, "logits/rejected": -2.3415367603302, "logps/chosen": -282.76116943359375, "logps/rejected": -319.32794189453125, "loss": 0.1119, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7303184866905212, "rewards/margins": 4.939064025878906, "rewards/rejected": -4.208745002746582, "step": 1850 }, { "epoch": 1.95, "learning_rate": 1.0430275865371263e-09, "logits/chosen": -2.447024345397949, "logits/rejected": -2.4308583736419678, "logps/chosen": -271.1973876953125, "logps/rejected": -351.03411865234375, "loss": 0.0928, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.6760638952255249, "rewards/margins": 5.9671854972839355, "rewards/rejected": -5.291121482849121, "step": 1860 }, { "epoch": 1.96, "learning_rate": 6.677049141901314e-10, "logits/chosen": -2.4938368797302246, "logits/rejected": -2.4309468269348145, "logps/chosen": -270.1252746582031, "logps/rejected": -304.6353759765625, "loss": 0.0722, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7412785887718201, "rewards/margins": 5.188776969909668, "rewards/rejected": -4.4474992752075195, "step": 1870 }, { "epoch": 1.97, "learning_rate": 3.7565720299687077e-10, "logits/chosen": -2.4537744522094727, "logits/rejected": -2.4095654487609863, "logps/chosen": -279.35968017578125, "logps/rejected": -311.15313720703125, "loss": 0.0809, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6720268130302429, "rewards/margins": 5.531245231628418, "rewards/rejected": -4.859217643737793, "step": 1880 }, { "epoch": 1.98, "learning_rate": 1.6698199452053197e-10, "logits/chosen": -2.3942551612854004, "logits/rejected": -2.362003803253174, "logps/chosen": -258.2568359375, "logps/rejected": -276.70819091796875, "loss": 0.0849, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7251983284950256, "rewards/margins": 4.999472141265869, "rewards/rejected": -4.274273872375488, "step": 1890 }, { "epoch": 1.99, "learning_rate": 4.174898458556009e-11, "logits/chosen": -2.4330520629882812, "logits/rejected": -2.3871498107910156, "logps/chosen": -260.31646728515625, "logps/rejected": -306.0497131347656, "loss": 0.103, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.7053112387657166, "rewards/margins": 5.6467132568359375, "rewards/rejected": -4.941401481628418, "step": 1900 }, { "epoch": 1.99, "eval_logits/chosen": -2.5001614093780518, "eval_logits/rejected": -2.4554271697998047, "eval_logps/chosen": -289.7665710449219, "eval_logps/rejected": -287.3254699707031, "eval_loss": 0.5601376891136169, "eval_rewards/accuracies": 0.7857142686843872, "eval_rewards/chosen": -0.7804563045501709, "eval_rewards/margins": 1.9308936595916748, "eval_rewards/rejected": -2.7113499641418457, "eval_runtime": 226.8425, "eval_samples_per_second": 8.817, "eval_steps_per_second": 0.278, "step": 1900 }, { "epoch": 2.0, "learning_rate": 0.0, "logits/chosen": -2.5347604751586914, "logits/rejected": -2.473098039627075, "logps/chosen": -270.48486328125, "logps/rejected": -325.1864013671875, "loss": 0.0749, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.6818631291389465, "rewards/margins": 4.970059394836426, "rewards/rejected": -4.288196086883545, "step": 1910 }, { "epoch": 2.0, "step": 1910, "total_flos": 0.0, "train_loss": 0.31584552245614417, "train_runtime": 49042.6248, "train_samples_per_second": 2.493, "train_steps_per_second": 0.039 } ], "logging_steps": 10, "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 955, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }