{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 17412, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00017229496898690558, "grad_norm": 2.182090997695923, "learning_rate": 5.74052812858783e-11, "logits/chosen": -2.967046022415161, "logits/rejected": -2.9243061542510986, "logps/chosen": -43.99115753173828, "logps/rejected": -41.627906799316406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0017229496898690559, "grad_norm": 2.3819668292999268, "learning_rate": 5.74052812858783e-10, "logits/chosen": -3.0551116466522217, "logits/rejected": -3.025710344314575, "logps/chosen": -50.45957946777344, "logps/rejected": -49.62355041503906, "loss": 0.6929, "rewards/accuracies": 0.5208333134651184, "rewards/chosen": 6.721958925481886e-05, "rewards/margins": 0.00045209971722215414, "rewards/rejected": -0.0003848801425192505, "step": 10 }, { "epoch": 0.0034458993797381117, "grad_norm": 2.2462689876556396, "learning_rate": 1.148105625717566e-09, "logits/chosen": -3.1189348697662354, "logits/rejected": -3.1107780933380127, "logps/chosen": -52.6757926940918, "logps/rejected": -52.99198532104492, "loss": 0.6933, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.00012710278679151088, "rewards/margins": -0.0002601863816380501, "rewards/rejected": 0.00013308358029462397, "step": 20 }, { "epoch": 0.005168849069607168, "grad_norm": 2.578803300857544, "learning_rate": 1.7221584385763488e-09, "logits/chosen": -3.0917344093322754, "logits/rejected": -3.0679714679718018, "logps/chosen": -56.795806884765625, "logps/rejected": -58.4331169128418, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 3.137241492368048e-06, "rewards/margins": -3.769385875784792e-05, "rewards/rejected": 4.08310916100163e-05, "step": 30 }, { "epoch": 0.006891798759476223, "grad_norm": 2.0109100341796875, "learning_rate": 2.296211251435132e-09, "logits/chosen": -3.105067014694214, "logits/rejected": -3.0735316276550293, "logps/chosen": -55.277122497558594, "logps/rejected": -50.67603302001953, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": -4.951923619955778e-05, "rewards/margins": -0.00014242697216104716, "rewards/rejected": 9.290773596148938e-05, "step": 40 }, { "epoch": 0.00861474844934528, "grad_norm": 2.3901138305664062, "learning_rate": 2.870264064293915e-09, "logits/chosen": -3.1007907390594482, "logits/rejected": -3.0843148231506348, "logps/chosen": -53.10557174682617, "logps/rejected": -51.503448486328125, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00011030097084585577, "rewards/margins": 0.00014637041022069752, "rewards/rejected": -3.6069457564735785e-05, "step": 50 }, { "epoch": 0.010337698139214336, "grad_norm": 2.7916746139526367, "learning_rate": 3.4443168771526976e-09, "logits/chosen": -3.154005765914917, "logits/rejected": -3.1243300437927246, "logps/chosen": -57.58835983276367, "logps/rejected": -54.1540641784668, "loss": 0.6933, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -1.9247414456913248e-05, "rewards/margins": -0.00029740575700998306, "rewards/rejected": 0.0002781583461910486, "step": 60 }, { "epoch": 0.012060647829083391, "grad_norm": 2.201021194458008, "learning_rate": 4.018369690011481e-09, "logits/chosen": -3.050886392593384, "logits/rejected": -3.0308964252471924, "logps/chosen": -53.7619743347168, "logps/rejected": -53.217613220214844, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 7.266564352903515e-05, "rewards/margins": 3.217640824004775e-06, "rewards/rejected": 6.94479895173572e-05, "step": 70 }, { "epoch": 0.013783597518952447, "grad_norm": 2.4391942024230957, "learning_rate": 4.592422502870264e-09, "logits/chosen": -3.1597275733947754, "logits/rejected": -3.126314640045166, "logps/chosen": -59.0842399597168, "logps/rejected": -54.11265182495117, "loss": 0.693, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.00012190090637886897, "rewards/margins": 0.00029252038802951574, "rewards/rejected": -0.0001706194889266044, "step": 80 }, { "epoch": 0.015506547208821502, "grad_norm": 2.4720358848571777, "learning_rate": 5.166475315729047e-09, "logits/chosen": -2.9935643672943115, "logits/rejected": -2.9788944721221924, "logps/chosen": -53.482994079589844, "logps/rejected": -52.826316833496094, "loss": 0.6933, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.00011938335228478536, "rewards/margins": -0.0002771492290776223, "rewards/rejected": 0.00015776584041304886, "step": 90 }, { "epoch": 0.01722949689869056, "grad_norm": 2.486983060836792, "learning_rate": 5.74052812858783e-09, "logits/chosen": -3.1700241565704346, "logits/rejected": -3.107978343963623, "logps/chosen": -55.95051956176758, "logps/rejected": -49.64581298828125, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -5.600529766525142e-05, "rewards/margins": 0.00010592350736260414, "rewards/rejected": -0.00016192883776966482, "step": 100 }, { "epoch": 0.01722949689869056, "eval_logits/chosen": -3.1626386642456055, "eval_logits/rejected": -3.156970262527466, "eval_logps/chosen": -58.71213150024414, "eval_logps/rejected": -63.17595672607422, "eval_loss": 0.6931698322296143, "eval_rewards/accuracies": 0.4993029832839966, "eval_rewards/chosen": -2.4231537736341124e-06, "eval_rewards/margins": -4.4078675273340195e-05, "eval_rewards/rejected": 4.16555158153642e-05, "eval_runtime": 383.9296, "eval_samples_per_second": 11.21, "eval_steps_per_second": 1.401, "step": 100 }, { "epoch": 0.018952446588559616, "grad_norm": 2.5427167415618896, "learning_rate": 6.314580941446612e-09, "logits/chosen": -3.1217751502990723, "logits/rejected": -3.097882032394409, "logps/chosen": -55.59651565551758, "logps/rejected": -52.34502410888672, "loss": 0.6932, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0001945993717527017, "rewards/margins": -5.41870467714034e-05, "rewards/rejected": -0.0001404123322572559, "step": 110 }, { "epoch": 0.02067539627842867, "grad_norm": 2.5627520084381104, "learning_rate": 6.888633754305395e-09, "logits/chosen": -3.065497398376465, "logits/rejected": -3.050015687942505, "logps/chosen": -53.18254470825195, "logps/rejected": -55.58748245239258, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -4.249760422680993e-06, "rewards/margins": 0.0001491528091719374, "rewards/rejected": -0.0001534025650471449, "step": 120 }, { "epoch": 0.022398345968297727, "grad_norm": 2.136780023574829, "learning_rate": 7.462686567164179e-09, "logits/chosen": -3.1010513305664062, "logits/rejected": -3.0869858264923096, "logps/chosen": -55.178199768066406, "logps/rejected": -53.753334045410156, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0001543060498079285, "rewards/margins": 8.332518336828798e-05, "rewards/rejected": 7.09808518877253e-05, "step": 130 }, { "epoch": 0.024121295658166782, "grad_norm": 2.4314582347869873, "learning_rate": 8.036739380022962e-09, "logits/chosen": -3.1228363513946533, "logits/rejected": -3.1045682430267334, "logps/chosen": -54.168556213378906, "logps/rejected": -53.76570510864258, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00017434923211112618, "rewards/margins": 0.00025869678938761353, "rewards/rejected": -8.434753544861451e-05, "step": 140 }, { "epoch": 0.025844245348035838, "grad_norm": 2.2146575450897217, "learning_rate": 8.610792192881745e-09, "logits/chosen": -3.0275564193725586, "logits/rejected": -3.0097458362579346, "logps/chosen": -52.61577224731445, "logps/rejected": -52.40879440307617, "loss": 0.6931, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0001102016176446341, "rewards/margins": 0.0001768237998476252, "rewards/rejected": -6.662221130682155e-05, "step": 150 }, { "epoch": 0.027567195037904894, "grad_norm": 2.15348219871521, "learning_rate": 9.184845005740529e-09, "logits/chosen": -3.0887250900268555, "logits/rejected": -3.0679216384887695, "logps/chosen": -53.48912811279297, "logps/rejected": -54.716888427734375, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8873213775805198e-05, "rewards/margins": 0.00012213035370223224, "rewards/rejected": -0.00014100356202106923, "step": 160 }, { "epoch": 0.02929014472777395, "grad_norm": 2.3527538776397705, "learning_rate": 9.758897818599312e-09, "logits/chosen": -3.076545238494873, "logits/rejected": -3.056947946548462, "logps/chosen": -56.27220916748047, "logps/rejected": -51.344764709472656, "loss": 0.693, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.00012392585631459951, "rewards/margins": 0.00035758252488449216, "rewards/rejected": -0.0002336566394660622, "step": 170 }, { "epoch": 0.031013094417643005, "grad_norm": 2.6023190021514893, "learning_rate": 1.0332950631458094e-08, "logits/chosen": -3.0624301433563232, "logits/rejected": -3.0436959266662598, "logps/chosen": -56.41130447387695, "logps/rejected": -53.7844123840332, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -5.6476717873010784e-05, "rewards/margins": -2.0783279978786595e-05, "rewards/rejected": -3.569346881704405e-05, "step": 180 }, { "epoch": 0.03273604410751206, "grad_norm": 2.6346166133880615, "learning_rate": 1.0907003444316877e-08, "logits/chosen": -3.1240460872650146, "logits/rejected": -3.0806965827941895, "logps/chosen": -58.17539596557617, "logps/rejected": -52.551292419433594, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": 8.138192060869187e-05, "rewards/margins": 0.00021620994084514678, "rewards/rejected": -0.00013482803478837013, "step": 190 }, { "epoch": 0.03445899379738112, "grad_norm": 2.5793368816375732, "learning_rate": 1.148105625717566e-08, "logits/chosen": -3.0594937801361084, "logits/rejected": -3.043846607208252, "logps/chosen": -54.11907958984375, "logps/rejected": -54.717613220214844, "loss": 0.6932, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 3.6291618016548455e-05, "rewards/margins": -9.856436918198597e-06, "rewards/rejected": 4.614806312019937e-05, "step": 200 }, { "epoch": 0.03445899379738112, "eval_logits/chosen": -3.163424015045166, "eval_logits/rejected": -3.157769203186035, "eval_logps/chosen": -58.71607208251953, "eval_logps/rejected": -63.17774963378906, "eval_loss": 0.6931805610656738, "eval_rewards/accuracies": 0.4902416467666626, "eval_rewards/chosen": -4.17379051214084e-05, "eval_rewards/margins": -6.551075057359412e-05, "eval_rewards/rejected": 2.3772852728143334e-05, "eval_runtime": 383.8004, "eval_samples_per_second": 11.214, "eval_steps_per_second": 1.402, "step": 200 }, { "epoch": 0.03618194348725017, "grad_norm": 2.2869973182678223, "learning_rate": 1.2055109070034444e-08, "logits/chosen": -3.014346122741699, "logits/rejected": -3.005690097808838, "logps/chosen": -53.27101516723633, "logps/rejected": -57.28047561645508, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -6.969284731894732e-05, "rewards/margins": -0.00016991101438179612, "rewards/rejected": 0.00010021818889072165, "step": 210 }, { "epoch": 0.03790489317711923, "grad_norm": 2.3325183391571045, "learning_rate": 1.2629161882893224e-08, "logits/chosen": -3.050661325454712, "logits/rejected": -3.0193305015563965, "logps/chosen": -52.20319747924805, "logps/rejected": -51.32220458984375, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -3.724562702700496e-05, "rewards/margins": 2.309179944859352e-05, "rewards/rejected": -6.033741374267265e-05, "step": 220 }, { "epoch": 0.03962784286698828, "grad_norm": 2.3984899520874023, "learning_rate": 1.3203214695752007e-08, "logits/chosen": -3.0510871410369873, "logits/rejected": -3.0328636169433594, "logps/chosen": -48.91734313964844, "logps/rejected": -49.937564849853516, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -7.72247658460401e-05, "rewards/margins": -0.00014934124192222953, "rewards/rejected": 7.211649790406227e-05, "step": 230 }, { "epoch": 0.04135079255685734, "grad_norm": 2.250880241394043, "learning_rate": 1.377726750861079e-08, "logits/chosen": -3.0246238708496094, "logits/rejected": -2.9821224212646484, "logps/chosen": -55.9506950378418, "logps/rejected": -52.1674919128418, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 3.468979775789194e-05, "rewards/margins": 0.00016330341168213636, "rewards/rejected": -0.00012861359573435038, "step": 240 }, { "epoch": 0.043073742246726394, "grad_norm": 2.3098740577697754, "learning_rate": 1.4351320321469574e-08, "logits/chosen": -3.118074893951416, "logits/rejected": -3.0976781845092773, "logps/chosen": -52.27685546875, "logps/rejected": -51.094974517822266, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 3.2288546208292246e-05, "rewards/margins": 5.936500201642048e-06, "rewards/rejected": 2.6352028726250865e-05, "step": 250 }, { "epoch": 0.044796691936595454, "grad_norm": 2.3118174076080322, "learning_rate": 1.4925373134328357e-08, "logits/chosen": -3.0943145751953125, "logits/rejected": -3.082043170928955, "logps/chosen": -54.84590530395508, "logps/rejected": -56.64457321166992, "loss": 0.6929, "rewards/accuracies": 0.59375, "rewards/chosen": 0.00023266393691301346, "rewards/margins": 0.0005058067617937922, "rewards/rejected": -0.0002731427666731179, "step": 260 }, { "epoch": 0.046519641626464506, "grad_norm": 2.2162578105926514, "learning_rate": 1.549942594718714e-08, "logits/chosen": -3.0333669185638428, "logits/rejected": -3.0151615142822266, "logps/chosen": -53.1147346496582, "logps/rejected": -54.3088264465332, "loss": 0.6931, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 4.805589560419321e-05, "rewards/margins": 8.423054532613605e-05, "rewards/rejected": -3.6174646083964035e-05, "step": 270 }, { "epoch": 0.048242591316333565, "grad_norm": 2.433640956878662, "learning_rate": 1.6073478760045924e-08, "logits/chosen": -3.124502658843994, "logits/rejected": -3.0904550552368164, "logps/chosen": -57.6026496887207, "logps/rejected": -53.421844482421875, "loss": 0.6931, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.00010524224489927292, "rewards/margins": 0.00017913521151058376, "rewards/rejected": -7.389295933535323e-05, "step": 280 }, { "epoch": 0.04996554100620262, "grad_norm": 2.2455179691314697, "learning_rate": 1.6647531572904707e-08, "logits/chosen": -3.046982526779175, "logits/rejected": -3.032755136489868, "logps/chosen": -55.389923095703125, "logps/rejected": -54.29637908935547, "loss": 0.6931, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.00013479938206728548, "rewards/margins": 1.1545093911990989e-05, "rewards/rejected": -0.00014634447870776057, "step": 290 }, { "epoch": 0.051688490696071676, "grad_norm": 2.358902931213379, "learning_rate": 1.722158438576349e-08, "logits/chosen": -3.0027740001678467, "logits/rejected": -2.994077205657959, "logps/chosen": -52.852867126464844, "logps/rejected": -53.93471145629883, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00010507287515792996, "rewards/margins": -0.00019952570437453687, "rewards/rejected": 9.445285104447976e-05, "step": 300 }, { "epoch": 0.051688490696071676, "eval_logits/chosen": -3.1633002758026123, "eval_logits/rejected": -3.157639265060425, "eval_logps/chosen": -58.70552062988281, "eval_logps/rejected": -63.168418884277344, "eval_loss": 0.6931744813919067, "eval_rewards/accuracies": 0.4846654236316681, "eval_rewards/chosen": 6.376942474162206e-05, "eval_rewards/margins": -5.330716885509901e-05, "eval_rewards/rejected": 0.0001170766117866151, "eval_runtime": 383.8868, "eval_samples_per_second": 11.212, "eval_steps_per_second": 1.401, "step": 300 }, { "epoch": 0.05341144038594073, "grad_norm": 2.4745993614196777, "learning_rate": 1.7795637198622274e-08, "logits/chosen": -3.0654685497283936, "logits/rejected": -3.0599312782287598, "logps/chosen": -53.512794494628906, "logps/rejected": -53.305992126464844, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 6.606340321013704e-05, "rewards/margins": 0.0001727335329633206, "rewards/rejected": -0.00010667012247722596, "step": 310 }, { "epoch": 0.05513439007580979, "grad_norm": 2.3377511501312256, "learning_rate": 1.8369690011481057e-08, "logits/chosen": -3.023184299468994, "logits/rejected": -2.996903896331787, "logps/chosen": -54.52845001220703, "logps/rejected": -49.25590515136719, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.00020448882423806936, "rewards/margins": -4.727922714664601e-05, "rewards/rejected": -0.00015720954979769886, "step": 320 }, { "epoch": 0.05685733976567884, "grad_norm": 2.3400802612304688, "learning_rate": 1.894374282433984e-08, "logits/chosen": -3.0834453105926514, "logits/rejected": -3.0598013401031494, "logps/chosen": -55.03553009033203, "logps/rejected": -52.22095489501953, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -2.3972297640284523e-05, "rewards/margins": 8.265667565865442e-05, "rewards/rejected": -0.00010662900604074821, "step": 330 }, { "epoch": 0.0585802894555479, "grad_norm": 2.1597745418548584, "learning_rate": 1.9517795637198624e-08, "logits/chosen": -3.0051636695861816, "logits/rejected": -2.983678102493286, "logps/chosen": -52.53996658325195, "logps/rejected": -51.94036102294922, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.00013412107364274561, "rewards/margins": -8.569478814024478e-05, "rewards/rejected": -4.842626367462799e-05, "step": 340 }, { "epoch": 0.06030323914541695, "grad_norm": 2.32051420211792, "learning_rate": 2.0091848450057404e-08, "logits/chosen": -2.9775118827819824, "logits/rejected": -2.938002109527588, "logps/chosen": -56.234169006347656, "logps/rejected": -53.5710563659668, "loss": 0.6932, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.00024700857466086745, "rewards/margins": -0.0001019686897052452, "rewards/rejected": -0.0001450398558517918, "step": 350 }, { "epoch": 0.06202618883528601, "grad_norm": 2.402641534805298, "learning_rate": 2.0665901262916187e-08, "logits/chosen": -3.12839937210083, "logits/rejected": -3.1054434776306152, "logps/chosen": -54.56840133666992, "logps/rejected": -50.51968002319336, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.00019098764460068196, "rewards/margins": 0.0002470739127602428, "rewards/rejected": -5.608624633168802e-05, "step": 360 }, { "epoch": 0.06374913852515507, "grad_norm": 2.324981927871704, "learning_rate": 2.123995407577497e-08, "logits/chosen": -3.1034820079803467, "logits/rejected": -3.0743255615234375, "logps/chosen": -52.3858642578125, "logps/rejected": -51.36119842529297, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -6.981555088714231e-06, "rewards/margins": 0.00019496475579217076, "rewards/rejected": -0.0002019463136093691, "step": 370 }, { "epoch": 0.06547208821502412, "grad_norm": 2.0785439014434814, "learning_rate": 2.1814006888633754e-08, "logits/chosen": -3.203939914703369, "logits/rejected": -3.179131269454956, "logps/chosen": -53.51581573486328, "logps/rejected": -52.22514724731445, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.00012094577687093988, "rewards/margins": 0.0002246426447527483, "rewards/rejected": -0.0003455884288996458, "step": 380 }, { "epoch": 0.06719503790489317, "grad_norm": 2.406686305999756, "learning_rate": 2.2388059701492537e-08, "logits/chosen": -3.100468158721924, "logits/rejected": -3.0749452114105225, "logps/chosen": -56.07271194458008, "logps/rejected": -55.2164306640625, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -7.178229134296998e-05, "rewards/margins": 1.8414790247334167e-05, "rewards/rejected": -9.019709250424057e-05, "step": 390 }, { "epoch": 0.06891798759476224, "grad_norm": 2.0910956859588623, "learning_rate": 2.296211251435132e-08, "logits/chosen": -3.070652484893799, "logits/rejected": -3.0547804832458496, "logps/chosen": -52.63219451904297, "logps/rejected": -52.75767135620117, "loss": 0.6932, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 7.015008304733783e-05, "rewards/margins": -2.7948093702434562e-05, "rewards/rejected": 9.809816401684657e-05, "step": 400 }, { "epoch": 0.06891798759476224, "eval_logits/chosen": -3.1631181240081787, "eval_logits/rejected": -3.157465696334839, "eval_logps/chosen": -58.70676803588867, "eval_logps/rejected": -63.165775299072266, "eval_loss": 0.6931939125061035, "eval_rewards/accuracies": 0.4814126491546631, "eval_rewards/chosen": 5.1321490900591016e-05, "eval_rewards/margins": -9.216690523317084e-05, "eval_rewards/rejected": 0.00014348838885780424, "eval_runtime": 383.9179, "eval_samples_per_second": 11.211, "eval_steps_per_second": 1.401, "step": 400 }, { "epoch": 0.07064093728463129, "grad_norm": 2.1548616886138916, "learning_rate": 2.3536165327210104e-08, "logits/chosen": -3.0745739936828613, "logits/rejected": -3.070368528366089, "logps/chosen": -50.77008819580078, "logps/rejected": -55.54046630859375, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.00024596654111519456, "rewards/margins": -0.00012159270409028977, "rewards/rejected": -0.0001243737933691591, "step": 410 }, { "epoch": 0.07236388697450034, "grad_norm": 2.5353214740753174, "learning_rate": 2.4110218140068887e-08, "logits/chosen": -3.0599067211151123, "logits/rejected": -3.052058696746826, "logps/chosen": -54.18812942504883, "logps/rejected": -53.90911102294922, "loss": 0.6933, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.00022149831056594849, "rewards/margins": -0.00026814654120244086, "rewards/rejected": 4.66481942567043e-05, "step": 420 }, { "epoch": 0.0740868366643694, "grad_norm": 2.2444565296173096, "learning_rate": 2.4684270952927668e-08, "logits/chosen": -3.088376760482788, "logits/rejected": -3.0746445655822754, "logps/chosen": -53.08332443237305, "logps/rejected": -54.16059112548828, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 7.78869361965917e-05, "rewards/margins": 5.263996354187839e-05, "rewards/rejected": 2.5246979930670932e-05, "step": 430 }, { "epoch": 0.07580978635423846, "grad_norm": 2.5102503299713135, "learning_rate": 2.5258323765786448e-08, "logits/chosen": -3.133507490158081, "logits/rejected": -3.098339557647705, "logps/chosen": -54.316444396972656, "logps/rejected": -53.2856330871582, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00011689448729157448, "rewards/margins": 0.0002782618976198137, "rewards/rejected": -0.00039515638491138816, "step": 440 }, { "epoch": 0.07753273604410751, "grad_norm": 2.2812373638153076, "learning_rate": 2.583237657864523e-08, "logits/chosen": -3.0558762550354004, "logits/rejected": -3.0234310626983643, "logps/chosen": -56.092933654785156, "logps/rejected": -54.60505294799805, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 2.950901398435235e-05, "rewards/margins": 6.448172644013539e-05, "rewards/rejected": -3.4972716093761846e-05, "step": 450 }, { "epoch": 0.07925568573397657, "grad_norm": 2.4034018516540527, "learning_rate": 2.6406429391504014e-08, "logits/chosen": -3.0245532989501953, "logits/rejected": -3.0048906803131104, "logps/chosen": -56.2021484375, "logps/rejected": -53.02021408081055, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.834456270444207e-05, "rewards/margins": 0.00031032264814712107, "rewards/rejected": -0.00032866717083379626, "step": 460 }, { "epoch": 0.08097863542384562, "grad_norm": 2.202767848968506, "learning_rate": 2.6980482204362798e-08, "logits/chosen": -3.0473744869232178, "logits/rejected": -3.0157155990600586, "logps/chosen": -53.29618453979492, "logps/rejected": -51.4212760925293, "loss": 0.6932, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.00030593553674407303, "rewards/margins": -8.712053386261687e-05, "rewards/rejected": -0.000218815024709329, "step": 470 }, { "epoch": 0.08270158511371468, "grad_norm": 2.4477829933166504, "learning_rate": 2.755453501722158e-08, "logits/chosen": -3.045158863067627, "logits/rejected": -3.0398292541503906, "logps/chosen": -54.22154998779297, "logps/rejected": -58.98264694213867, "loss": 0.6929, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0001763901673257351, "rewards/margins": 0.00040636182529851794, "rewards/rejected": -0.0002299716288689524, "step": 480 }, { "epoch": 0.08442453480358374, "grad_norm": 2.4757072925567627, "learning_rate": 2.8128587830080364e-08, "logits/chosen": -2.9551305770874023, "logits/rejected": -2.90541934967041, "logps/chosen": -60.60126876831055, "logps/rejected": -51.388771057128906, "loss": 0.6928, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.00014238920994102955, "rewards/margins": 0.0006421349244192243, "rewards/rejected": -0.0004997456562705338, "step": 490 }, { "epoch": 0.08614748449345279, "grad_norm": 2.2610628604888916, "learning_rate": 2.8702640642939148e-08, "logits/chosen": -3.017432689666748, "logits/rejected": -2.988736629486084, "logps/chosen": -54.987876892089844, "logps/rejected": -51.629493713378906, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0003800485283136368, "rewards/margins": 2.8218981242389418e-05, "rewards/rejected": -0.0004082675150129944, "step": 500 }, { "epoch": 0.08614748449345279, "eval_logits/chosen": -3.163292407989502, "eval_logits/rejected": -3.1576731204986572, "eval_logps/chosen": -58.70521545410156, "eval_logps/rejected": -63.1715202331543, "eval_loss": 0.693157434463501, "eval_rewards/accuracies": 0.4846654236316681, "eval_rewards/chosen": 6.682676757918671e-05, "eval_rewards/margins": -1.9236606021877378e-05, "eval_rewards/rejected": 8.606336632510647e-05, "eval_runtime": 384.1088, "eval_samples_per_second": 11.205, "eval_steps_per_second": 1.401, "step": 500 }, { "epoch": 0.08787043418332184, "grad_norm": 2.236431837081909, "learning_rate": 2.927669345579793e-08, "logits/chosen": -3.009709119796753, "logits/rejected": -2.98846435546875, "logps/chosen": -58.28581619262695, "logps/rejected": -52.01591110229492, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.00010872525308514014, "rewards/margins": 0.00010300347639713436, "rewards/rejected": -0.00021172869310248643, "step": 510 }, { "epoch": 0.08959338387319091, "grad_norm": 2.0647053718566895, "learning_rate": 2.9850746268656714e-08, "logits/chosen": -3.0577194690704346, "logits/rejected": -3.0321173667907715, "logps/chosen": -56.4144401550293, "logps/rejected": -51.67496871948242, "loss": 0.693, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -7.5087184086442e-05, "rewards/margins": 0.0003751892945729196, "rewards/rejected": -0.0004502764786593616, "step": 520 }, { "epoch": 0.09131633356305996, "grad_norm": 2.065141201019287, "learning_rate": 3.0424799081515494e-08, "logits/chosen": -3.054394245147705, "logits/rejected": -3.0125017166137695, "logps/chosen": -55.73079299926758, "logps/rejected": -51.234642028808594, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0003116551670245826, "rewards/margins": 0.0001754478143993765, "rewards/rejected": -0.0004871029523201287, "step": 530 }, { "epoch": 0.09303928325292901, "grad_norm": 2.235670804977417, "learning_rate": 3.099885189437428e-08, "logits/chosen": -3.040396213531494, "logits/rejected": -3.0238919258117676, "logps/chosen": -52.867393493652344, "logps/rejected": -52.96399688720703, "loss": 0.693, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.00015139202878344804, "rewards/margins": 0.0003112528065685183, "rewards/rejected": -0.0004626448208000511, "step": 540 }, { "epoch": 0.09476223294279806, "grad_norm": 2.265388250350952, "learning_rate": 3.157290470723307e-08, "logits/chosen": -3.1010537147521973, "logits/rejected": -3.08301043510437, "logps/chosen": -53.58502960205078, "logps/rejected": -52.021484375, "loss": 0.6929, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00011169396748300642, "rewards/margins": 0.00047532361350022256, "rewards/rejected": -0.0005870176246389747, "step": 550 }, { "epoch": 0.09648518263266713, "grad_norm": 2.5687334537506104, "learning_rate": 3.214695752009185e-08, "logits/chosen": -3.0724689960479736, "logits/rejected": -3.0644662380218506, "logps/chosen": -52.136314392089844, "logps/rejected": -55.01055908203125, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0002773841260932386, "rewards/margins": 0.0001261424768017605, "rewards/rejected": -0.0004035266174469143, "step": 560 }, { "epoch": 0.09820813232253618, "grad_norm": 2.243095874786377, "learning_rate": 3.2721010332950634e-08, "logits/chosen": -3.0449230670928955, "logits/rejected": -3.0362496376037598, "logps/chosen": -51.387664794921875, "logps/rejected": -53.87470245361328, "loss": 0.693, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.00025503215147182345, "rewards/margins": 0.00027157997828908265, "rewards/rejected": -0.0005266121588647366, "step": 570 }, { "epoch": 0.09993108201240523, "grad_norm": 1.7692292928695679, "learning_rate": 3.3295063145809414e-08, "logits/chosen": -3.0518791675567627, "logits/rejected": -3.0461270809173584, "logps/chosen": -51.13282775878906, "logps/rejected": -53.26726531982422, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.00024760173982940614, "rewards/margins": 6.3309444158221595e-06, "rewards/rejected": -0.0002539327251724899, "step": 580 }, { "epoch": 0.1016540317022743, "grad_norm": 2.0674262046813965, "learning_rate": 3.38691159586682e-08, "logits/chosen": -3.0508594512939453, "logits/rejected": -3.029005527496338, "logps/chosen": -54.90104293823242, "logps/rejected": -54.854095458984375, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0002211978135164827, "rewards/margins": 0.00045788512215949595, "rewards/rejected": -0.0006790829938836396, "step": 590 }, { "epoch": 0.10337698139214335, "grad_norm": 2.3277156352996826, "learning_rate": 3.444316877152698e-08, "logits/chosen": -3.0239500999450684, "logits/rejected": -2.999753713607788, "logps/chosen": -53.89618682861328, "logps/rejected": -56.734901428222656, "loss": 0.6929, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00012435871758498251, "rewards/margins": 0.0005410046433098614, "rewards/rejected": -0.0006653633899986744, "step": 600 }, { "epoch": 0.10337698139214335, "eval_logits/chosen": -3.162782907485962, "eval_logits/rejected": -3.1571431159973145, "eval_logps/chosen": -58.687644958496094, "eval_logps/rejected": -63.15595245361328, "eval_loss": 0.6931475400924683, "eval_rewards/accuracies": 0.5037174820899963, "eval_rewards/chosen": 0.00024248022236861289, "eval_rewards/margins": 7.474434937648766e-07, "eval_rewards/rejected": 0.0002417328068986535, "eval_runtime": 384.402, "eval_samples_per_second": 11.197, "eval_steps_per_second": 1.4, "step": 600 }, { "epoch": 0.1050999310820124, "grad_norm": 2.2252919673919678, "learning_rate": 3.501722158438576e-08, "logits/chosen": -2.989957332611084, "logits/rejected": -2.9874954223632812, "logps/chosen": -52.57634353637695, "logps/rejected": -53.302146911621094, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.000117315306852106, "rewards/margins": 0.0002942693536169827, "rewards/rejected": -0.00041158468229696155, "step": 610 }, { "epoch": 0.10682288077188146, "grad_norm": 2.3678600788116455, "learning_rate": 3.559127439724455e-08, "logits/chosen": -3.1490225791931152, "logits/rejected": -3.1223597526550293, "logps/chosen": -55.03691482543945, "logps/rejected": -53.308204650878906, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": -0.00017901579849421978, "rewards/margins": 0.00023397724726237357, "rewards/rejected": -0.00041299304575659335, "step": 620 }, { "epoch": 0.10854583046175052, "grad_norm": 2.4759716987609863, "learning_rate": 3.616532721010333e-08, "logits/chosen": -3.132228374481201, "logits/rejected": -3.105452060699463, "logps/chosen": -53.894996643066406, "logps/rejected": -50.92278289794922, "loss": 0.693, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.00030189091921783984, "rewards/margins": 0.0003871010267175734, "rewards/rejected": -0.0006889918586239219, "step": 630 }, { "epoch": 0.11026878015161957, "grad_norm": 2.4740631580352783, "learning_rate": 3.6739380022962115e-08, "logits/chosen": -3.1005430221557617, "logits/rejected": -3.0895869731903076, "logps/chosen": -52.950958251953125, "logps/rejected": -54.390403747558594, "loss": 0.693, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00038868002593517303, "rewards/margins": 0.0002507609606254846, "rewards/rejected": -0.0006394410156644881, "step": 640 }, { "epoch": 0.11199172984148863, "grad_norm": 2.6813840866088867, "learning_rate": 3.7313432835820895e-08, "logits/chosen": -3.1131179332733154, "logits/rejected": -3.114495038986206, "logps/chosen": -51.519874572753906, "logps/rejected": -54.8907585144043, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0004498251364566386, "rewards/margins": 0.0002433933550491929, "rewards/rejected": -0.0006932184332981706, "step": 650 }, { "epoch": 0.11371467953135768, "grad_norm": 2.233092784881592, "learning_rate": 3.788748564867968e-08, "logits/chosen": -3.0032222270965576, "logits/rejected": -2.9970309734344482, "logps/chosen": -54.689292907714844, "logps/rejected": -52.32310104370117, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0006124047795310616, "rewards/margins": -1.6964500900940038e-05, "rewards/rejected": -0.0005954402731731534, "step": 660 }, { "epoch": 0.11543762922122675, "grad_norm": 2.2063403129577637, "learning_rate": 3.846153846153846e-08, "logits/chosen": -3.0260491371154785, "logits/rejected": -3.0212159156799316, "logps/chosen": -53.04901123046875, "logps/rejected": -57.56760787963867, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.00043413686216808856, "rewards/margins": 3.587744868127629e-05, "rewards/rejected": -0.0004700143472291529, "step": 670 }, { "epoch": 0.1171605789110958, "grad_norm": 2.3225157260894775, "learning_rate": 3.903559127439725e-08, "logits/chosen": -2.977725028991699, "logits/rejected": -2.95261549949646, "logps/chosen": -53.96739959716797, "logps/rejected": -50.952613830566406, "loss": 0.6929, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00034176494227722287, "rewards/margins": 0.0005306880921125412, "rewards/rejected": -0.000872453092597425, "step": 680 }, { "epoch": 0.11888352860096485, "grad_norm": 2.5883471965789795, "learning_rate": 3.960964408725603e-08, "logits/chosen": -3.1264681816101074, "logits/rejected": -3.0965752601623535, "logps/chosen": -59.121620178222656, "logps/rejected": -50.600284576416016, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0003911827807314694, "rewards/margins": 0.0001936770131578669, "rewards/rejected": -0.0005848597502335906, "step": 690 }, { "epoch": 0.1206064782908339, "grad_norm": 2.1948463916778564, "learning_rate": 4.018369690011481e-08, "logits/chosen": -3.0841174125671387, "logits/rejected": -3.0557429790496826, "logps/chosen": -55.774620056152344, "logps/rejected": -53.050376892089844, "loss": 0.693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00046374136582016945, "rewards/margins": 0.00023272904218174517, "rewards/rejected": -0.0006964703206904233, "step": 700 }, { "epoch": 0.1206064782908339, "eval_logits/chosen": -3.1619224548339844, "eval_logits/rejected": -3.156249761581421, "eval_logps/chosen": -58.68220901489258, "eval_logps/rejected": -63.166046142578125, "eval_loss": 0.6930699944496155, "eval_rewards/accuracies": 0.5213754773139954, "eval_rewards/chosen": 0.00029685665504075587, "eval_rewards/margins": 0.0001560932578286156, "eval_rewards/rejected": 0.00014076339721214026, "eval_runtime": 384.2368, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 700 }, { "epoch": 0.12232942798070297, "grad_norm": 2.271681785583496, "learning_rate": 4.0757749712973595e-08, "logits/chosen": -3.0626578330993652, "logits/rejected": -3.0339443683624268, "logps/chosen": -54.66240692138672, "logps/rejected": -54.78789520263672, "loss": 0.6928, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00013712870713789016, "rewards/margins": 0.0007613231427967548, "rewards/rejected": -0.0008984518935903907, "step": 710 }, { "epoch": 0.12405237767057202, "grad_norm": 2.499124765396118, "learning_rate": 4.1331802525832375e-08, "logits/chosen": -3.024477481842041, "logits/rejected": -3.0201916694641113, "logps/chosen": -53.375587463378906, "logps/rejected": -54.474205017089844, "loss": 0.6931, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0005847887368872762, "rewards/margins": 0.00018051665392704308, "rewards/rejected": -0.0007653054199181497, "step": 720 }, { "epoch": 0.12577532736044109, "grad_norm": 2.374276638031006, "learning_rate": 4.190585533869116e-08, "logits/chosen": -3.141143321990967, "logits/rejected": -3.116030693054199, "logps/chosen": -56.386009216308594, "logps/rejected": -52.64072799682617, "loss": 0.6928, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.00053749093785882, "rewards/margins": 0.0006588941905647516, "rewards/rejected": -0.0011963851284235716, "step": 730 }, { "epoch": 0.12749827705031014, "grad_norm": 2.216444969177246, "learning_rate": 4.247990815154994e-08, "logits/chosen": -3.023132562637329, "logits/rejected": -2.9973642826080322, "logps/chosen": -54.94916534423828, "logps/rejected": -53.81367111206055, "loss": 0.6927, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0003230666625313461, "rewards/margins": 0.000822404574137181, "rewards/rejected": -0.0011454712366685271, "step": 740 }, { "epoch": 0.1292212267401792, "grad_norm": 2.4230763912200928, "learning_rate": 4.305396096440873e-08, "logits/chosen": -3.196871519088745, "logits/rejected": -3.168738603591919, "logps/chosen": -56.03461456298828, "logps/rejected": -54.13713455200195, "loss": 0.6926, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00022107257973402739, "rewards/margins": 0.0010960177751258016, "rewards/rejected": -0.0013170904712751508, "step": 750 }, { "epoch": 0.13094417643004824, "grad_norm": 2.5480167865753174, "learning_rate": 4.362801377726751e-08, "logits/chosen": -3.0470354557037354, "logits/rejected": -3.0082664489746094, "logps/chosen": -54.47247314453125, "logps/rejected": -49.642242431640625, "loss": 0.6927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00036018434911966324, "rewards/margins": 0.0008850126760080457, "rewards/rejected": -0.001245196908712387, "step": 760 }, { "epoch": 0.1326671261199173, "grad_norm": 2.0964155197143555, "learning_rate": 4.420206659012629e-08, "logits/chosen": -3.0978498458862305, "logits/rejected": -3.0744853019714355, "logps/chosen": -52.949180603027344, "logps/rejected": -52.302772521972656, "loss": 0.6928, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0002988922060467303, "rewards/margins": 0.0006420908612199128, "rewards/rejected": -0.0009409830090589821, "step": 770 }, { "epoch": 0.13439007580978635, "grad_norm": 2.6657774448394775, "learning_rate": 4.4776119402985075e-08, "logits/chosen": -3.09268856048584, "logits/rejected": -3.0615758895874023, "logps/chosen": -53.21660232543945, "logps/rejected": -51.31987380981445, "loss": 0.6928, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0007662334246560931, "rewards/margins": 0.0007213911740109324, "rewards/rejected": -0.0014876247150823474, "step": 780 }, { "epoch": 0.1361130254996554, "grad_norm": 2.2570981979370117, "learning_rate": 4.5350172215843855e-08, "logits/chosen": -3.1002702713012695, "logits/rejected": -3.06695294380188, "logps/chosen": -53.959014892578125, "logps/rejected": -53.691558837890625, "loss": 0.6928, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0005236599827185273, "rewards/margins": 0.0007057437906041741, "rewards/rejected": -0.0012294035404920578, "step": 790 }, { "epoch": 0.13783597518952448, "grad_norm": 2.698899507522583, "learning_rate": 4.592422502870264e-08, "logits/chosen": -2.9863438606262207, "logits/rejected": -2.9603400230407715, "logps/chosen": -55.07085418701172, "logps/rejected": -54.56496047973633, "loss": 0.6927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0007329906220547855, "rewards/margins": 0.0009640827775001526, "rewards/rejected": -0.0016970733413472772, "step": 800 }, { "epoch": 0.13783597518952448, "eval_logits/chosen": -3.1617555618286133, "eval_logits/rejected": -3.1561031341552734, "eval_logps/chosen": -58.64912414550781, "eval_logps/rejected": -63.132232666015625, "eval_loss": 0.6930738091468811, "eval_rewards/accuracies": 0.5204461216926575, "eval_rewards/chosen": 0.0006277780630625784, "eval_rewards/margins": 0.00014891859609633684, "eval_rewards/rejected": 0.00047885943786241114, "eval_runtime": 383.9375, "eval_samples_per_second": 11.21, "eval_steps_per_second": 1.401, "step": 800 }, { "epoch": 0.13955892487939353, "grad_norm": 2.244809865951538, "learning_rate": 4.649827784156142e-08, "logits/chosen": -3.060981035232544, "logits/rejected": -3.032869338989258, "logps/chosen": -56.69407272338867, "logps/rejected": -55.69977951049805, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0005667932564392686, "rewards/margins": 0.0007999593508429825, "rewards/rejected": -0.0013667525490745902, "step": 810 }, { "epoch": 0.14128187456926258, "grad_norm": 2.1597232818603516, "learning_rate": 4.707233065442021e-08, "logits/chosen": -3.1103479862213135, "logits/rejected": -3.084733724594116, "logps/chosen": -51.6510009765625, "logps/rejected": -50.874507904052734, "loss": 0.6929, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0010331816738471389, "rewards/margins": 0.00044258739217184484, "rewards/rejected": -0.0014757690951228142, "step": 820 }, { "epoch": 0.14300482425913164, "grad_norm": 2.4357376098632812, "learning_rate": 4.764638346727899e-08, "logits/chosen": -3.0383548736572266, "logits/rejected": -3.0229320526123047, "logps/chosen": -54.6389274597168, "logps/rejected": -54.20611572265625, "loss": 0.6923, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0005912609631195664, "rewards/margins": 0.0017076656222343445, "rewards/rejected": -0.002298926468938589, "step": 830 }, { "epoch": 0.1447277739490007, "grad_norm": 2.5532875061035156, "learning_rate": 4.8220436280137775e-08, "logits/chosen": -3.1419107913970947, "logits/rejected": -3.1154308319091797, "logps/chosen": -54.17340850830078, "logps/rejected": -49.72704315185547, "loss": 0.6926, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0007852077251300216, "rewards/margins": 0.001062640454620123, "rewards/rejected": -0.0018478482961654663, "step": 840 }, { "epoch": 0.14645072363886974, "grad_norm": 2.294887065887451, "learning_rate": 4.8794489092996555e-08, "logits/chosen": -3.017284393310547, "logits/rejected": -3.006587505340576, "logps/chosen": -50.93817138671875, "logps/rejected": -55.09174728393555, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.001170767704024911, "rewards/margins": 0.0003900538431480527, "rewards/rejected": -0.0015608215471729636, "step": 850 }, { "epoch": 0.1481736733287388, "grad_norm": 2.3279764652252197, "learning_rate": 4.9368541905855335e-08, "logits/chosen": -3.0429883003234863, "logits/rejected": -3.0229313373565674, "logps/chosen": -52.9595832824707, "logps/rejected": -52.52375030517578, "loss": 0.6926, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0010034366277977824, "rewards/margins": 0.001169817871414125, "rewards/rejected": -0.002173254732042551, "step": 860 }, { "epoch": 0.14989662301860784, "grad_norm": 1.9383747577667236, "learning_rate": 4.994259471871412e-08, "logits/chosen": -3.1169488430023193, "logits/rejected": -3.1132357120513916, "logps/chosen": -51.40092849731445, "logps/rejected": -53.88822555541992, "loss": 0.6925, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0009935570415109396, "rewards/margins": 0.0013303172308951616, "rewards/rejected": -0.0023238742724061012, "step": 870 }, { "epoch": 0.15161957270847692, "grad_norm": 1.9051223993301392, "learning_rate": 5.0516647531572895e-08, "logits/chosen": -3.030507802963257, "logits/rejected": -3.007730007171631, "logps/chosen": -51.83906173706055, "logps/rejected": -51.63774871826172, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": -0.0010915338061749935, "rewards/margins": 0.0014844697434455156, "rewards/rejected": -0.0025760033167898655, "step": 880 }, { "epoch": 0.15334252239834598, "grad_norm": 2.257697582244873, "learning_rate": 5.109070034443168e-08, "logits/chosen": -3.0588555335998535, "logits/rejected": -3.0214855670928955, "logps/chosen": -58.502159118652344, "logps/rejected": -54.376953125, "loss": 0.6921, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.00015094592527020723, "rewards/margins": 0.002015637932345271, "rewards/rejected": -0.0021665836684405804, "step": 890 }, { "epoch": 0.15506547208821503, "grad_norm": 2.1569342613220215, "learning_rate": 5.166475315729046e-08, "logits/chosen": -3.0790534019470215, "logits/rejected": -3.068725109100342, "logps/chosen": -54.269317626953125, "logps/rejected": -52.61185836791992, "loss": 0.6927, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.001325699151493609, "rewards/margins": 0.0009382838616147637, "rewards/rejected": -0.0022639830131083727, "step": 900 }, { "epoch": 0.15506547208821503, "eval_logits/chosen": -3.1610217094421387, "eval_logits/rejected": -3.155405282974243, "eval_logps/chosen": -58.634517669677734, "eval_logps/rejected": -63.131717681884766, "eval_loss": 0.6930044889450073, "eval_rewards/accuracies": 0.5299721360206604, "eval_rewards/chosen": 0.0007737455889582634, "eval_rewards/margins": 0.0002896834339480847, "eval_rewards/rejected": 0.00048406212590634823, "eval_runtime": 384.2068, "eval_samples_per_second": 11.202, "eval_steps_per_second": 1.4, "step": 900 }, { "epoch": 0.15678842177808408, "grad_norm": 2.1278598308563232, "learning_rate": 5.223880597014925e-08, "logits/chosen": -3.049861192703247, "logits/rejected": -3.0418248176574707, "logps/chosen": -51.15930938720703, "logps/rejected": -52.267738342285156, "loss": 0.6928, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0013226511655375361, "rewards/margins": 0.0007780655287206173, "rewards/rejected": -0.0021007168106734753, "step": 910 }, { "epoch": 0.15851137146795313, "grad_norm": 2.3197872638702393, "learning_rate": 5.281285878300803e-08, "logits/chosen": -3.0911948680877686, "logits/rejected": -3.047741651535034, "logps/chosen": -54.45844650268555, "logps/rejected": -49.756954193115234, "loss": 0.6923, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0013628543820232153, "rewards/margins": 0.0016774596879258752, "rewards/rejected": -0.0030403141863644123, "step": 920 }, { "epoch": 0.16023432115782218, "grad_norm": 2.573028326034546, "learning_rate": 5.3386911595866815e-08, "logits/chosen": -3.152437686920166, "logits/rejected": -3.1363942623138428, "logps/chosen": -52.601463317871094, "logps/rejected": -54.781776428222656, "loss": 0.6923, "rewards/accuracies": 0.625, "rewards/chosen": -0.001176035962998867, "rewards/margins": 0.0017335328739136457, "rewards/rejected": -0.002909568604081869, "step": 930 }, { "epoch": 0.16195727084769124, "grad_norm": 2.1972992420196533, "learning_rate": 5.3960964408725595e-08, "logits/chosen": -3.1324801445007324, "logits/rejected": -3.094834327697754, "logps/chosen": -60.283714294433594, "logps/rejected": -55.158233642578125, "loss": 0.6922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0007838603924028575, "rewards/margins": 0.001946564530953765, "rewards/rejected": -0.0027304249815642834, "step": 940 }, { "epoch": 0.16368022053756032, "grad_norm": 2.200590133666992, "learning_rate": 5.4535017221584375e-08, "logits/chosen": -2.925036907196045, "logits/rejected": -2.9077956676483154, "logps/chosen": -55.45914840698242, "logps/rejected": -55.95990753173828, "loss": 0.6924, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0018776354845613241, "rewards/margins": 0.0014111895579844713, "rewards/rejected": -0.003288825275376439, "step": 950 }, { "epoch": 0.16540317022742937, "grad_norm": 2.2784512042999268, "learning_rate": 5.510907003444316e-08, "logits/chosen": -2.8940415382385254, "logits/rejected": -2.8971915245056152, "logps/chosen": -50.99476623535156, "logps/rejected": -55.84647750854492, "loss": 0.6932, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.002534528961405158, "rewards/margins": -7.07468279870227e-05, "rewards/rejected": -0.002463781973347068, "step": 960 }, { "epoch": 0.16712611991729842, "grad_norm": 2.317347764968872, "learning_rate": 5.568312284730194e-08, "logits/chosen": -3.0644147396087646, "logits/rejected": -3.0288257598876953, "logps/chosen": -60.86735153198242, "logps/rejected": -52.7350959777832, "loss": 0.6927, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0020383347291499376, "rewards/margins": 0.0008646571077406406, "rewards/rejected": -0.0029029916040599346, "step": 970 }, { "epoch": 0.16884906960716747, "grad_norm": 3.076958179473877, "learning_rate": 5.625717566016073e-08, "logits/chosen": -3.166388750076294, "logits/rejected": -3.1467061042785645, "logps/chosen": -56.14971923828125, "logps/rejected": -55.15375900268555, "loss": 0.6921, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0010097816120833158, "rewards/margins": 0.0021285037510097027, "rewards/rejected": -0.003138285130262375, "step": 980 }, { "epoch": 0.17057201929703653, "grad_norm": 2.459468126296997, "learning_rate": 5.683122847301951e-08, "logits/chosen": -3.033179759979248, "logits/rejected": -3.0098400115966797, "logps/chosen": -54.752769470214844, "logps/rejected": -52.98857879638672, "loss": 0.6922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0018352055922150612, "rewards/margins": 0.0018653625156730413, "rewards/rejected": -0.00370056857354939, "step": 990 }, { "epoch": 0.17229496898690558, "grad_norm": 2.205505609512329, "learning_rate": 5.7405281285878295e-08, "logits/chosen": -2.985236883163452, "logits/rejected": -2.9574520587921143, "logps/chosen": -57.078163146972656, "logps/rejected": -52.022727966308594, "loss": 0.6928, "rewards/accuracies": 0.53125, "rewards/chosen": -0.00209948536939919, "rewards/margins": 0.0006992360576987267, "rewards/rejected": -0.0027987216599285603, "step": 1000 }, { "epoch": 0.17229496898690558, "eval_logits/chosen": -3.159637212753296, "eval_logits/rejected": -3.1539881229400635, "eval_logps/chosen": -58.606048583984375, "eval_logps/rejected": -63.1075325012207, "eval_loss": 0.6929837465286255, "eval_rewards/accuracies": 0.5257899761199951, "eval_rewards/chosen": 0.0010584620758891106, "eval_rewards/margins": 0.0003325818106532097, "eval_rewards/rejected": 0.000725880148820579, "eval_runtime": 384.3994, "eval_samples_per_second": 11.197, "eval_steps_per_second": 1.4, "step": 1000 }, { "epoch": 0.17401791867677463, "grad_norm": 2.36352801322937, "learning_rate": 5.7979334098737075e-08, "logits/chosen": -2.9207940101623535, "logits/rejected": -2.931384563446045, "logps/chosen": -54.00878143310547, "logps/rejected": -58.289215087890625, "loss": 0.6924, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0025096333120018244, "rewards/margins": 0.001520833233371377, "rewards/rejected": -0.004030467011034489, "step": 1010 }, { "epoch": 0.17574086836664368, "grad_norm": 2.2233808040618896, "learning_rate": 5.855338691159586e-08, "logits/chosen": -3.1037838459014893, "logits/rejected": -3.0682175159454346, "logps/chosen": -57.21088790893555, "logps/rejected": -54.620094299316406, "loss": 0.6918, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0010826075449585915, "rewards/margins": 0.002807574113830924, "rewards/rejected": -0.0038901816587895155, "step": 1020 }, { "epoch": 0.17746381805651276, "grad_norm": 2.3776323795318604, "learning_rate": 5.912743972445464e-08, "logits/chosen": -3.1529653072357178, "logits/rejected": -3.1257083415985107, "logps/chosen": -53.88542556762695, "logps/rejected": -52.225921630859375, "loss": 0.6924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0016663509886711836, "rewards/margins": 0.0014677213039249182, "rewards/rejected": -0.0031340725254267454, "step": 1030 }, { "epoch": 0.17918676774638181, "grad_norm": 2.4457168579101562, "learning_rate": 5.970149253731343e-08, "logits/chosen": -3.068108081817627, "logits/rejected": -3.0578951835632324, "logps/chosen": -53.64398193359375, "logps/rejected": -55.90372848510742, "loss": 0.6926, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.002456160495057702, "rewards/margins": 0.0012058572610840201, "rewards/rejected": -0.0036620174068957567, "step": 1040 }, { "epoch": 0.18090971743625087, "grad_norm": 2.4612817764282227, "learning_rate": 6.02755453501722e-08, "logits/chosen": -2.9894585609436035, "logits/rejected": -2.9540462493896484, "logps/chosen": -57.33226776123047, "logps/rejected": -51.00169372558594, "loss": 0.692, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0018922159215435386, "rewards/margins": 0.002393498318269849, "rewards/rejected": -0.004285714589059353, "step": 1050 }, { "epoch": 0.18263266712611992, "grad_norm": 2.44649338722229, "learning_rate": 6.084959816303099e-08, "logits/chosen": -2.998039722442627, "logits/rejected": -2.978816509246826, "logps/chosen": -56.747283935546875, "logps/rejected": -55.302650451660156, "loss": 0.6928, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0024602219928056, "rewards/margins": 0.0008022012189030647, "rewards/rejected": -0.0032624229788780212, "step": 1060 }, { "epoch": 0.18435561681598897, "grad_norm": 2.27636981010437, "learning_rate": 6.142365097588978e-08, "logits/chosen": -3.141723394393921, "logits/rejected": -3.1078040599823, "logps/chosen": -56.14226531982422, "logps/rejected": -53.68745040893555, "loss": 0.692, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0021100935991853476, "rewards/margins": 0.002281313529238105, "rewards/rejected": -0.00439140759408474, "step": 1070 }, { "epoch": 0.18607856650585802, "grad_norm": 2.2701478004455566, "learning_rate": 6.199770378874856e-08, "logits/chosen": -3.119141101837158, "logits/rejected": -3.0958266258239746, "logps/chosen": -56.046653747558594, "logps/rejected": -51.1988639831543, "loss": 0.6921, "rewards/accuracies": 0.59375, "rewards/chosen": -0.002608765149489045, "rewards/margins": 0.0020744693465530872, "rewards/rejected": -0.004683234728872776, "step": 1080 }, { "epoch": 0.18780151619572708, "grad_norm": 2.366807699203491, "learning_rate": 6.257175660160735e-08, "logits/chosen": -2.996194839477539, "logits/rejected": -2.9877772331237793, "logps/chosen": -52.50612258911133, "logps/rejected": -53.06543731689453, "loss": 0.6924, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.002744440920650959, "rewards/margins": 0.0015723633114248514, "rewards/rejected": -0.004316803999245167, "step": 1090 }, { "epoch": 0.18952446588559613, "grad_norm": 2.1252856254577637, "learning_rate": 6.314580941446614e-08, "logits/chosen": -3.0620028972625732, "logits/rejected": -3.0592315196990967, "logps/chosen": -51.11980056762695, "logps/rejected": -55.002479553222656, "loss": 0.6922, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.002123672980815172, "rewards/margins": 0.0018237263429909945, "rewards/rejected": -0.00394739955663681, "step": 1100 }, { "epoch": 0.18952446588559613, "eval_logits/chosen": -3.157933235168457, "eval_logits/rejected": -3.1523184776306152, "eval_logps/chosen": -58.581974029541016, "eval_logps/rejected": -63.11029815673828, "eval_loss": 0.6928511261940002, "eval_rewards/accuracies": 0.5455390214920044, "eval_rewards/chosen": 0.0012992529664188623, "eval_rewards/margins": 0.0006009479984641075, "eval_rewards/rejected": 0.0006983049679547548, "eval_runtime": 384.272, "eval_samples_per_second": 11.2, "eval_steps_per_second": 1.4, "step": 1100 }, { "epoch": 0.1912474155754652, "grad_norm": 2.6385481357574463, "learning_rate": 6.371986222732492e-08, "logits/chosen": -3.059314727783203, "logits/rejected": -3.071633815765381, "logps/chosen": -53.256324768066406, "logps/rejected": -56.966346740722656, "loss": 0.6924, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0036048232577741146, "rewards/margins": 0.0015382547862827778, "rewards/rejected": -0.00514307850971818, "step": 1110 }, { "epoch": 0.19297036526533426, "grad_norm": 2.4045803546905518, "learning_rate": 6.42939150401837e-08, "logits/chosen": -3.0894057750701904, "logits/rejected": -3.0675110816955566, "logps/chosen": -56.65040969848633, "logps/rejected": -54.24614334106445, "loss": 0.692, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.002793778432533145, "rewards/margins": 0.00222732312977314, "rewards/rejected": -0.005021101329475641, "step": 1120 }, { "epoch": 0.1946933149552033, "grad_norm": 2.2403244972229004, "learning_rate": 6.486796785304248e-08, "logits/chosen": -3.1555721759796143, "logits/rejected": -3.131082534790039, "logps/chosen": -52.319297790527344, "logps/rejected": -54.821922302246094, "loss": 0.6915, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.002869230229407549, "rewards/margins": 0.0034268468152731657, "rewards/rejected": -0.006296076811850071, "step": 1130 }, { "epoch": 0.19641626464507236, "grad_norm": 2.4267640113830566, "learning_rate": 6.544202066590127e-08, "logits/chosen": -3.0627176761627197, "logits/rejected": -3.0290186405181885, "logps/chosen": -57.1142463684082, "logps/rejected": -53.50138473510742, "loss": 0.6917, "rewards/accuracies": 0.59375, "rewards/chosen": -0.002328848699107766, "rewards/margins": 0.0028365778271108866, "rewards/rejected": -0.005165426526218653, "step": 1140 }, { "epoch": 0.19813921433494142, "grad_norm": 2.3757588863372803, "learning_rate": 6.601607347876004e-08, "logits/chosen": -3.002267360687256, "logits/rejected": -2.983065128326416, "logps/chosen": -53.62241744995117, "logps/rejected": -55.014984130859375, "loss": 0.6918, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.002660442143678665, "rewards/margins": 0.0026335858274251223, "rewards/rejected": -0.005294027738273144, "step": 1150 }, { "epoch": 0.19986216402481047, "grad_norm": 2.2249338626861572, "learning_rate": 6.659012629161883e-08, "logits/chosen": -3.098814010620117, "logits/rejected": -3.090008020401001, "logps/chosen": -54.18245315551758, "logps/rejected": -55.039894104003906, "loss": 0.6915, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0015682090306654572, "rewards/margins": 0.003350636688992381, "rewards/rejected": -0.004918845370411873, "step": 1160 }, { "epoch": 0.20158511371467952, "grad_norm": 2.138969898223877, "learning_rate": 6.716417910447762e-08, "logits/chosen": -2.970677137374878, "logits/rejected": -2.957944393157959, "logps/chosen": -52.4093132019043, "logps/rejected": -55.242156982421875, "loss": 0.692, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.003300183219835162, "rewards/margins": 0.0023874850012362003, "rewards/rejected": -0.005687667988240719, "step": 1170 }, { "epoch": 0.2033080634045486, "grad_norm": 2.552302598953247, "learning_rate": 6.77382319173364e-08, "logits/chosen": -2.942534923553467, "logits/rejected": -2.911280632019043, "logps/chosen": -53.13288116455078, "logps/rejected": -51.744728088378906, "loss": 0.6911, "rewards/accuracies": 0.65625, "rewards/chosen": -0.003310777246952057, "rewards/margins": 0.0041159880347549915, "rewards/rejected": -0.007426765747368336, "step": 1180 }, { "epoch": 0.20503101309441765, "grad_norm": 2.4945688247680664, "learning_rate": 6.831228473019518e-08, "logits/chosen": -3.1397457122802734, "logits/rejected": -3.1044998168945312, "logps/chosen": -59.08231735229492, "logps/rejected": -51.09773635864258, "loss": 0.6916, "rewards/accuracies": 0.625, "rewards/chosen": -0.0038283977191895247, "rewards/margins": 0.003150052158161998, "rewards/rejected": -0.006978449411690235, "step": 1190 }, { "epoch": 0.2067539627842867, "grad_norm": 2.115513801574707, "learning_rate": 6.888633754305396e-08, "logits/chosen": -2.971470594406128, "logits/rejected": -2.9564225673675537, "logps/chosen": -53.521575927734375, "logps/rejected": -51.83867645263672, "loss": 0.6921, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.003571131033822894, "rewards/margins": 0.002098171738907695, "rewards/rejected": -0.0056693037040531635, "step": 1200 }, { "epoch": 0.2067539627842867, "eval_logits/chosen": -3.155562162399292, "eval_logits/rejected": -3.1499741077423096, "eval_logps/chosen": -58.541587829589844, "eval_logps/rejected": -63.1010627746582, "eval_loss": 0.6926991939544678, "eval_rewards/accuracies": 0.5573884844779968, "eval_rewards/chosen": 0.0017030423041433096, "eval_rewards/margins": 0.00091239542234689, "eval_rewards/rejected": 0.0007906469982117414, "eval_runtime": 383.8638, "eval_samples_per_second": 11.212, "eval_steps_per_second": 1.402, "step": 1200 }, { "epoch": 0.20847691247415576, "grad_norm": 2.2921652793884277, "learning_rate": 6.946039035591275e-08, "logits/chosen": -3.070786714553833, "logits/rejected": -3.0367579460144043, "logps/chosen": -54.041099548339844, "logps/rejected": -53.65398025512695, "loss": 0.6912, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0025284376461058855, "rewards/margins": 0.003854484762996435, "rewards/rejected": -0.006382922641932964, "step": 1210 }, { "epoch": 0.2101998621640248, "grad_norm": 2.1366147994995117, "learning_rate": 7.003444316877152e-08, "logits/chosen": -3.081575870513916, "logits/rejected": -3.0545802116394043, "logps/chosen": -53.79796600341797, "logps/rejected": -53.03608322143555, "loss": 0.6915, "rewards/accuracies": 0.59375, "rewards/chosen": -0.004166724160313606, "rewards/margins": 0.003359371330589056, "rewards/rejected": -0.00752609595656395, "step": 1220 }, { "epoch": 0.21192281185389386, "grad_norm": 2.5944831371307373, "learning_rate": 7.060849598163031e-08, "logits/chosen": -3.1516284942626953, "logits/rejected": -3.109452724456787, "logps/chosen": -56.013023376464844, "logps/rejected": -53.00898361206055, "loss": 0.6906, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0026216122787445784, "rewards/margins": 0.005220313090831041, "rewards/rejected": -0.007841925136744976, "step": 1230 }, { "epoch": 0.2136457615437629, "grad_norm": 2.100095272064209, "learning_rate": 7.11825487944891e-08, "logits/chosen": -3.0129342079162598, "logits/rejected": -2.9959495067596436, "logps/chosen": -52.92523956298828, "logps/rejected": -53.61211013793945, "loss": 0.6917, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.004881435539573431, "rewards/margins": 0.0030384205747395754, "rewards/rejected": -0.00791985634714365, "step": 1240 }, { "epoch": 0.21536871123363197, "grad_norm": 2.538419485092163, "learning_rate": 7.175660160734788e-08, "logits/chosen": -3.1327199935913086, "logits/rejected": -3.0964229106903076, "logps/chosen": -54.134681701660156, "logps/rejected": -51.95219039916992, "loss": 0.6915, "rewards/accuracies": 0.5625, "rewards/chosen": -0.004306506365537643, "rewards/margins": 0.0033414550125598907, "rewards/rejected": -0.007647961378097534, "step": 1250 }, { "epoch": 0.21709166092350105, "grad_norm": 2.117171049118042, "learning_rate": 7.233065442020666e-08, "logits/chosen": -2.9809911251068115, "logits/rejected": -2.9495654106140137, "logps/chosen": -53.408287048339844, "logps/rejected": -51.155670166015625, "loss": 0.691, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.004329412244260311, "rewards/margins": 0.004383914638310671, "rewards/rejected": -0.00871332548558712, "step": 1260 }, { "epoch": 0.2188146106133701, "grad_norm": 2.452468156814575, "learning_rate": 7.290470723306544e-08, "logits/chosen": -3.041698694229126, "logits/rejected": -3.0354225635528564, "logps/chosen": -52.282005310058594, "logps/rejected": -55.53786087036133, "loss": 0.6911, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0040619089268147945, "rewards/margins": 0.004267952870577574, "rewards/rejected": -0.008329860866069794, "step": 1270 }, { "epoch": 0.22053756030323915, "grad_norm": 2.3395743370056152, "learning_rate": 7.347876004592423e-08, "logits/chosen": -3.1130595207214355, "logits/rejected": -3.119959592819214, "logps/chosen": -51.6298828125, "logps/rejected": -60.651336669921875, "loss": 0.6918, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00458336528390646, "rewards/margins": 0.0027965959161520004, "rewards/rejected": -0.00737996120005846, "step": 1280 }, { "epoch": 0.2222605099931082, "grad_norm": 2.2128279209136963, "learning_rate": 7.405281285878302e-08, "logits/chosen": -3.013061761856079, "logits/rejected": -2.9813873767852783, "logps/chosen": -54.9996452331543, "logps/rejected": -51.27374267578125, "loss": 0.6909, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.003987212665379047, "rewards/margins": 0.004552985541522503, "rewards/rejected": -0.00854019820690155, "step": 1290 }, { "epoch": 0.22398345968297725, "grad_norm": 2.1431968212127686, "learning_rate": 7.462686567164179e-08, "logits/chosen": -3.0925610065460205, "logits/rejected": -3.079920530319214, "logps/chosen": -50.90013885498047, "logps/rejected": -54.639366149902344, "loss": 0.692, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0068520144559443, "rewards/margins": 0.002317605074495077, "rewards/rejected": -0.009169620461761951, "step": 1300 }, { "epoch": 0.22398345968297725, "eval_logits/chosen": -3.1535146236419678, "eval_logits/rejected": -3.147897958755493, "eval_logps/chosen": -58.509735107421875, "eval_logps/rejected": -63.11225891113281, "eval_loss": 0.6924899220466614, "eval_rewards/accuracies": 0.559944212436676, "eval_rewards/chosen": 0.0020215357653796673, "eval_rewards/margins": 0.0013428872916847467, "eval_rewards/rejected": 0.0006786484736949205, "eval_runtime": 383.765, "eval_samples_per_second": 11.215, "eval_steps_per_second": 1.402, "step": 1300 }, { "epoch": 0.2257064093728463, "grad_norm": 1.90495765209198, "learning_rate": 7.520091848450058e-08, "logits/chosen": -3.0930347442626953, "logits/rejected": -3.057450532913208, "logps/chosen": -53.83110427856445, "logps/rejected": -50.54579162597656, "loss": 0.69, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.004905714187771082, "rewards/margins": 0.006351909134536982, "rewards/rejected": -0.011257623322308064, "step": 1310 }, { "epoch": 0.22742935906271536, "grad_norm": 2.291388988494873, "learning_rate": 7.577497129735936e-08, "logits/chosen": -3.066067934036255, "logits/rejected": -3.040505886077881, "logps/chosen": -55.77803421020508, "logps/rejected": -51.723289489746094, "loss": 0.69, "rewards/accuracies": 0.625, "rewards/chosen": -0.0035824254155158997, "rewards/margins": 0.00628857035189867, "rewards/rejected": -0.00987099390476942, "step": 1320 }, { "epoch": 0.22915230875258444, "grad_norm": 2.4262771606445312, "learning_rate": 7.634902411021814e-08, "logits/chosen": -3.144801616668701, "logits/rejected": -3.1141879558563232, "logps/chosen": -55.5483283996582, "logps/rejected": -51.02151107788086, "loss": 0.6904, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0033073134254664183, "rewards/margins": 0.005517328158020973, "rewards/rejected": -0.008824641816318035, "step": 1330 }, { "epoch": 0.2308752584424535, "grad_norm": 2.2920780181884766, "learning_rate": 7.692307692307692e-08, "logits/chosen": -2.9861254692077637, "logits/rejected": -2.9647624492645264, "logps/chosen": -55.54059600830078, "logps/rejected": -53.8024787902832, "loss": 0.6905, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0040119001641869545, "rewards/margins": 0.005336456932127476, "rewards/rejected": -0.00934835709631443, "step": 1340 }, { "epoch": 0.23259820813232254, "grad_norm": 2.204136848449707, "learning_rate": 7.749712973593571e-08, "logits/chosen": -3.0514941215515137, "logits/rejected": -3.02690052986145, "logps/chosen": -54.713539123535156, "logps/rejected": -54.30706024169922, "loss": 0.6909, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.006122550927102566, "rewards/margins": 0.0046083503402769566, "rewards/rejected": -0.01073090173304081, "step": 1350 }, { "epoch": 0.2343211578221916, "grad_norm": 2.3276925086975098, "learning_rate": 7.80711825487945e-08, "logits/chosen": -3.008610248565674, "logits/rejected": -2.996319532394409, "logps/chosen": -55.81654739379883, "logps/rejected": -57.032752990722656, "loss": 0.6912, "rewards/accuracies": 0.5625, "rewards/chosen": -0.006140447221696377, "rewards/margins": 0.003938029054552317, "rewards/rejected": -0.01007847674190998, "step": 1360 }, { "epoch": 0.23604410751206065, "grad_norm": 2.3988797664642334, "learning_rate": 7.864523536165327e-08, "logits/chosen": -3.122248411178589, "logits/rejected": -3.102567195892334, "logps/chosen": -53.2535285949707, "logps/rejected": -53.98606491088867, "loss": 0.6901, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.005786883644759655, "rewards/margins": 0.006200735457241535, "rewards/rejected": -0.011987620033323765, "step": 1370 }, { "epoch": 0.2377670572019297, "grad_norm": 2.1158041954040527, "learning_rate": 7.921928817451206e-08, "logits/chosen": -3.013899326324463, "logits/rejected": -2.995651960372925, "logps/chosen": -54.72368621826172, "logps/rejected": -53.36114501953125, "loss": 0.6899, "rewards/accuracies": 0.625, "rewards/chosen": -0.003909071907401085, "rewards/margins": 0.006528546568006277, "rewards/rejected": -0.01043761894106865, "step": 1380 }, { "epoch": 0.23949000689179875, "grad_norm": 2.3443217277526855, "learning_rate": 7.979334098737084e-08, "logits/chosen": -3.163419008255005, "logits/rejected": -3.1359851360321045, "logps/chosen": -57.40605926513672, "logps/rejected": -54.61787796020508, "loss": 0.69, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0051222434267401695, "rewards/margins": 0.006491777952760458, "rewards/rejected": -0.011614019982516766, "step": 1390 }, { "epoch": 0.2412129565816678, "grad_norm": 2.137240409851074, "learning_rate": 8.036739380022962e-08, "logits/chosen": -2.9831480979919434, "logits/rejected": -2.9576594829559326, "logps/chosen": -56.4083137512207, "logps/rejected": -54.09514617919922, "loss": 0.6898, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.005995193496346474, "rewards/margins": 0.006904745940119028, "rewards/rejected": -0.012899939902126789, "step": 1400 }, { "epoch": 0.2412129565816678, "eval_logits/chosen": -3.149954080581665, "eval_logits/rejected": -3.144317626953125, "eval_logps/chosen": -58.50584411621094, "eval_logps/rejected": -63.15810775756836, "eval_loss": 0.6922518610954285, "eval_rewards/accuracies": 0.5743494629859924, "eval_rewards/chosen": 0.002060503698885441, "eval_rewards/margins": 0.001840322045609355, "eval_rewards/rejected": 0.00022018159506842494, "eval_runtime": 384.1614, "eval_samples_per_second": 11.204, "eval_steps_per_second": 1.4, "step": 1400 }, { "epoch": 0.24293590627153688, "grad_norm": 2.024836301803589, "learning_rate": 8.09414466130884e-08, "logits/chosen": -3.0475363731384277, "logits/rejected": -3.035557508468628, "logps/chosen": -55.06357955932617, "logps/rejected": -54.45977020263672, "loss": 0.6904, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006899215281009674, "rewards/margins": 0.005695374682545662, "rewards/rejected": -0.012594589963555336, "step": 1410 }, { "epoch": 0.24465885596140594, "grad_norm": 2.118668794631958, "learning_rate": 8.151549942594719e-08, "logits/chosen": -2.965179204940796, "logits/rejected": -2.9710865020751953, "logps/chosen": -51.54658126831055, "logps/rejected": -55.7078971862793, "loss": 0.692, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.007924264296889305, "rewards/margins": 0.0023214942775666714, "rewards/rejected": -0.01024575810879469, "step": 1420 }, { "epoch": 0.246381805651275, "grad_norm": 2.4635019302368164, "learning_rate": 8.208955223880598e-08, "logits/chosen": -3.084005355834961, "logits/rejected": -3.072390079498291, "logps/chosen": -54.2066535949707, "logps/rejected": -57.81000900268555, "loss": 0.6897, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.006220073439180851, "rewards/margins": 0.006981330923736095, "rewards/rejected": -0.013201403431594372, "step": 1430 }, { "epoch": 0.24810475534114404, "grad_norm": 2.1570234298706055, "learning_rate": 8.266360505166475e-08, "logits/chosen": -3.0112099647521973, "logits/rejected": -2.979926824569702, "logps/chosen": -51.29058074951172, "logps/rejected": -49.935813903808594, "loss": 0.6895, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.008280965499579906, "rewards/margins": 0.007432927843183279, "rewards/rejected": -0.015713894739747047, "step": 1440 }, { "epoch": 0.2498277050310131, "grad_norm": 2.805506944656372, "learning_rate": 8.323765786452354e-08, "logits/chosen": -3.0243566036224365, "logits/rejected": -2.986956834793091, "logps/chosen": -56.046485900878906, "logps/rejected": -53.360313415527344, "loss": 0.6889, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.005713744089007378, "rewards/margins": 0.008688968606293201, "rewards/rejected": -0.014402711763978004, "step": 1450 }, { "epoch": 0.25155065472088217, "grad_norm": 2.2970547676086426, "learning_rate": 8.381171067738232e-08, "logits/chosen": -3.0269100666046143, "logits/rejected": -2.9944651126861572, "logps/chosen": -52.6595344543457, "logps/rejected": -52.14793014526367, "loss": 0.6896, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.00959908775985241, "rewards/margins": 0.007228456437587738, "rewards/rejected": -0.016827544197440147, "step": 1460 }, { "epoch": 0.2532736044107512, "grad_norm": 2.453721046447754, "learning_rate": 8.43857634902411e-08, "logits/chosen": -3.1917262077331543, "logits/rejected": -3.1557235717773438, "logps/chosen": -57.65922927856445, "logps/rejected": -56.3996696472168, "loss": 0.6878, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.006355701480060816, "rewards/margins": 0.010877120308578014, "rewards/rejected": -0.017232822254300117, "step": 1470 }, { "epoch": 0.2549965541006203, "grad_norm": 2.2138876914978027, "learning_rate": 8.495981630309988e-08, "logits/chosen": -3.158738613128662, "logits/rejected": -3.122800588607788, "logps/chosen": -52.21733856201172, "logps/rejected": -53.142784118652344, "loss": 0.6908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009066091850399971, "rewards/margins": 0.004957942292094231, "rewards/rejected": -0.014024032279849052, "step": 1480 }, { "epoch": 0.2567195037904893, "grad_norm": 2.4184203147888184, "learning_rate": 8.553386911595867e-08, "logits/chosen": -3.090259075164795, "logits/rejected": -3.0655534267425537, "logps/chosen": -58.28464889526367, "logps/rejected": -55.2327766418457, "loss": 0.6899, "rewards/accuracies": 0.625, "rewards/chosen": -0.0090095866471529, "rewards/margins": 0.006619918160140514, "rewards/rejected": -0.01562950387597084, "step": 1490 }, { "epoch": 0.2584424534803584, "grad_norm": 2.3401448726654053, "learning_rate": 8.610792192881746e-08, "logits/chosen": -2.998882293701172, "logits/rejected": -2.9962730407714844, "logps/chosen": -50.91393280029297, "logps/rejected": -54.471473693847656, "loss": 0.6889, "rewards/accuracies": 0.59375, "rewards/chosen": -0.009252166375517845, "rewards/margins": 0.00879758782684803, "rewards/rejected": -0.018049754202365875, "step": 1500 }, { "epoch": 0.2584424534803584, "eval_logits/chosen": -3.1461923122406006, "eval_logits/rejected": -3.1405577659606934, "eval_logps/chosen": -58.5425910949707, "eval_logps/rejected": -63.25124740600586, "eval_loss": 0.6919842958450317, "eval_rewards/accuracies": 0.582713782787323, "eval_rewards/chosen": 0.0016930237179622054, "eval_rewards/margins": 0.0024043007288128138, "eval_rewards/rejected": -0.0007112768362276256, "eval_runtime": 383.883, "eval_samples_per_second": 11.212, "eval_steps_per_second": 1.401, "step": 1500 }, { "epoch": 0.2601654031702274, "grad_norm": 2.474595785140991, "learning_rate": 8.668197474167623e-08, "logits/chosen": -2.9482054710388184, "logits/rejected": -2.912536144256592, "logps/chosen": -59.38923263549805, "logps/rejected": -54.630149841308594, "loss": 0.6882, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.007833080366253853, "rewards/margins": 0.010202744975686073, "rewards/rejected": -0.018035825341939926, "step": 1510 }, { "epoch": 0.2618883528600965, "grad_norm": 2.566002368927002, "learning_rate": 8.725602755453502e-08, "logits/chosen": -3.061122417449951, "logits/rejected": -3.053142786026001, "logps/chosen": -53.38776779174805, "logps/rejected": -52.94761276245117, "loss": 0.6917, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011371207423508167, "rewards/margins": 0.0031259614042937756, "rewards/rejected": -0.014497170224785805, "step": 1520 }, { "epoch": 0.26361130254996556, "grad_norm": 2.5937631130218506, "learning_rate": 8.78300803673938e-08, "logits/chosen": -2.9471049308776855, "logits/rejected": -2.930510997772217, "logps/chosen": -52.76145553588867, "logps/rejected": -52.55853271484375, "loss": 0.6897, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.010728351771831512, "rewards/margins": 0.00731478538364172, "rewards/rejected": -0.018043136224150658, "step": 1530 }, { "epoch": 0.2653342522398346, "grad_norm": 2.2903971672058105, "learning_rate": 8.840413318025258e-08, "logits/chosen": -3.07633113861084, "logits/rejected": -3.0442919731140137, "logps/chosen": -56.99201202392578, "logps/rejected": -53.094139099121094, "loss": 0.6883, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.010865415446460247, "rewards/margins": 0.01004198007285595, "rewards/rejected": -0.02090739645063877, "step": 1540 }, { "epoch": 0.26705720192970367, "grad_norm": 2.1574084758758545, "learning_rate": 8.897818599311136e-08, "logits/chosen": -2.994337558746338, "logits/rejected": -2.9744420051574707, "logps/chosen": -54.561859130859375, "logps/rejected": -52.64752197265625, "loss": 0.6887, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.011925270780920982, "rewards/margins": 0.009143774397671223, "rewards/rejected": -0.02106904610991478, "step": 1550 }, { "epoch": 0.2687801516195727, "grad_norm": 2.2910408973693848, "learning_rate": 8.955223880597015e-08, "logits/chosen": -3.008223056793213, "logits/rejected": -2.9907495975494385, "logps/chosen": -52.24835205078125, "logps/rejected": -52.69355392456055, "loss": 0.6901, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.01400584727525711, "rewards/margins": 0.006327374372631311, "rewards/rejected": -0.020333221182227135, "step": 1560 }, { "epoch": 0.2705031013094418, "grad_norm": 2.2692337036132812, "learning_rate": 9.012629161882894e-08, "logits/chosen": -3.0593621730804443, "logits/rejected": -3.061344861984253, "logps/chosen": -52.996070861816406, "logps/rejected": -58.17250442504883, "loss": 0.689, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.010799935087561607, "rewards/margins": 0.008623559959232807, "rewards/rejected": -0.01942349411547184, "step": 1570 }, { "epoch": 0.2722260509993108, "grad_norm": 2.6995997428894043, "learning_rate": 9.070034443168771e-08, "logits/chosen": -2.9940383434295654, "logits/rejected": -2.984046459197998, "logps/chosen": -54.371482849121094, "logps/rejected": -55.91767120361328, "loss": 0.6904, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.014562253840267658, "rewards/margins": 0.005708019249141216, "rewards/rejected": -0.020270273089408875, "step": 1580 }, { "epoch": 0.2739490006891799, "grad_norm": 2.4585492610931396, "learning_rate": 9.12743972445465e-08, "logits/chosen": -3.0394287109375, "logits/rejected": -3.0248665809631348, "logps/chosen": -55.17779541015625, "logps/rejected": -56.344078063964844, "loss": 0.69, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.010473682545125484, "rewards/margins": 0.006545486859977245, "rewards/rejected": -0.01701917126774788, "step": 1590 }, { "epoch": 0.27567195037904896, "grad_norm": 2.37933611869812, "learning_rate": 9.184845005740528e-08, "logits/chosen": -3.031161069869995, "logits/rejected": -2.992920160293579, "logps/chosen": -53.5450325012207, "logps/rejected": -52.9721794128418, "loss": 0.69, "rewards/accuracies": 0.625, "rewards/chosen": -0.013657860457897186, "rewards/margins": 0.006563941482454538, "rewards/rejected": -0.020221803337335587, "step": 1600 }, { "epoch": 0.27567195037904896, "eval_logits/chosen": -3.141144037246704, "eval_logits/rejected": -3.135498285293579, "eval_logps/chosen": -58.59821701049805, "eval_logps/rejected": -63.36439895629883, "eval_loss": 0.6917136907577515, "eval_rewards/accuracies": 0.5785316228866577, "eval_rewards/chosen": 0.0011368298437446356, "eval_rewards/margins": 0.0029795218724757433, "eval_rewards/rejected": -0.0018426921451464295, "eval_runtime": 383.7288, "eval_samples_per_second": 11.216, "eval_steps_per_second": 1.402, "step": 1600 }, { "epoch": 0.277394900068918, "grad_norm": 2.1841118335723877, "learning_rate": 9.242250287026406e-08, "logits/chosen": -3.0701465606689453, "logits/rejected": -3.044910192489624, "logps/chosen": -57.4371337890625, "logps/rejected": -59.913230895996094, "loss": 0.69, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.015430314466357231, "rewards/margins": 0.0067165689542889595, "rewards/rejected": -0.022146884351968765, "step": 1610 }, { "epoch": 0.27911784975878706, "grad_norm": 2.1465470790863037, "learning_rate": 9.299655568312284e-08, "logits/chosen": -2.9920380115509033, "logits/rejected": -2.9698596000671387, "logps/chosen": -52.94218063354492, "logps/rejected": -56.29070281982422, "loss": 0.6897, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.015021272003650665, "rewards/margins": 0.007233158685266972, "rewards/rejected": -0.022254429757595062, "step": 1620 }, { "epoch": 0.2808407994486561, "grad_norm": 2.3757145404815674, "learning_rate": 9.357060849598163e-08, "logits/chosen": -3.0620319843292236, "logits/rejected": -3.0356557369232178, "logps/chosen": -56.7093505859375, "logps/rejected": -53.99016189575195, "loss": 0.688, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.011995895765721798, "rewards/margins": 0.010642603039741516, "rewards/rejected": -0.02263849787414074, "step": 1630 }, { "epoch": 0.28256374913852517, "grad_norm": 2.431318521499634, "learning_rate": 9.414466130884042e-08, "logits/chosen": -3.103769063949585, "logits/rejected": -3.0667669773101807, "logps/chosen": -58.213539123535156, "logps/rejected": -53.96561813354492, "loss": 0.6866, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.012591751292347908, "rewards/margins": 0.013634230010211468, "rewards/rejected": -0.026225978508591652, "step": 1640 }, { "epoch": 0.2842866988283942, "grad_norm": 2.546212911605835, "learning_rate": 9.471871412169919e-08, "logits/chosen": -3.073122978210449, "logits/rejected": -3.0456044673919678, "logps/chosen": -58.147926330566406, "logps/rejected": -57.04644775390625, "loss": 0.6876, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.009114892221987247, "rewards/margins": 0.01149419229477644, "rewards/rejected": -0.020609084516763687, "step": 1650 }, { "epoch": 0.28600964851826327, "grad_norm": 2.534294366836548, "learning_rate": 9.529276693455798e-08, "logits/chosen": -3.1083006858825684, "logits/rejected": -3.061486005783081, "logps/chosen": -56.081565856933594, "logps/rejected": -52.29656219482422, "loss": 0.686, "rewards/accuracies": 0.65625, "rewards/chosen": -0.011100168339908123, "rewards/margins": 0.014708215370774269, "rewards/rejected": -0.025808382779359818, "step": 1660 }, { "epoch": 0.2877325982081323, "grad_norm": 2.5519416332244873, "learning_rate": 9.586681974741676e-08, "logits/chosen": -3.095654010772705, "logits/rejected": -3.0735526084899902, "logps/chosen": -55.3333740234375, "logps/rejected": -57.44159698486328, "loss": 0.6862, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.008187592029571533, "rewards/margins": 0.014174317941069603, "rewards/rejected": -0.022361911833286285, "step": 1670 }, { "epoch": 0.2894555478980014, "grad_norm": 2.3085849285125732, "learning_rate": 9.644087256027555e-08, "logits/chosen": -3.0520882606506348, "logits/rejected": -3.0259764194488525, "logps/chosen": -57.2276496887207, "logps/rejected": -57.16650390625, "loss": 0.6877, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.01619306206703186, "rewards/margins": 0.011389026418328285, "rewards/rejected": -0.027582088485360146, "step": 1680 }, { "epoch": 0.29117849758787046, "grad_norm": 2.3965001106262207, "learning_rate": 9.701492537313432e-08, "logits/chosen": -3.0315260887145996, "logits/rejected": -3.015839099884033, "logps/chosen": -57.2199821472168, "logps/rejected": -54.25101852416992, "loss": 0.6918, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.01674691215157509, "rewards/margins": 0.003029621671885252, "rewards/rejected": -0.01977653242647648, "step": 1690 }, { "epoch": 0.2929014472777395, "grad_norm": 2.577658176422119, "learning_rate": 9.758897818599311e-08, "logits/chosen": -2.9396812915802, "logits/rejected": -2.9444081783294678, "logps/chosen": -51.40871047973633, "logps/rejected": -57.893470764160156, "loss": 0.6897, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.01845657080411911, "rewards/margins": 0.0073503716848790646, "rewards/rejected": -0.02580694481730461, "step": 1700 }, { "epoch": 0.2929014472777395, "eval_logits/chosen": -3.1350889205932617, "eval_logits/rejected": -3.129427671432495, "eval_logps/chosen": -58.698516845703125, "eval_logps/rejected": -63.54674530029297, "eval_loss": 0.6913267374038696, "eval_rewards/accuracies": 0.5727230310440063, "eval_rewards/chosen": 0.00013377063442021608, "eval_rewards/margins": 0.003800000064074993, "eval_rewards/rejected": -0.003666229546070099, "eval_runtime": 383.9188, "eval_samples_per_second": 11.211, "eval_steps_per_second": 1.401, "step": 1700 }, { "epoch": 0.29462439696760856, "grad_norm": 2.655137062072754, "learning_rate": 9.81630309988519e-08, "logits/chosen": -3.071646213531494, "logits/rejected": -3.030850887298584, "logps/chosen": -59.82062530517578, "logps/rejected": -56.85618209838867, "loss": 0.6885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.014108404517173767, "rewards/margins": 0.009618564508855343, "rewards/rejected": -0.023726969957351685, "step": 1710 }, { "epoch": 0.2963473466574776, "grad_norm": 2.334182024002075, "learning_rate": 9.873708381171067e-08, "logits/chosen": -3.0800578594207764, "logits/rejected": -3.05077862739563, "logps/chosen": -58.03089141845703, "logps/rejected": -53.44916915893555, "loss": 0.6894, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.01604996994137764, "rewards/margins": 0.007883192040026188, "rewards/rejected": -0.0239331666380167, "step": 1720 }, { "epoch": 0.29807029634734666, "grad_norm": 2.21150541305542, "learning_rate": 9.931113662456946e-08, "logits/chosen": -3.074854612350464, "logits/rejected": -3.0577147006988525, "logps/chosen": -55.74415969848633, "logps/rejected": -54.86790084838867, "loss": 0.6906, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.018098412081599236, "rewards/margins": 0.0055232299491763115, "rewards/rejected": -0.02362164482474327, "step": 1730 }, { "epoch": 0.2997932460372157, "grad_norm": 2.375692844390869, "learning_rate": 9.988518943742824e-08, "logits/chosen": -3.0235977172851562, "logits/rejected": -3.0136208534240723, "logps/chosen": -54.302406311035156, "logps/rejected": -56.829742431640625, "loss": 0.6906, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.022011317312717438, "rewards/margins": 0.005672975443303585, "rewards/rejected": -0.027684291824698448, "step": 1740 }, { "epoch": 0.30151619572708477, "grad_norm": 2.5655417442321777, "learning_rate": 9.999993568953616e-08, "logits/chosen": -3.0672988891601562, "logits/rejected": -3.0561492443084717, "logps/chosen": -56.56293869018555, "logps/rejected": -58.1347541809082, "loss": 0.6897, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.013769115321338177, "rewards/margins": 0.007312041707336903, "rewards/rejected": -0.02108115702867508, "step": 1750 }, { "epoch": 0.30323914541695385, "grad_norm": 2.461460590362549, "learning_rate": 9.99996744285603e-08, "logits/chosen": -3.054316282272339, "logits/rejected": -3.017752170562744, "logps/chosen": -57.53413772583008, "logps/rejected": -56.03937911987305, "loss": 0.6855, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.015109856612980366, "rewards/margins": 0.015798311680555344, "rewards/rejected": -0.030908167362213135, "step": 1760 }, { "epoch": 0.3049620951068229, "grad_norm": 2.043713331222534, "learning_rate": 9.999921219871774e-08, "logits/chosen": -3.0814831256866455, "logits/rejected": -3.0449986457824707, "logps/chosen": -57.06001663208008, "logps/rejected": -52.38982391357422, "loss": 0.6866, "rewards/accuracies": 0.65625, "rewards/chosen": -0.020783161744475365, "rewards/margins": 0.013692972250282764, "rewards/rejected": -0.034476134926080704, "step": 1770 }, { "epoch": 0.30668504479669195, "grad_norm": 2.726989269256592, "learning_rate": 9.99985490018664e-08, "logits/chosen": -2.9961390495300293, "logits/rejected": -2.998124599456787, "logps/chosen": -56.47089385986328, "logps/rejected": -61.23005294799805, "loss": 0.6893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01952342316508293, "rewards/margins": 0.008153209462761879, "rewards/rejected": -0.02767663262784481, "step": 1780 }, { "epoch": 0.308407994486561, "grad_norm": 2.657099962234497, "learning_rate": 9.99976848406719e-08, "logits/chosen": -3.1415393352508545, "logits/rejected": -3.1191763877868652, "logps/chosen": -53.60601043701172, "logps/rejected": -55.48347854614258, "loss": 0.6867, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.01610630378127098, "rewards/margins": 0.013423112221062183, "rewards/rejected": -0.02952941693365574, "step": 1790 }, { "epoch": 0.31013094417643006, "grad_norm": 2.361691474914551, "learning_rate": 9.999661971860766e-08, "logits/chosen": -3.105226516723633, "logits/rejected": -3.0813050270080566, "logps/chosen": -53.7916374206543, "logps/rejected": -55.527015686035156, "loss": 0.6857, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01989469677209854, "rewards/margins": 0.015467017889022827, "rewards/rejected": -0.03536171466112137, "step": 1800 }, { "epoch": 0.31013094417643006, "eval_logits/chosen": -3.130061149597168, "eval_logits/rejected": -3.124415397644043, "eval_logps/chosen": -58.86876678466797, "eval_logps/rejected": -63.78819274902344, "eval_loss": 0.6909947991371155, "eval_rewards/accuracies": 0.5734200477600098, "eval_rewards/chosen": -0.0015687240520492196, "eval_rewards/margins": 0.004511936567723751, "eval_rewards/rejected": -0.0060806600376963615, "eval_runtime": 384.0945, "eval_samples_per_second": 11.206, "eval_steps_per_second": 1.401, "step": 1800 }, { "epoch": 0.3118538938662991, "grad_norm": 2.5181891918182373, "learning_rate": 9.999535363995486e-08, "logits/chosen": -3.107663869857788, "logits/rejected": -3.0721559524536133, "logps/chosen": -58.23188400268555, "logps/rejected": -56.67511749267578, "loss": 0.6866, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.016147421672940254, "rewards/margins": 0.013697738759219646, "rewards/rejected": -0.029845163226127625, "step": 1810 }, { "epoch": 0.31357684355616816, "grad_norm": 2.3864996433258057, "learning_rate": 9.999388660980235e-08, "logits/chosen": -3.0781984329223633, "logits/rejected": -3.043715000152588, "logps/chosen": -57.25934982299805, "logps/rejected": -53.792823791503906, "loss": 0.6884, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.020972583442926407, "rewards/margins": 0.010134930722415447, "rewards/rejected": -0.03110751509666443, "step": 1820 }, { "epoch": 0.31529979324603724, "grad_norm": 2.3336403369903564, "learning_rate": 9.999221863404672e-08, "logits/chosen": -2.9730467796325684, "logits/rejected": -2.9593658447265625, "logps/chosen": -57.46650314331055, "logps/rejected": -57.38367462158203, "loss": 0.6898, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.019232075661420822, "rewards/margins": 0.007276470772922039, "rewards/rejected": -0.026508551090955734, "step": 1830 }, { "epoch": 0.31702274293590627, "grad_norm": 2.525904417037964, "learning_rate": 9.999034971939226e-08, "logits/chosen": -3.1814403533935547, "logits/rejected": -3.1774861812591553, "logps/chosen": -57.478782653808594, "logps/rejected": -58.26776123046875, "loss": 0.6906, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.026586761698126793, "rewards/margins": 0.00562839861959219, "rewards/rejected": -0.03221515938639641, "step": 1840 }, { "epoch": 0.31874569262577535, "grad_norm": 2.4550936222076416, "learning_rate": 9.998827987335088e-08, "logits/chosen": -3.0470614433288574, "logits/rejected": -3.047095775604248, "logps/chosen": -55.64310836791992, "logps/rejected": -57.90021896362305, "loss": 0.6897, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.02433396503329277, "rewards/margins": 0.007592732552438974, "rewards/rejected": -0.03192669898271561, "step": 1850 }, { "epoch": 0.32046864231564437, "grad_norm": 2.6183524131774902, "learning_rate": 9.998600910424211e-08, "logits/chosen": -2.979332447052002, "logits/rejected": -2.9432613849639893, "logps/chosen": -58.84608840942383, "logps/rejected": -57.41669845581055, "loss": 0.6829, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.01554133277386427, "rewards/margins": 0.02127157151699066, "rewards/rejected": -0.036812908947467804, "step": 1860 }, { "epoch": 0.32219159200551345, "grad_norm": 2.7818078994750977, "learning_rate": 9.99835374211931e-08, "logits/chosen": -3.025416135787964, "logits/rejected": -2.9999618530273438, "logps/chosen": -58.08173751831055, "logps/rejected": -58.033287048339844, "loss": 0.683, "rewards/accuracies": 0.65625, "rewards/chosen": -0.021320369094610214, "rewards/margins": 0.021128328517079353, "rewards/rejected": -0.04244869947433472, "step": 1870 }, { "epoch": 0.3239145416953825, "grad_norm": 2.4889914989471436, "learning_rate": 9.998086483413856e-08, "logits/chosen": -3.070798397064209, "logits/rejected": -3.033024549484253, "logps/chosen": -54.67546844482422, "logps/rejected": -55.34244918823242, "loss": 0.684, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.019596396014094353, "rewards/margins": 0.01912146620452404, "rewards/rejected": -0.038717858493328094, "step": 1880 }, { "epoch": 0.32563749138525155, "grad_norm": 2.650590419769287, "learning_rate": 9.997799135382066e-08, "logits/chosen": -3.093008279800415, "logits/rejected": -3.087238073348999, "logps/chosen": -54.76836013793945, "logps/rejected": -58.22447967529297, "loss": 0.6832, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.017106035724282265, "rewards/margins": 0.020660221576690674, "rewards/rejected": -0.03776625543832779, "step": 1890 }, { "epoch": 0.32736044107512063, "grad_norm": 2.6379759311676025, "learning_rate": 9.997491699178911e-08, "logits/chosen": -3.0253844261169434, "logits/rejected": -2.992520332336426, "logps/chosen": -59.2156982421875, "logps/rejected": -55.42045974731445, "loss": 0.6866, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.017759006470441818, "rewards/margins": 0.013827571645379066, "rewards/rejected": -0.031586579978466034, "step": 1900 }, { "epoch": 0.32736044107512063, "eval_logits/chosen": -3.124460220336914, "eval_logits/rejected": -3.1187853813171387, "eval_logps/chosen": -59.09389114379883, "eval_logps/rejected": -64.0829849243164, "eval_loss": 0.690680742263794, "eval_rewards/accuracies": 0.5843401551246643, "eval_rewards/chosen": -0.003819952253252268, "eval_rewards/margins": 0.005208645015954971, "eval_rewards/rejected": -0.009028596803545952, "eval_runtime": 384.1711, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 1900 }, { "epoch": 0.32908339076498966, "grad_norm": 2.629287004470825, "learning_rate": 9.997164176040098e-08, "logits/chosen": -2.9245457649230957, "logits/rejected": -2.8900465965270996, "logps/chosen": -57.03288650512695, "logps/rejected": -57.40998458862305, "loss": 0.6858, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.025651965290308, "rewards/margins": 0.01547262817621231, "rewards/rejected": -0.04112459346652031, "step": 1910 }, { "epoch": 0.33080634045485874, "grad_norm": 2.4715516567230225, "learning_rate": 9.996816567282078e-08, "logits/chosen": -3.037757396697998, "logits/rejected": -3.009986400604248, "logps/chosen": -57.68037796020508, "logps/rejected": -57.707550048828125, "loss": 0.6851, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0231170617043972, "rewards/margins": 0.016938531771302223, "rewards/rejected": -0.040055595338344574, "step": 1920 }, { "epoch": 0.33252929014472776, "grad_norm": 2.241358757019043, "learning_rate": 9.996448874302028e-08, "logits/chosen": -2.9859111309051514, "logits/rejected": -2.9490742683410645, "logps/chosen": -56.32316207885742, "logps/rejected": -57.95221710205078, "loss": 0.6852, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.024327397346496582, "rewards/margins": 0.016690973192453384, "rewards/rejected": -0.041018370538949966, "step": 1930 }, { "epoch": 0.33425223983459684, "grad_norm": 2.3036279678344727, "learning_rate": 9.996061098577856e-08, "logits/chosen": -3.008873462677002, "logits/rejected": -2.99051570892334, "logps/chosen": -53.86616134643555, "logps/rejected": -54.19719696044922, "loss": 0.6873, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.029261711984872818, "rewards/margins": 0.012471325695514679, "rewards/rejected": -0.0417330376803875, "step": 1940 }, { "epoch": 0.33597518952446587, "grad_norm": 2.34013295173645, "learning_rate": 9.995653241668189e-08, "logits/chosen": -3.0146219730377197, "logits/rejected": -3.0129730701446533, "logps/chosen": -55.39385223388672, "logps/rejected": -59.005035400390625, "loss": 0.6869, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.028862157836556435, "rewards/margins": 0.013413484208285809, "rewards/rejected": -0.04227564111351967, "step": 1950 }, { "epoch": 0.33769813921433495, "grad_norm": 2.4543216228485107, "learning_rate": 9.995225305212369e-08, "logits/chosen": -3.020707368850708, "logits/rejected": -3.00138521194458, "logps/chosen": -56.99605178833008, "logps/rejected": -58.58466339111328, "loss": 0.6863, "rewards/accuracies": 0.625, "rewards/chosen": -0.02636531926691532, "rewards/margins": 0.01451261155307293, "rewards/rejected": -0.04087793081998825, "step": 1960 }, { "epoch": 0.33942108890420397, "grad_norm": 2.306859016418457, "learning_rate": 9.994777290930442e-08, "logits/chosen": -3.0646586418151855, "logits/rejected": -3.0352931022644043, "logps/chosen": -58.447509765625, "logps/rejected": -56.149253845214844, "loss": 0.6836, "rewards/accuracies": 0.625, "rewards/chosen": -0.023009028285741806, "rewards/margins": 0.020027779042720795, "rewards/rejected": -0.0430368110537529, "step": 1970 }, { "epoch": 0.34114403859407305, "grad_norm": 2.5643606185913086, "learning_rate": 9.994309200623163e-08, "logits/chosen": -2.9704651832580566, "logits/rejected": -2.9648497104644775, "logps/chosen": -58.838722229003906, "logps/rejected": -57.81995391845703, "loss": 0.6909, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.032509177923202515, "rewards/margins": 0.005296523217111826, "rewards/rejected": -0.037805698812007904, "step": 1980 }, { "epoch": 0.34286698828394213, "grad_norm": 2.5272481441497803, "learning_rate": 9.993821036171974e-08, "logits/chosen": -3.0978000164031982, "logits/rejected": -3.0630557537078857, "logps/chosen": -53.90228271484375, "logps/rejected": -53.10187530517578, "loss": 0.6844, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.030678829178214073, "rewards/margins": 0.018644079566001892, "rewards/rejected": -0.049322910606861115, "step": 1990 }, { "epoch": 0.34458993797381116, "grad_norm": 2.7310101985931396, "learning_rate": 9.993312799539004e-08, "logits/chosen": -3.019115686416626, "logits/rejected": -3.025444507598877, "logps/chosen": -53.52924346923828, "logps/rejected": -62.47046661376953, "loss": 0.6872, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03568344935774803, "rewards/margins": 0.012897400185465813, "rewards/rejected": -0.048580851405858994, "step": 2000 }, { "epoch": 0.34458993797381116, "eval_logits/chosen": -3.11759614944458, "eval_logits/rejected": -3.1119582653045654, "eval_logps/chosen": -59.45721435546875, "eval_logps/rejected": -64.52278137207031, "eval_loss": 0.6903449892997742, "eval_rewards/accuracies": 0.5861988663673401, "eval_rewards/chosen": -0.0074531808495521545, "eval_rewards/margins": 0.00597338005900383, "eval_rewards/rejected": -0.013426561839878559, "eval_runtime": 383.6437, "eval_samples_per_second": 11.219, "eval_steps_per_second": 1.402, "step": 2000 }, { "epoch": 0.34631288766368024, "grad_norm": 2.5221426486968994, "learning_rate": 9.992784492767061e-08, "logits/chosen": -3.0243418216705322, "logits/rejected": -2.9959535598754883, "logps/chosen": -58.401031494140625, "logps/rejected": -56.41370391845703, "loss": 0.6846, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02799166366457939, "rewards/margins": 0.018121318891644478, "rewards/rejected": -0.04611298069357872, "step": 2010 }, { "epoch": 0.34803583735354926, "grad_norm": 2.3270416259765625, "learning_rate": 9.992236117979623e-08, "logits/chosen": -3.0124897956848145, "logits/rejected": -2.9907262325286865, "logps/chosen": -53.28919219970703, "logps/rejected": -59.992469787597656, "loss": 0.6844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03459874913096428, "rewards/margins": 0.01853993721306324, "rewards/rejected": -0.05313868448138237, "step": 2020 }, { "epoch": 0.34975878704341834, "grad_norm": 2.6218039989471436, "learning_rate": 9.991667677380831e-08, "logits/chosen": -3.099220037460327, "logits/rejected": -3.0776753425598145, "logps/chosen": -60.10406494140625, "logps/rejected": -59.94960403442383, "loss": 0.6851, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.030517857521772385, "rewards/margins": 0.017163077369332314, "rewards/rejected": -0.04768093675374985, "step": 2030 }, { "epoch": 0.35148173673328736, "grad_norm": 2.4967219829559326, "learning_rate": 9.991079173255476e-08, "logits/chosen": -2.962681531906128, "logits/rejected": -2.9499523639678955, "logps/chosen": -56.1843376159668, "logps/rejected": -58.91411209106445, "loss": 0.6862, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.034683868288993835, "rewards/margins": 0.014922414906322956, "rewards/rejected": -0.049606285989284515, "step": 2040 }, { "epoch": 0.35320468642315644, "grad_norm": 2.641596794128418, "learning_rate": 9.990470607968994e-08, "logits/chosen": -3.0792365074157715, "logits/rejected": -3.0543460845947266, "logps/chosen": -54.29334259033203, "logps/rejected": -58.872283935546875, "loss": 0.6866, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.030713215470314026, "rewards/margins": 0.013904750347137451, "rewards/rejected": -0.04461796581745148, "step": 2050 }, { "epoch": 0.3549276361130255, "grad_norm": 2.320413589477539, "learning_rate": 9.989841983967456e-08, "logits/chosen": -3.066166400909424, "logits/rejected": -3.027841567993164, "logps/chosen": -58.014869689941406, "logps/rejected": -56.31104278564453, "loss": 0.6834, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.026854073628783226, "rewards/margins": 0.020434152334928513, "rewards/rejected": -0.04728822782635689, "step": 2060 }, { "epoch": 0.35665058580289455, "grad_norm": 2.640760898590088, "learning_rate": 9.989193303777551e-08, "logits/chosen": -3.0829617977142334, "logits/rejected": -3.0598063468933105, "logps/chosen": -58.368186950683594, "logps/rejected": -59.42899703979492, "loss": 0.6859, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.030749907717108727, "rewards/margins": 0.015715401619672775, "rewards/rejected": -0.04646530747413635, "step": 2070 }, { "epoch": 0.35837353549276363, "grad_norm": 2.6373374462127686, "learning_rate": 9.988524570006591e-08, "logits/chosen": -3.0231032371520996, "logits/rejected": -2.993591785430908, "logps/chosen": -55.636573791503906, "logps/rejected": -56.2973518371582, "loss": 0.6842, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03768758103251457, "rewards/margins": 0.018977124243974686, "rewards/rejected": -0.05666470527648926, "step": 2080 }, { "epoch": 0.36009648518263265, "grad_norm": 2.432570457458496, "learning_rate": 9.987835785342484e-08, "logits/chosen": -3.047208547592163, "logits/rejected": -3.0437216758728027, "logps/chosen": -57.70501708984375, "logps/rejected": -59.10963821411133, "loss": 0.6879, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03310250863432884, "rewards/margins": 0.01140835415571928, "rewards/rejected": -0.0445108599960804, "step": 2090 }, { "epoch": 0.36181943487250173, "grad_norm": 2.84558367729187, "learning_rate": 9.987126952553735e-08, "logits/chosen": -2.9957191944122314, "logits/rejected": -2.9737207889556885, "logps/chosen": -59.138671875, "logps/rejected": -56.697593688964844, "loss": 0.6854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03792215511202812, "rewards/margins": 0.01656353287398815, "rewards/rejected": -0.054485686123371124, "step": 2100 }, { "epoch": 0.36181943487250173, "eval_logits/chosen": -3.111330270767212, "eval_logits/rejected": -3.1056759357452393, "eval_logps/chosen": -59.95340347290039, "eval_logps/rejected": -65.123046875, "eval_loss": 0.689890444278717, "eval_rewards/accuracies": 0.5813196897506714, "eval_rewards/chosen": -0.012415084056556225, "eval_rewards/margins": 0.007014105096459389, "eval_rewards/rejected": -0.01942918822169304, "eval_runtime": 383.7312, "eval_samples_per_second": 11.216, "eval_steps_per_second": 1.402, "step": 2100 }, { "epoch": 0.36354238456237076, "grad_norm": 2.4933652877807617, "learning_rate": 9.986398074489428e-08, "logits/chosen": -3.051694869995117, "logits/rejected": -3.0545380115509033, "logps/chosen": -53.636512756347656, "logps/rejected": -60.6682243347168, "loss": 0.6901, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.047368988394737244, "rewards/margins": 0.00741946417838335, "rewards/rejected": -0.05478845164179802, "step": 2110 }, { "epoch": 0.36526533425223984, "grad_norm": 2.5531551837921143, "learning_rate": 9.985649154079221e-08, "logits/chosen": -2.9160945415496826, "logits/rejected": -2.888899326324463, "logps/chosen": -56.72404098510742, "logps/rejected": -55.765869140625, "loss": 0.6853, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.031483907252550125, "rewards/margins": 0.01687944494187832, "rewards/rejected": -0.048363346606492996, "step": 2120 }, { "epoch": 0.3669882839421089, "grad_norm": 2.257911205291748, "learning_rate": 9.984880194333322e-08, "logits/chosen": -2.9528069496154785, "logits/rejected": -2.934330463409424, "logps/chosen": -58.443580627441406, "logps/rejected": -58.60265350341797, "loss": 0.6833, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03973262384533882, "rewards/margins": 0.020924894139170647, "rewards/rejected": -0.06065751239657402, "step": 2130 }, { "epoch": 0.36871123363197794, "grad_norm": 2.5291523933410645, "learning_rate": 9.984091198342495e-08, "logits/chosen": -2.9302122592926025, "logits/rejected": -2.922152042388916, "logps/chosen": -54.165618896484375, "logps/rejected": -59.854225158691406, "loss": 0.6898, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.055104512721300125, "rewards/margins": 0.007914445362985134, "rewards/rejected": -0.06301895529031754, "step": 2140 }, { "epoch": 0.370434183321847, "grad_norm": 2.5453853607177734, "learning_rate": 9.983282169278032e-08, "logits/chosen": -2.9811716079711914, "logits/rejected": -2.9349172115325928, "logps/chosen": -57.90852737426758, "logps/rejected": -54.29497146606445, "loss": 0.6786, "rewards/accuracies": 0.65625, "rewards/chosen": -0.032199397683143616, "rewards/margins": 0.030662229284644127, "rewards/rejected": -0.06286162883043289, "step": 2150 }, { "epoch": 0.37215713301171605, "grad_norm": 2.5689234733581543, "learning_rate": 9.982453110391746e-08, "logits/chosen": -2.9477953910827637, "logits/rejected": -2.932558536529541, "logps/chosen": -58.82707595825195, "logps/rejected": -55.440582275390625, "loss": 0.6846, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04070050269365311, "rewards/margins": 0.018166832625865936, "rewards/rejected": -0.05886733531951904, "step": 2160 }, { "epoch": 0.3738800827015851, "grad_norm": 2.6440985202789307, "learning_rate": 9.981604025015961e-08, "logits/chosen": -3.068624496459961, "logits/rejected": -3.052961826324463, "logps/chosen": -58.257469177246094, "logps/rejected": -57.409690856933594, "loss": 0.6836, "rewards/accuracies": 0.65625, "rewards/chosen": -0.039262399077415466, "rewards/margins": 0.020013421773910522, "rewards/rejected": -0.059275828301906586, "step": 2170 }, { "epoch": 0.37560303239145415, "grad_norm": 2.8089723587036133, "learning_rate": 9.980734916563493e-08, "logits/chosen": -3.116698741912842, "logits/rejected": -3.0789332389831543, "logps/chosen": -62.264122009277344, "logps/rejected": -59.9702033996582, "loss": 0.679, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04240008071064949, "rewards/margins": 0.029958883300423622, "rewards/rejected": -0.07235896587371826, "step": 2180 }, { "epoch": 0.37732598208132323, "grad_norm": 2.7721993923187256, "learning_rate": 9.97984578852764e-08, "logits/chosen": -3.1796822547912598, "logits/rejected": -3.173408031463623, "logps/chosen": -57.43373489379883, "logps/rejected": -59.5949592590332, "loss": 0.6881, "rewards/accuracies": 0.5625, "rewards/chosen": -0.045918986201286316, "rewards/margins": 0.011477927677333355, "rewards/rejected": -0.05739691108465195, "step": 2190 }, { "epoch": 0.37904893177119225, "grad_norm": 2.6272075176239014, "learning_rate": 9.978936644482165e-08, "logits/chosen": -3.0088448524475098, "logits/rejected": -2.985863447189331, "logps/chosen": -56.26617431640625, "logps/rejected": -61.8056640625, "loss": 0.6786, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03957448527216911, "rewards/margins": 0.03054499626159668, "rewards/rejected": -0.07011948525905609, "step": 2200 }, { "epoch": 0.37904893177119225, "eval_logits/chosen": -3.103456735610962, "eval_logits/rejected": -3.0977530479431152, "eval_logps/chosen": -60.563751220703125, "eval_logps/rejected": -65.85384368896484, "eval_loss": 0.6893664598464966, "eval_rewards/accuracies": 0.5836431384086609, "eval_rewards/chosen": -0.018518557772040367, "eval_rewards/margins": 0.008218604139983654, "eval_rewards/rejected": -0.026737162843346596, "eval_runtime": 384.474, "eval_samples_per_second": 11.195, "eval_steps_per_second": 1.399, "step": 2200 }, { "epoch": 0.38077188146106133, "grad_norm": 2.6518142223358154, "learning_rate": 9.978007488081286e-08, "logits/chosen": -3.0658512115478516, "logits/rejected": -3.0496134757995605, "logps/chosen": -58.498046875, "logps/rejected": -59.838340759277344, "loss": 0.6821, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03771699219942093, "rewards/margins": 0.02357042208313942, "rewards/rejected": -0.061287421733140945, "step": 2210 }, { "epoch": 0.3824948311509304, "grad_norm": 2.6780636310577393, "learning_rate": 9.977058323059658e-08, "logits/chosen": -2.8927483558654785, "logits/rejected": -2.885488986968994, "logps/chosen": -57.95989227294922, "logps/rejected": -59.91791915893555, "loss": 0.684, "rewards/accuracies": 0.5625, "rewards/chosen": -0.040818952023983, "rewards/margins": 0.019889000803232193, "rewards/rejected": -0.060707949101924896, "step": 2220 }, { "epoch": 0.38421778084079944, "grad_norm": 2.4511308670043945, "learning_rate": 9.976089153232354e-08, "logits/chosen": -3.0489158630371094, "logits/rejected": -3.0171875953674316, "logps/chosen": -59.09993362426758, "logps/rejected": -57.86940383911133, "loss": 0.6854, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.054223526269197464, "rewards/margins": 0.017050547525286674, "rewards/rejected": -0.07127406448125839, "step": 2230 }, { "epoch": 0.3859407305306685, "grad_norm": 2.8595824241638184, "learning_rate": 9.975099982494864e-08, "logits/chosen": -3.0016238689422607, "logits/rejected": -2.9658570289611816, "logps/chosen": -61.5693473815918, "logps/rejected": -60.30341720581055, "loss": 0.6842, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.047597117722034454, "rewards/margins": 0.019501682370901108, "rewards/rejected": -0.06709879636764526, "step": 2240 }, { "epoch": 0.38766368022053754, "grad_norm": 2.8945720195770264, "learning_rate": 9.974090814823062e-08, "logits/chosen": -2.960645914077759, "logits/rejected": -2.9262959957122803, "logps/chosen": -59.94556427001953, "logps/rejected": -60.19099044799805, "loss": 0.6827, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.04595700651407242, "rewards/margins": 0.0224994458258152, "rewards/rejected": -0.06845644861459732, "step": 2250 }, { "epoch": 0.3893866299104066, "grad_norm": 2.5157716274261475, "learning_rate": 9.9730616542732e-08, "logits/chosen": -2.9172933101654053, "logits/rejected": -2.8915657997131348, "logps/chosen": -65.81031036376953, "logps/rejected": -64.24557495117188, "loss": 0.6847, "rewards/accuracies": 0.5625, "rewards/chosen": -0.050026725977659225, "rewards/margins": 0.018777305260300636, "rewards/rejected": -0.06880402565002441, "step": 2260 }, { "epoch": 0.39110957960027565, "grad_norm": 2.750781297683716, "learning_rate": 9.972012504981892e-08, "logits/chosen": -2.902327299118042, "logits/rejected": -2.8693079948425293, "logps/chosen": -59.975929260253906, "logps/rejected": -59.61238479614258, "loss": 0.6842, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.059145379811525345, "rewards/margins": 0.019813846796751022, "rewards/rejected": -0.07895922660827637, "step": 2270 }, { "epoch": 0.3928325292901447, "grad_norm": 2.7032175064086914, "learning_rate": 9.970943371166087e-08, "logits/chosen": -2.9799790382385254, "logits/rejected": -2.969865083694458, "logps/chosen": -58.25428009033203, "logps/rejected": -61.76343536376953, "loss": 0.6866, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.04810738563537598, "rewards/margins": 0.014225492253899574, "rewards/rejected": -0.0623328797519207, "step": 2280 }, { "epoch": 0.3945554789800138, "grad_norm": 2.7236714363098145, "learning_rate": 9.969854257123071e-08, "logits/chosen": -2.8744850158691406, "logits/rejected": -2.846024513244629, "logps/chosen": -58.028656005859375, "logps/rejected": -61.29276657104492, "loss": 0.6819, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.05145770311355591, "rewards/margins": 0.02386692725121975, "rewards/rejected": -0.07532462477684021, "step": 2290 }, { "epoch": 0.39627842866988283, "grad_norm": 2.628142833709717, "learning_rate": 9.968745167230428e-08, "logits/chosen": -3.0581414699554443, "logits/rejected": -3.0243396759033203, "logps/chosen": -60.101112365722656, "logps/rejected": -60.5775032043457, "loss": 0.6801, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.04156041145324707, "rewards/margins": 0.02765594981610775, "rewards/rejected": -0.06921636313199997, "step": 2300 }, { "epoch": 0.39627842866988283, "eval_logits/chosen": -3.096874237060547, "eval_logits/rejected": -3.091181755065918, "eval_logps/chosen": -61.009498596191406, "eval_logps/rejected": -66.41000366210938, "eval_loss": 0.6888702511787415, "eval_rewards/accuracies": 0.5915427803993225, "eval_rewards/chosen": -0.022975951433181763, "eval_rewards/margins": 0.00932283978909254, "eval_rewards/rejected": -0.03229879215359688, "eval_runtime": 384.0977, "eval_samples_per_second": 11.205, "eval_steps_per_second": 1.401, "step": 2300 }, { "epoch": 0.3980013783597519, "grad_norm": 2.7111597061157227, "learning_rate": 9.967616105946042e-08, "logits/chosen": -2.959782838821411, "logits/rejected": -2.944237232208252, "logps/chosen": -56.70935821533203, "logps/rejected": -59.37823486328125, "loss": 0.6824, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.05631225183606148, "rewards/margins": 0.02312556840479374, "rewards/rejected": -0.07943782210350037, "step": 2310 }, { "epoch": 0.39972432804962094, "grad_norm": 2.6125195026397705, "learning_rate": 9.966467077808063e-08, "logits/chosen": -3.0550694465637207, "logits/rejected": -3.01396107673645, "logps/chosen": -62.40388870239258, "logps/rejected": -59.26466751098633, "loss": 0.6782, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0496249794960022, "rewards/margins": 0.03154844418168068, "rewards/rejected": -0.08117341250181198, "step": 2320 }, { "epoch": 0.40144727773949, "grad_norm": 2.649064064025879, "learning_rate": 9.965298087434898e-08, "logits/chosen": -2.9658901691436768, "logits/rejected": -2.9439010620117188, "logps/chosen": -61.02021026611328, "logps/rejected": -60.8330078125, "loss": 0.6772, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.052108265459537506, "rewards/margins": 0.03366889804601669, "rewards/rejected": -0.0857771709561348, "step": 2330 }, { "epoch": 0.40317022742935904, "grad_norm": 2.5695133209228516, "learning_rate": 9.964109139525195e-08, "logits/chosen": -3.006885051727295, "logits/rejected": -2.9949378967285156, "logps/chosen": -58.57697677612305, "logps/rejected": -63.1572380065918, "loss": 0.6866, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.06275421380996704, "rewards/margins": 0.015041634440422058, "rewards/rejected": -0.0777958482503891, "step": 2340 }, { "epoch": 0.4048931771192281, "grad_norm": 2.731903076171875, "learning_rate": 9.962900238857812e-08, "logits/chosen": -2.9901535511016846, "logits/rejected": -2.967587947845459, "logps/chosen": -63.169761657714844, "logps/rejected": -63.23868942260742, "loss": 0.6789, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.04908466711640358, "rewards/margins": 0.030140314251184464, "rewards/rejected": -0.07922497391700745, "step": 2350 }, { "epoch": 0.4066161268090972, "grad_norm": 2.462651014328003, "learning_rate": 9.96167139029181e-08, "logits/chosen": -3.009126663208008, "logits/rejected": -2.996079921722412, "logps/chosen": -56.32133865356445, "logps/rejected": -58.798248291015625, "loss": 0.6852, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.05959261581301689, "rewards/margins": 0.01709309034049511, "rewards/rejected": -0.07668570429086685, "step": 2360 }, { "epoch": 0.4083390764989662, "grad_norm": 2.946608304977417, "learning_rate": 9.960422598766427e-08, "logits/chosen": -3.023228406906128, "logits/rejected": -3.016806125640869, "logps/chosen": -59.85640335083008, "logps/rejected": -62.530982971191406, "loss": 0.6809, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.060944247990846634, "rewards/margins": 0.02571847476065159, "rewards/rejected": -0.08666272461414337, "step": 2370 }, { "epoch": 0.4100620261888353, "grad_norm": 2.487330198287964, "learning_rate": 9.95915386930106e-08, "logits/chosen": -2.949016809463501, "logits/rejected": -2.924262762069702, "logps/chosen": -59.72568893432617, "logps/rejected": -60.83997344970703, "loss": 0.6809, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.05383528396487236, "rewards/margins": 0.02591845951974392, "rewards/rejected": -0.07975375652313232, "step": 2380 }, { "epoch": 0.41178497587870433, "grad_norm": 2.6496143341064453, "learning_rate": 9.957865206995243e-08, "logits/chosen": -3.033184766769409, "logits/rejected": -3.006194591522217, "logps/chosen": -62.02552032470703, "logps/rejected": -61.70001220703125, "loss": 0.6799, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.05791522189974785, "rewards/margins": 0.02814369462430477, "rewards/rejected": -0.08605891466140747, "step": 2390 }, { "epoch": 0.4135079255685734, "grad_norm": 2.566384792327881, "learning_rate": 9.956556617028632e-08, "logits/chosen": -3.080162763595581, "logits/rejected": -3.059443473815918, "logps/chosen": -58.2050666809082, "logps/rejected": -61.38813018798828, "loss": 0.683, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.05904518440365791, "rewards/margins": 0.022201133891940117, "rewards/rejected": -0.08124633133411407, "step": 2400 }, { "epoch": 0.4135079255685734, "eval_logits/chosen": -3.088081121444702, "eval_logits/rejected": -3.0823891162872314, "eval_logps/chosen": -61.75593185424805, "eval_logps/rejected": -67.30512237548828, "eval_loss": 0.6882190108299255, "eval_rewards/accuracies": 0.586663544178009, "eval_rewards/chosen": -0.030440330505371094, "eval_rewards/margins": 0.010809744708240032, "eval_rewards/rejected": -0.0412500761449337, "eval_runtime": 384.1108, "eval_samples_per_second": 11.205, "eval_steps_per_second": 1.401, "step": 2400 }, { "epoch": 0.41523087525844243, "grad_norm": 2.3918588161468506, "learning_rate": 9.955228104660978e-08, "logits/chosen": -3.053192615509033, "logits/rejected": -3.013669729232788, "logps/chosen": -59.11090850830078, "logps/rejected": -58.06081008911133, "loss": 0.6764, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.05260564014315605, "rewards/margins": 0.035319771617650986, "rewards/rejected": -0.08792541176080704, "step": 2410 }, { "epoch": 0.4169538249483115, "grad_norm": 2.5461032390594482, "learning_rate": 9.953879675232106e-08, "logits/chosen": -3.0487606525421143, "logits/rejected": -3.02602219581604, "logps/chosen": -62.342079162597656, "logps/rejected": -63.47113037109375, "loss": 0.6821, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.06159963086247444, "rewards/margins": 0.024086806923151016, "rewards/rejected": -0.08568643033504486, "step": 2420 }, { "epoch": 0.41867677463818054, "grad_norm": 2.704826593399048, "learning_rate": 9.952511334161901e-08, "logits/chosen": -3.0106520652770996, "logits/rejected": -2.988302230834961, "logps/chosen": -60.93560028076172, "logps/rejected": -60.20219802856445, "loss": 0.6823, "rewards/accuracies": 0.59375, "rewards/chosen": -0.060926198959350586, "rewards/margins": 0.023762371391057968, "rewards/rejected": -0.08468855917453766, "step": 2430 }, { "epoch": 0.4203997243280496, "grad_norm": 2.968585729598999, "learning_rate": 9.951123086950277e-08, "logits/chosen": -3.0059704780578613, "logits/rejected": -2.992574453353882, "logps/chosen": -60.81300735473633, "logps/rejected": -63.385353088378906, "loss": 0.6786, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.05756126716732979, "rewards/margins": 0.03129119426012039, "rewards/rejected": -0.08885245025157928, "step": 2440 }, { "epoch": 0.4221226740179187, "grad_norm": 2.6429266929626465, "learning_rate": 9.949714939177159e-08, "logits/chosen": -2.962749481201172, "logits/rejected": -2.939649820327759, "logps/chosen": -60.22247314453125, "logps/rejected": -61.68815231323242, "loss": 0.6826, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.07164134085178375, "rewards/margins": 0.022660810500383377, "rewards/rejected": -0.09430215507745743, "step": 2450 }, { "epoch": 0.4238456237077877, "grad_norm": 2.7109174728393555, "learning_rate": 9.94828689650246e-08, "logits/chosen": -2.9447712898254395, "logits/rejected": -2.912111759185791, "logps/chosen": -62.50591278076172, "logps/rejected": -61.696937561035156, "loss": 0.6801, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.06364482641220093, "rewards/margins": 0.027994727715849876, "rewards/rejected": -0.09163956344127655, "step": 2460 }, { "epoch": 0.4255685733976568, "grad_norm": 2.8311927318573, "learning_rate": 9.946838964666062e-08, "logits/chosen": -3.1035609245300293, "logits/rejected": -3.0822768211364746, "logps/chosen": -60.968597412109375, "logps/rejected": -61.770057678222656, "loss": 0.6785, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.059243809431791306, "rewards/margins": 0.03137410059571266, "rewards/rejected": -0.09061791002750397, "step": 2470 }, { "epoch": 0.4272915230875258, "grad_norm": 2.8282628059387207, "learning_rate": 9.945371149487787e-08, "logits/chosen": -3.0006637573242188, "logits/rejected": -2.9696104526519775, "logps/chosen": -62.03342819213867, "logps/rejected": -59.79804229736328, "loss": 0.6853, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.076552614569664, "rewards/margins": 0.0171709842979908, "rewards/rejected": -0.0937236100435257, "step": 2480 }, { "epoch": 0.4290144727773949, "grad_norm": 2.5286202430725098, "learning_rate": 9.943883456867374e-08, "logits/chosen": -2.983997344970703, "logits/rejected": -2.975327253341675, "logps/chosen": -57.564537048339844, "logps/rejected": -60.89754104614258, "loss": 0.6853, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.07525698840618134, "rewards/margins": 0.017769118770956993, "rewards/rejected": -0.09302611649036407, "step": 2490 }, { "epoch": 0.43073742246726393, "grad_norm": 2.9713549613952637, "learning_rate": 9.942375892784464e-08, "logits/chosen": -3.0604982376098633, "logits/rejected": -3.0365564823150635, "logps/chosen": -64.35960388183594, "logps/rejected": -65.81145477294922, "loss": 0.6853, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07468898594379425, "rewards/margins": 0.017729226499795914, "rewards/rejected": -0.09241821616888046, "step": 2500 }, { "epoch": 0.43073742246726393, "eval_logits/chosen": -3.079015016555786, "eval_logits/rejected": -3.073314905166626, "eval_logps/chosen": -62.63667678833008, "eval_logps/rejected": -68.33291625976562, "eval_loss": 0.6875827312469482, "eval_rewards/accuracies": 0.5841078162193298, "eval_rewards/chosen": -0.039247795939445496, "eval_rewards/margins": 0.012280100956559181, "eval_rewards/rejected": -0.051527902483940125, "eval_runtime": 384.3755, "eval_samples_per_second": 11.197, "eval_steps_per_second": 1.4, "step": 2500 }, { "epoch": 0.432460372157133, "grad_norm": 2.7617154121398926, "learning_rate": 9.940848463298563e-08, "logits/chosen": -2.9101836681365967, "logits/rejected": -2.900426149368286, "logps/chosen": -59.78874588012695, "logps/rejected": -62.5969352722168, "loss": 0.678, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.07403077185153961, "rewards/margins": 0.03212301433086395, "rewards/rejected": -0.10615377128124237, "step": 2510 }, { "epoch": 0.4341833218470021, "grad_norm": 2.5668039321899414, "learning_rate": 9.939301174549025e-08, "logits/chosen": -2.8987350463867188, "logits/rejected": -2.8712899684906006, "logps/chosen": -59.41999435424805, "logps/rejected": -59.10844039916992, "loss": 0.6784, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06951779872179031, "rewards/margins": 0.03154792636632919, "rewards/rejected": -0.10106571763753891, "step": 2520 }, { "epoch": 0.4359062715368711, "grad_norm": 2.6358416080474854, "learning_rate": 9.93773403275503e-08, "logits/chosen": -2.9857163429260254, "logits/rejected": -2.9810783863067627, "logps/chosen": -59.38280487060547, "logps/rejected": -63.86140823364258, "loss": 0.6868, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08213236927986145, "rewards/margins": 0.014956777915358543, "rewards/rejected": -0.09708914905786514, "step": 2530 }, { "epoch": 0.4376292212267402, "grad_norm": 2.670677661895752, "learning_rate": 9.936147044215552e-08, "logits/chosen": -3.025181531906128, "logits/rejected": -3.014481544494629, "logps/chosen": -62.18967819213867, "logps/rejected": -65.07129669189453, "loss": 0.6813, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08649241179227829, "rewards/margins": 0.025742972269654274, "rewards/rejected": -0.11223538219928741, "step": 2540 }, { "epoch": 0.4393521709166092, "grad_norm": 2.7967097759246826, "learning_rate": 9.934540215309342e-08, "logits/chosen": -2.969196319580078, "logits/rejected": -2.9364800453186035, "logps/chosen": -65.66370391845703, "logps/rejected": -64.30455017089844, "loss": 0.6781, "rewards/accuracies": 0.625, "rewards/chosen": -0.06980601698160172, "rewards/margins": 0.03352366015315056, "rewards/rejected": -0.10332968086004257, "step": 2550 }, { "epoch": 0.4410751206064783, "grad_norm": 2.8186585903167725, "learning_rate": 9.932913552494887e-08, "logits/chosen": -3.057368755340576, "logits/rejected": -3.032867670059204, "logps/chosen": -62.49481201171875, "logps/rejected": -64.1727066040039, "loss": 0.6797, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.072344571352005, "rewards/margins": 0.02918105758726597, "rewards/rejected": -0.10152564197778702, "step": 2560 }, { "epoch": 0.4427980702963473, "grad_norm": 2.7252604961395264, "learning_rate": 9.931267062310407e-08, "logits/chosen": -2.991008758544922, "logits/rejected": -2.972618579864502, "logps/chosen": -65.75273895263672, "logps/rejected": -66.60904693603516, "loss": 0.6795, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.07151017338037491, "rewards/margins": 0.029543984681367874, "rewards/rejected": -0.10105415433645248, "step": 2570 }, { "epoch": 0.4445210199862164, "grad_norm": 3.11721134185791, "learning_rate": 9.929600751373807e-08, "logits/chosen": -3.0143558979034424, "logits/rejected": -2.9956576824188232, "logps/chosen": -61.84235763549805, "logps/rejected": -63.36035919189453, "loss": 0.6854, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08456549793481827, "rewards/margins": 0.01721056178212166, "rewards/rejected": -0.10177604854106903, "step": 2580 }, { "epoch": 0.4462439696760855, "grad_norm": 3.2454628944396973, "learning_rate": 9.927914626382665e-08, "logits/chosen": -2.992849588394165, "logits/rejected": -2.9559502601623535, "logps/chosen": -64.04624938964844, "logps/rejected": -62.281715393066406, "loss": 0.6761, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07351807504892349, "rewards/margins": 0.036832019686698914, "rewards/rejected": -0.1103500947356224, "step": 2590 }, { "epoch": 0.4479669193659545, "grad_norm": 2.8180019855499268, "learning_rate": 9.926208694114196e-08, "logits/chosen": -2.9650466442108154, "logits/rejected": -2.9174916744232178, "logps/chosen": -65.36735534667969, "logps/rejected": -60.16841506958008, "loss": 0.6775, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0827118456363678, "rewards/margins": 0.0343327671289444, "rewards/rejected": -0.1170446127653122, "step": 2600 }, { "epoch": 0.4479669193659545, "eval_logits/chosen": -3.072807788848877, "eval_logits/rejected": -3.0670812129974365, "eval_logps/chosen": -63.35167694091797, "eval_logps/rejected": -69.17733001708984, "eval_loss": 0.6870245337486267, "eval_rewards/accuracies": 0.5834107995033264, "eval_rewards/chosen": -0.04639780893921852, "eval_rewards/margins": 0.013574252836406231, "eval_rewards/rejected": -0.05997206270694733, "eval_runtime": 383.9968, "eval_samples_per_second": 11.208, "eval_steps_per_second": 1.401, "step": 2600 }, { "epoch": 0.4496898690558236, "grad_norm": 2.7699742317199707, "learning_rate": 9.924482961425232e-08, "logits/chosen": -2.944060802459717, "logits/rejected": -2.9047398567199707, "logps/chosen": -66.3946304321289, "logps/rejected": -63.1397819519043, "loss": 0.6774, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08015574514865875, "rewards/margins": 0.03451739624142647, "rewards/rejected": -0.11467315256595612, "step": 2610 }, { "epoch": 0.4514128187456926, "grad_norm": 2.652411460876465, "learning_rate": 9.922737435252189e-08, "logits/chosen": -3.031888961791992, "logits/rejected": -3.0019288063049316, "logps/chosen": -59.47108840942383, "logps/rejected": -62.49885177612305, "loss": 0.6751, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0820513367652893, "rewards/margins": 0.03846115618944168, "rewards/rejected": -0.12051250040531158, "step": 2620 }, { "epoch": 0.4531357684355617, "grad_norm": 3.081232786178589, "learning_rate": 9.92097212261104e-08, "logits/chosen": -2.916567325592041, "logits/rejected": -2.8844668865203857, "logps/chosen": -62.49360275268555, "logps/rejected": -65.05132293701172, "loss": 0.6734, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07680071145296097, "rewards/margins": 0.042157966643571854, "rewards/rejected": -0.11895868927240372, "step": 2630 }, { "epoch": 0.4548587181254307, "grad_norm": 2.6305322647094727, "learning_rate": 9.919187030597288e-08, "logits/chosen": -2.9371588230133057, "logits/rejected": -2.91597056388855, "logps/chosen": -58.90857696533203, "logps/rejected": -60.515663146972656, "loss": 0.6808, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.08813159167766571, "rewards/margins": 0.027149613946676254, "rewards/rejected": -0.11528120934963226, "step": 2640 }, { "epoch": 0.4565816678152998, "grad_norm": 2.584092140197754, "learning_rate": 9.91738216638594e-08, "logits/chosen": -2.9085030555725098, "logits/rejected": -2.8908283710479736, "logps/chosen": -59.93761444091797, "logps/rejected": -64.91102600097656, "loss": 0.6789, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.07506780326366425, "rewards/margins": 0.031199375167489052, "rewards/rejected": -0.10626717656850815, "step": 2650 }, { "epoch": 0.4583046175051689, "grad_norm": 2.9883782863616943, "learning_rate": 9.915557537231472e-08, "logits/chosen": -2.9364726543426514, "logits/rejected": -2.894967555999756, "logps/chosen": -66.78648376464844, "logps/rejected": -65.0617904663086, "loss": 0.6743, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.08496657758951187, "rewards/margins": 0.040906231850385666, "rewards/rejected": -0.12587280571460724, "step": 2660 }, { "epoch": 0.4600275671950379, "grad_norm": 2.668771266937256, "learning_rate": 9.913713150467805e-08, "logits/chosen": -2.9089488983154297, "logits/rejected": -2.8806443214416504, "logps/chosen": -62.62455368041992, "logps/rejected": -64.50057983398438, "loss": 0.6757, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.08678580075502396, "rewards/margins": 0.037604156881570816, "rewards/rejected": -0.12438994646072388, "step": 2670 }, { "epoch": 0.461750516884907, "grad_norm": 3.0649049282073975, "learning_rate": 9.911849013508274e-08, "logits/chosen": -2.954739570617676, "logits/rejected": -2.9277560710906982, "logps/chosen": -67.18395233154297, "logps/rejected": -64.95777130126953, "loss": 0.6824, "rewards/accuracies": 0.625, "rewards/chosen": -0.08936417102813721, "rewards/margins": 0.023585891351103783, "rewards/rejected": -0.11295004934072495, "step": 2680 }, { "epoch": 0.463473466574776, "grad_norm": 2.8779923915863037, "learning_rate": 9.9099651338456e-08, "logits/chosen": -2.927757978439331, "logits/rejected": -2.9055356979370117, "logps/chosen": -60.169456481933594, "logps/rejected": -62.86531448364258, "loss": 0.6772, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.09528367221355438, "rewards/margins": 0.03450409695506096, "rewards/rejected": -0.12978777289390564, "step": 2690 }, { "epoch": 0.4651964162646451, "grad_norm": 2.7362937927246094, "learning_rate": 9.908061519051851e-08, "logits/chosen": -2.956076145172119, "logits/rejected": -2.9295990467071533, "logps/chosen": -61.81547164916992, "logps/rejected": -65.11744689941406, "loss": 0.6788, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.08584700524806976, "rewards/margins": 0.030563393607735634, "rewards/rejected": -0.11641039699316025, "step": 2700 }, { "epoch": 0.4651964162646451, "eval_logits/chosen": -3.0667531490325928, "eval_logits/rejected": -3.0610108375549316, "eval_logps/chosen": -64.02751922607422, "eval_logps/rejected": -69.99384307861328, "eval_loss": 0.6864203214645386, "eval_rewards/accuracies": 0.5894516706466675, "eval_rewards/chosen": -0.053156204521656036, "eval_rewards/margins": 0.014980971813201904, "eval_rewards/rejected": -0.06813717633485794, "eval_runtime": 384.0531, "eval_samples_per_second": 11.207, "eval_steps_per_second": 1.401, "step": 2700 }, { "epoch": 0.4669193659545141, "grad_norm": 3.2074708938598633, "learning_rate": 9.906138176778426e-08, "logits/chosen": -2.9789416790008545, "logits/rejected": -2.9601712226867676, "logps/chosen": -67.79176330566406, "logps/rejected": -65.58930969238281, "loss": 0.685, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08345109969377518, "rewards/margins": 0.018934572115540504, "rewards/rejected": -0.10238567739725113, "step": 2710 }, { "epoch": 0.4686423156443832, "grad_norm": 2.9427285194396973, "learning_rate": 9.904195114756013e-08, "logits/chosen": -2.9684221744537354, "logits/rejected": -2.9718992710113525, "logps/chosen": -61.5818977355957, "logps/rejected": -65.64582061767578, "loss": 0.683, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09534648805856705, "rewards/margins": 0.0234840027987957, "rewards/rejected": -0.11883047968149185, "step": 2720 }, { "epoch": 0.4703652653342522, "grad_norm": 3.190385341644287, "learning_rate": 9.90223234079456e-08, "logits/chosen": -2.9681687355041504, "logits/rejected": -2.9486961364746094, "logps/chosen": -67.40243530273438, "logps/rejected": -66.68904113769531, "loss": 0.6812, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08835718780755997, "rewards/margins": 0.026280706748366356, "rewards/rejected": -0.11463788896799088, "step": 2730 }, { "epoch": 0.4720882150241213, "grad_norm": 2.9422459602355957, "learning_rate": 9.900249862783253e-08, "logits/chosen": -2.9548585414886475, "logits/rejected": -2.9337644577026367, "logps/chosen": -62.42155075073242, "logps/rejected": -61.25109100341797, "loss": 0.683, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0956229493021965, "rewards/margins": 0.023061102256178856, "rewards/rejected": -0.11868405342102051, "step": 2740 }, { "epoch": 0.4738111647139904, "grad_norm": 2.94500994682312, "learning_rate": 9.898247688690467e-08, "logits/chosen": -2.8857979774475098, "logits/rejected": -2.8807554244995117, "logps/chosen": -59.32061004638672, "logps/rejected": -65.92103576660156, "loss": 0.6823, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10056865215301514, "rewards/margins": 0.024371769279241562, "rewards/rejected": -0.1249404177069664, "step": 2750 }, { "epoch": 0.4755341144038594, "grad_norm": 3.078279495239258, "learning_rate": 9.896225826563748e-08, "logits/chosen": -2.936070203781128, "logits/rejected": -2.9187848567962646, "logps/chosen": -63.932708740234375, "logps/rejected": -67.78277587890625, "loss": 0.6779, "rewards/accuracies": 0.625, "rewards/chosen": -0.09657719731330872, "rewards/margins": 0.03319309279322624, "rewards/rejected": -0.12977029383182526, "step": 2760 }, { "epoch": 0.4772570640937285, "grad_norm": 3.2596120834350586, "learning_rate": 9.894184284529776e-08, "logits/chosen": -3.0649921894073486, "logits/rejected": -3.0350372791290283, "logps/chosen": -64.24156951904297, "logps/rejected": -64.7466812133789, "loss": 0.6811, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.09026803076267242, "rewards/margins": 0.026812776923179626, "rewards/rejected": -0.11708080768585205, "step": 2770 }, { "epoch": 0.4789800137835975, "grad_norm": 2.8784027099609375, "learning_rate": 9.892123070794331e-08, "logits/chosen": -2.884995460510254, "logits/rejected": -2.855255603790283, "logps/chosen": -62.906028747558594, "logps/rejected": -63.008453369140625, "loss": 0.6827, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.10077200829982758, "rewards/margins": 0.023161105811595917, "rewards/rejected": -0.12393312156200409, "step": 2780 }, { "epoch": 0.4807029634734666, "grad_norm": 2.7700676918029785, "learning_rate": 9.890042193642267e-08, "logits/chosen": -3.0011134147644043, "logits/rejected": -2.9677700996398926, "logps/chosen": -61.55634689331055, "logps/rejected": -62.90153884887695, "loss": 0.6793, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09796085208654404, "rewards/margins": 0.029925968497991562, "rewards/rejected": -0.1278868317604065, "step": 2790 }, { "epoch": 0.4824259131633356, "grad_norm": 3.2735071182250977, "learning_rate": 9.887941661437464e-08, "logits/chosen": -2.9830212593078613, "logits/rejected": -2.9580564498901367, "logps/chosen": -69.73869323730469, "logps/rejected": -69.80674743652344, "loss": 0.6781, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.09703021496534348, "rewards/margins": 0.032661326229572296, "rewards/rejected": -0.12969152629375458, "step": 2800 }, { "epoch": 0.4824259131633356, "eval_logits/chosen": -3.059511661529541, "eval_logits/rejected": -3.053757429122925, "eval_logps/chosen": -64.5224609375, "eval_logps/rejected": -70.57685852050781, "eval_loss": 0.6860405206680298, "eval_rewards/accuracies": 0.5875929594039917, "eval_rewards/chosen": -0.05810568854212761, "eval_rewards/margins": 0.01586167700588703, "eval_rewards/rejected": -0.07396736741065979, "eval_runtime": 383.8951, "eval_samples_per_second": 11.211, "eval_steps_per_second": 1.401, "step": 2800 }, { "epoch": 0.4841488628532047, "grad_norm": 3.0435869693756104, "learning_rate": 9.885821482622812e-08, "logits/chosen": -2.8739161491394043, "logits/rejected": -2.8495869636535645, "logps/chosen": -64.78532409667969, "logps/rejected": -68.93119049072266, "loss": 0.676, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09449522942304611, "rewards/margins": 0.03718101978302002, "rewards/rejected": -0.13167624175548553, "step": 2810 }, { "epoch": 0.48587181254307377, "grad_norm": 3.153567314147949, "learning_rate": 9.883681665720162e-08, "logits/chosen": -3.0217156410217285, "logits/rejected": -3.0097148418426514, "logps/chosen": -63.83794021606445, "logps/rejected": -65.2159652709961, "loss": 0.681, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09510736167430878, "rewards/margins": 0.026854107156395912, "rewards/rejected": -0.12196147441864014, "step": 2820 }, { "epoch": 0.4875947622329428, "grad_norm": 2.8859848976135254, "learning_rate": 9.881522219330303e-08, "logits/chosen": -2.8256988525390625, "logits/rejected": -2.8104841709136963, "logps/chosen": -64.98249816894531, "logps/rejected": -67.81361389160156, "loss": 0.6778, "rewards/accuracies": 0.625, "rewards/chosen": -0.09481976181268692, "rewards/margins": 0.03367151692509651, "rewards/rejected": -0.12849128246307373, "step": 2830 }, { "epoch": 0.48931771192281187, "grad_norm": 2.861546516418457, "learning_rate": 9.879343152132922e-08, "logits/chosen": -2.9965028762817383, "logits/rejected": -2.9852347373962402, "logps/chosen": -62.454078674316406, "logps/rejected": -64.68760681152344, "loss": 0.677, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09817804396152496, "rewards/margins": 0.03489768132567406, "rewards/rejected": -0.13307572901248932, "step": 2840 }, { "epoch": 0.4910406616126809, "grad_norm": 3.1723101139068604, "learning_rate": 9.87714447288657e-08, "logits/chosen": -2.956745147705078, "logits/rejected": -2.9374237060546875, "logps/chosen": -62.217132568359375, "logps/rejected": -69.06208038330078, "loss": 0.6735, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0997069925069809, "rewards/margins": 0.04242680221796036, "rewards/rejected": -0.14213378727436066, "step": 2850 }, { "epoch": 0.49276361130255, "grad_norm": 2.6628482341766357, "learning_rate": 9.874926190428623e-08, "logits/chosen": -2.9363627433776855, "logits/rejected": -2.9122955799102783, "logps/chosen": -64.47489166259766, "logps/rejected": -65.41607666015625, "loss": 0.6749, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.09510374069213867, "rewards/margins": 0.03928123041987419, "rewards/rejected": -0.13438497483730316, "step": 2860 }, { "epoch": 0.494486560992419, "grad_norm": 3.0727081298828125, "learning_rate": 9.872688313675258e-08, "logits/chosen": -2.976762056350708, "logits/rejected": -2.9534010887145996, "logps/chosen": -65.26378631591797, "logps/rejected": -65.30891418457031, "loss": 0.6792, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0914800837635994, "rewards/margins": 0.031239259988069534, "rewards/rejected": -0.12271933257579803, "step": 2870 }, { "epoch": 0.4962095106822881, "grad_norm": 3.6503279209136963, "learning_rate": 9.870430851621399e-08, "logits/chosen": -3.0517380237579346, "logits/rejected": -3.025599718093872, "logps/chosen": -65.53878784179688, "logps/rejected": -64.24637603759766, "loss": 0.679, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1019076555967331, "rewards/margins": 0.03129301965236664, "rewards/rejected": -0.13320067524909973, "step": 2880 }, { "epoch": 0.49793246037215716, "grad_norm": 2.7357583045959473, "learning_rate": 9.8681538133407e-08, "logits/chosen": -3.0604350566864014, "logits/rejected": -3.0549769401550293, "logps/chosen": -62.771644592285156, "logps/rejected": -66.26950073242188, "loss": 0.6747, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09813951700925827, "rewards/margins": 0.03936195746064186, "rewards/rejected": -0.13750147819519043, "step": 2890 }, { "epoch": 0.4996554100620262, "grad_norm": 3.0844948291778564, "learning_rate": 9.865857207985499e-08, "logits/chosen": -2.977860927581787, "logits/rejected": -2.9682095050811768, "logps/chosen": -61.629608154296875, "logps/rejected": -63.678428649902344, "loss": 0.6796, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.09141556918621063, "rewards/margins": 0.03002287819981575, "rewards/rejected": -0.12143845856189728, "step": 2900 }, { "epoch": 0.4996554100620262, "eval_logits/chosen": -3.051745891571045, "eval_logits/rejected": -3.0459930896759033, "eval_logps/chosen": -64.81282806396484, "eval_logps/rejected": -70.94561767578125, "eval_loss": 0.6857013702392578, "eval_rewards/accuracies": 0.589219331741333, "eval_rewards/chosen": -0.06100931763648987, "eval_rewards/margins": 0.016645701602101326, "eval_rewards/rejected": -0.07765501737594604, "eval_runtime": 384.1651, "eval_samples_per_second": 11.204, "eval_steps_per_second": 1.4, "step": 2900 }, { "epoch": 0.5013783597518953, "grad_norm": 3.6293253898620605, "learning_rate": 9.863541044786776e-08, "logits/chosen": -3.017298936843872, "logits/rejected": -3.005140781402588, "logps/chosen": -65.72318267822266, "logps/rejected": -69.51607513427734, "loss": 0.6744, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.09616000950336456, "rewards/margins": 0.039995647966861725, "rewards/rejected": -0.1361556500196457, "step": 2910 }, { "epoch": 0.5031013094417643, "grad_norm": 2.943253517150879, "learning_rate": 9.861205333054126e-08, "logits/chosen": -2.961942195892334, "logits/rejected": -2.958191156387329, "logps/chosen": -64.05611419677734, "logps/rejected": -68.82247161865234, "loss": 0.6804, "rewards/accuracies": 0.59375, "rewards/chosen": -0.10485990345478058, "rewards/margins": 0.028304243460297585, "rewards/rejected": -0.13316413760185242, "step": 2920 }, { "epoch": 0.5048242591316333, "grad_norm": 2.9052278995513916, "learning_rate": 9.858850082175718e-08, "logits/chosen": -2.9340879917144775, "logits/rejected": -2.9065613746643066, "logps/chosen": -63.232330322265625, "logps/rejected": -66.09905242919922, "loss": 0.6715, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.09781093895435333, "rewards/margins": 0.046499740332365036, "rewards/rejected": -0.14431066811084747, "step": 2930 }, { "epoch": 0.5065472088215024, "grad_norm": 2.8808388710021973, "learning_rate": 9.856475301618254e-08, "logits/chosen": -2.9635181427001953, "logits/rejected": -2.9268853664398193, "logps/chosen": -61.1923942565918, "logps/rejected": -63.786415100097656, "loss": 0.6783, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.097320556640625, "rewards/margins": 0.032691989094018936, "rewards/rejected": -0.13001254200935364, "step": 2940 }, { "epoch": 0.5082701585113715, "grad_norm": 2.7430708408355713, "learning_rate": 9.854081000926937e-08, "logits/chosen": -2.9962668418884277, "logits/rejected": -2.977900981903076, "logps/chosen": -63.719512939453125, "logps/rejected": -68.13462829589844, "loss": 0.6739, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.093983955681324, "rewards/margins": 0.04202844947576523, "rewards/rejected": -0.13601240515708923, "step": 2950 }, { "epoch": 0.5099931082012406, "grad_norm": 2.9821202754974365, "learning_rate": 9.851667189725428e-08, "logits/chosen": -2.9551444053649902, "logits/rejected": -2.92824125289917, "logps/chosen": -63.79551315307617, "logps/rejected": -66.1274185180664, "loss": 0.6729, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09847555309534073, "rewards/margins": 0.04384133964776993, "rewards/rejected": -0.14231689274311066, "step": 2960 }, { "epoch": 0.5117160578911096, "grad_norm": 3.6537888050079346, "learning_rate": 9.849233877715805e-08, "logits/chosen": -2.887934446334839, "logits/rejected": -2.8602499961853027, "logps/chosen": -65.93989562988281, "logps/rejected": -66.46612548828125, "loss": 0.6778, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.10702387243509293, "rewards/margins": 0.033498454838991165, "rewards/rejected": -0.1405223309993744, "step": 2970 }, { "epoch": 0.5134390075809786, "grad_norm": 3.1210203170776367, "learning_rate": 9.846781074678536e-08, "logits/chosen": -2.885059356689453, "logits/rejected": -2.8590919971466064, "logps/chosen": -63.145973205566406, "logps/rejected": -65.72850036621094, "loss": 0.6735, "rewards/accuracies": 0.6875, "rewards/chosen": -0.092067651450634, "rewards/margins": 0.042408816516399384, "rewards/rejected": -0.1344764679670334, "step": 2980 }, { "epoch": 0.5151619572708477, "grad_norm": 2.9466779232025146, "learning_rate": 9.844308790472422e-08, "logits/chosen": -2.9299604892730713, "logits/rejected": -2.9023847579956055, "logps/chosen": -67.03337097167969, "logps/rejected": -68.5904769897461, "loss": 0.677, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10423538833856583, "rewards/margins": 0.03552241995930672, "rewards/rejected": -0.13975778222084045, "step": 2990 }, { "epoch": 0.5168849069607168, "grad_norm": 2.654740810394287, "learning_rate": 9.841817035034571e-08, "logits/chosen": -2.9410808086395264, "logits/rejected": -2.935465097427368, "logps/chosen": -62.14415740966797, "logps/rejected": -66.91738891601562, "loss": 0.6805, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.10131809860467911, "rewards/margins": 0.0289118941873312, "rewards/rejected": -0.13022999465465546, "step": 3000 }, { "epoch": 0.5168849069607168, "eval_logits/chosen": -3.0425312519073486, "eval_logits/rejected": -3.0368008613586426, "eval_logps/chosen": -65.2876968383789, "eval_logps/rejected": -71.51773071289062, "eval_loss": 0.6852889657020569, "eval_rewards/accuracies": 0.5994423627853394, "eval_rewards/chosen": -0.06575805693864822, "eval_rewards/margins": 0.017618007957935333, "eval_rewards/rejected": -0.08337606489658356, "eval_runtime": 384.254, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 3000 }, { "epoch": 0.5186078566505858, "grad_norm": 3.3054888248443604, "learning_rate": 9.839305818380355e-08, "logits/chosen": -2.9660398960113525, "logits/rejected": -2.943049669265747, "logps/chosen": -66.16236114501953, "logps/rejected": -66.30188751220703, "loss": 0.6833, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.10974695533514023, "rewards/margins": 0.022326484322547913, "rewards/rejected": -0.13207343220710754, "step": 3010 }, { "epoch": 0.5203308063404548, "grad_norm": 3.1893117427825928, "learning_rate": 9.836775150603366e-08, "logits/chosen": -3.0493593215942383, "logits/rejected": -3.0183663368225098, "logps/chosen": -66.458251953125, "logps/rejected": -65.6940689086914, "loss": 0.6795, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12219719588756561, "rewards/margins": 0.031587425619363785, "rewards/rejected": -0.1537846326828003, "step": 3020 }, { "epoch": 0.5220537560303239, "grad_norm": 3.151909351348877, "learning_rate": 9.834225041875381e-08, "logits/chosen": -2.953901767730713, "logits/rejected": -2.932375192642212, "logps/chosen": -67.33077239990234, "logps/rejected": -68.61319732666016, "loss": 0.6842, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.11655862629413605, "rewards/margins": 0.02067907340824604, "rewards/rejected": -0.13723769783973694, "step": 3030 }, { "epoch": 0.523776705720193, "grad_norm": 3.0096349716186523, "learning_rate": 9.831655502446314e-08, "logits/chosen": -3.0559146404266357, "logits/rejected": -3.049262523651123, "logps/chosen": -63.00276565551758, "logps/rejected": -68.90313720703125, "loss": 0.6807, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.1181497722864151, "rewards/margins": 0.027725104242563248, "rewards/rejected": -0.14587488770484924, "step": 3040 }, { "epoch": 0.525499655410062, "grad_norm": 2.7092087268829346, "learning_rate": 9.829066542644183e-08, "logits/chosen": -2.9160568714141846, "logits/rejected": -2.902904987335205, "logps/chosen": -64.6626968383789, "logps/rejected": -68.76962280273438, "loss": 0.683, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.11104599386453629, "rewards/margins": 0.023075086995959282, "rewards/rejected": -0.13412107527256012, "step": 3050 }, { "epoch": 0.5272226050999311, "grad_norm": 3.636561632156372, "learning_rate": 9.826458172875056e-08, "logits/chosen": -2.9516592025756836, "logits/rejected": -2.933215856552124, "logps/chosen": -66.14519500732422, "logps/rejected": -67.65269470214844, "loss": 0.6807, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10469415038824081, "rewards/margins": 0.028158079832792282, "rewards/rejected": -0.1328522264957428, "step": 3060 }, { "epoch": 0.5289455547898001, "grad_norm": 2.9957070350646973, "learning_rate": 9.823830403623031e-08, "logits/chosen": -2.9116501808166504, "logits/rejected": -2.8861348628997803, "logps/chosen": -69.54967498779297, "logps/rejected": -68.41423797607422, "loss": 0.6802, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11607084423303604, "rewards/margins": 0.028689906001091003, "rewards/rejected": -0.14476075768470764, "step": 3070 }, { "epoch": 0.5306685044796692, "grad_norm": 2.8178534507751465, "learning_rate": 9.821183245450169e-08, "logits/chosen": -2.82523512840271, "logits/rejected": -2.808353900909424, "logps/chosen": -63.33665084838867, "logps/rejected": -69.95553588867188, "loss": 0.6819, "rewards/accuracies": 0.53125, "rewards/chosen": -0.12478204071521759, "rewards/margins": 0.026761427521705627, "rewards/rejected": -0.15154346823692322, "step": 3080 }, { "epoch": 0.5323914541695383, "grad_norm": 2.963575601577759, "learning_rate": 9.818516708996468e-08, "logits/chosen": -2.9095349311828613, "logits/rejected": -2.8905649185180664, "logps/chosen": -64.69622039794922, "logps/rejected": -68.59099578857422, "loss": 0.6757, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.11084137856960297, "rewards/margins": 0.038026027381420135, "rewards/rejected": -0.1488673985004425, "step": 3090 }, { "epoch": 0.5341144038594073, "grad_norm": 3.720427989959717, "learning_rate": 9.815830804979814e-08, "logits/chosen": -2.942284107208252, "logits/rejected": -2.912932872772217, "logps/chosen": -64.48738861083984, "logps/rejected": -66.56813049316406, "loss": 0.673, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.10566981881856918, "rewards/margins": 0.044303666800260544, "rewards/rejected": -0.14997348189353943, "step": 3100 }, { "epoch": 0.5341144038594073, "eval_logits/chosen": -3.0381288528442383, "eval_logits/rejected": -3.0324244499206543, "eval_logps/chosen": -65.33868408203125, "eval_logps/rejected": -71.64682006835938, "eval_loss": 0.6849254369735718, "eval_rewards/accuracies": 0.5987453460693359, "eval_rewards/chosen": -0.06626785546541214, "eval_rewards/margins": 0.018399151042103767, "eval_rewards/rejected": -0.08466700464487076, "eval_runtime": 383.7545, "eval_samples_per_second": 11.216, "eval_steps_per_second": 1.402, "step": 3100 }, { "epoch": 0.5358373535492763, "grad_norm": 3.346313238143921, "learning_rate": 9.813125544195938e-08, "logits/chosen": -2.884772777557373, "logits/rejected": -2.8942549228668213, "logps/chosen": -62.70061492919922, "logps/rejected": -69.97631072998047, "loss": 0.6802, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.12088136374950409, "rewards/margins": 0.029403049498796463, "rewards/rejected": -0.15028440952301025, "step": 3110 }, { "epoch": 0.5375603032391454, "grad_norm": 3.142883777618408, "learning_rate": 9.810400937518376e-08, "logits/chosen": -2.9309773445129395, "logits/rejected": -2.9074718952178955, "logps/chosen": -66.43892669677734, "logps/rejected": -70.55831146240234, "loss": 0.6734, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1042814701795578, "rewards/margins": 0.043249666690826416, "rewards/rejected": -0.14753112196922302, "step": 3120 }, { "epoch": 0.5392832529290145, "grad_norm": 2.9250478744506836, "learning_rate": 9.807656995898422e-08, "logits/chosen": -2.8482320308685303, "logits/rejected": -2.8396620750427246, "logps/chosen": -62.866912841796875, "logps/rejected": -66.6762924194336, "loss": 0.6819, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10785956680774689, "rewards/margins": 0.02550504542887211, "rewards/rejected": -0.13336461782455444, "step": 3130 }, { "epoch": 0.5410062026188835, "grad_norm": 3.300189256668091, "learning_rate": 9.80489373036508e-08, "logits/chosen": -2.962305784225464, "logits/rejected": -2.9471447467803955, "logps/chosen": -63.59661102294922, "logps/rejected": -69.7351303100586, "loss": 0.6743, "rewards/accuracies": 0.625, "rewards/chosen": -0.1189948171377182, "rewards/margins": 0.04094112291932106, "rewards/rejected": -0.15993592143058777, "step": 3140 }, { "epoch": 0.5427291523087526, "grad_norm": 3.1307990550994873, "learning_rate": 9.802111152025037e-08, "logits/chosen": -2.962672710418701, "logits/rejected": -2.934166193008423, "logps/chosen": -64.71379089355469, "logps/rejected": -68.6727294921875, "loss": 0.6715, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10344825685024261, "rewards/margins": 0.047257013618946075, "rewards/rejected": -0.1507052630186081, "step": 3150 }, { "epoch": 0.5444521019986216, "grad_norm": 3.283798933029175, "learning_rate": 9.799309272062592e-08, "logits/chosen": -2.900205612182617, "logits/rejected": -2.8715476989746094, "logps/chosen": -64.25485229492188, "logps/rejected": -67.60700225830078, "loss": 0.671, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10108542442321777, "rewards/margins": 0.048650797456502914, "rewards/rejected": -0.1497362107038498, "step": 3160 }, { "epoch": 0.5461750516884907, "grad_norm": 3.3534271717071533, "learning_rate": 9.796488101739633e-08, "logits/chosen": -2.928654432296753, "logits/rejected": -2.8962783813476562, "logps/chosen": -67.16883850097656, "logps/rejected": -66.64054107666016, "loss": 0.666, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.11279411613941193, "rewards/margins": 0.05875105783343315, "rewards/rejected": -0.17154517769813538, "step": 3170 }, { "epoch": 0.5478980013783598, "grad_norm": 2.9155259132385254, "learning_rate": 9.793647652395582e-08, "logits/chosen": -2.9806151390075684, "logits/rejected": -2.9435207843780518, "logps/chosen": -64.6708984375, "logps/rejected": -67.30673217773438, "loss": 0.6723, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.11414506286382675, "rewards/margins": 0.04464191943407059, "rewards/rejected": -0.15878698229789734, "step": 3180 }, { "epoch": 0.5496209510682288, "grad_norm": 3.3338780403137207, "learning_rate": 9.79078793544735e-08, "logits/chosen": -2.976921558380127, "logits/rejected": -2.971113681793213, "logps/chosen": -64.18693542480469, "logps/rejected": -74.24832916259766, "loss": 0.6675, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.11889241635799408, "rewards/margins": 0.05693773180246353, "rewards/rejected": -0.1758301556110382, "step": 3190 }, { "epoch": 0.5513439007580979, "grad_norm": 3.4823660850524902, "learning_rate": 9.787908962389295e-08, "logits/chosen": -2.899880886077881, "logits/rejected": -2.8854739665985107, "logps/chosen": -65.92109680175781, "logps/rejected": -66.75161743164062, "loss": 0.6747, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10592423379421234, "rewards/margins": 0.04026971384882927, "rewards/rejected": -0.1461939513683319, "step": 3200 }, { "epoch": 0.5513439007580979, "eval_logits/chosen": -3.0267412662506104, "eval_logits/rejected": -3.0209221839904785, "eval_logps/chosen": -66.50939178466797, "eval_logps/rejected": -72.99625396728516, "eval_loss": 0.6841797232627869, "eval_rewards/accuracies": 0.6026951670646667, "eval_rewards/chosen": -0.07797504216432571, "eval_rewards/margins": 0.02018626220524311, "eval_rewards/rejected": -0.09816130250692368, "eval_runtime": 383.4589, "eval_samples_per_second": 11.224, "eval_steps_per_second": 1.403, "step": 3200 }, { "epoch": 0.5530668504479669, "grad_norm": 3.5785820484161377, "learning_rate": 9.785010744793172e-08, "logits/chosen": -2.817628860473633, "logits/rejected": -2.7936043739318848, "logps/chosen": -65.95722961425781, "logps/rejected": -68.59443664550781, "loss": 0.6745, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1200583204627037, "rewards/margins": 0.040913067758083344, "rewards/rejected": -0.16097138822078705, "step": 3210 }, { "epoch": 0.554789800137836, "grad_norm": 3.0678091049194336, "learning_rate": 9.782093294308085e-08, "logits/chosen": -2.8513503074645996, "logits/rejected": -2.8463664054870605, "logps/chosen": -63.11109161376953, "logps/rejected": -68.88777160644531, "loss": 0.6787, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.12767264246940613, "rewards/margins": 0.0323764905333519, "rewards/rejected": -0.16004912555217743, "step": 3220 }, { "epoch": 0.556512749827705, "grad_norm": 3.1148808002471924, "learning_rate": 9.779156622660444e-08, "logits/chosen": -2.9057564735412598, "logits/rejected": -2.889871120452881, "logps/chosen": -66.41254425048828, "logps/rejected": -71.27462768554688, "loss": 0.6817, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12255162000656128, "rewards/margins": 0.026407569646835327, "rewards/rejected": -0.1489591747522354, "step": 3230 }, { "epoch": 0.5582356995175741, "grad_norm": 3.051084041595459, "learning_rate": 9.77620074165392e-08, "logits/chosen": -3.024712324142456, "logits/rejected": -3.0021655559539795, "logps/chosen": -70.19468688964844, "logps/rejected": -69.71147155761719, "loss": 0.679, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.13958023488521576, "rewards/margins": 0.03209796920418739, "rewards/rejected": -0.17167820036411285, "step": 3240 }, { "epoch": 0.5599586492074431, "grad_norm": 3.0525102615356445, "learning_rate": 9.77322566316939e-08, "logits/chosen": -2.9169459342956543, "logits/rejected": -2.8961451053619385, "logps/chosen": -64.2944107055664, "logps/rejected": -70.3061752319336, "loss": 0.6686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11721847206354141, "rewards/margins": 0.05275779962539673, "rewards/rejected": -0.16997626423835754, "step": 3250 }, { "epoch": 0.5616815988973122, "grad_norm": 3.325883388519287, "learning_rate": 9.770231399164894e-08, "logits/chosen": -2.9300780296325684, "logits/rejected": -2.9128031730651855, "logps/chosen": -64.05619049072266, "logps/rejected": -69.93900299072266, "loss": 0.6724, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1252075880765915, "rewards/margins": 0.046792708337306976, "rewards/rejected": -0.17200028896331787, "step": 3260 }, { "epoch": 0.5634045485871813, "grad_norm": 3.3600032329559326, "learning_rate": 9.76721796167559e-08, "logits/chosen": -2.986571788787842, "logits/rejected": -2.982628583908081, "logps/chosen": -68.49488067626953, "logps/rejected": -76.15458679199219, "loss": 0.6706, "rewards/accuracies": 0.625, "rewards/chosen": -0.13365280628204346, "rewards/margins": 0.04901333898305893, "rewards/rejected": -0.1826661378145218, "step": 3270 }, { "epoch": 0.5651274982770503, "grad_norm": 3.352196455001831, "learning_rate": 9.764185362813697e-08, "logits/chosen": -2.989205837249756, "logits/rejected": -2.978252649307251, "logps/chosen": -62.46497344970703, "logps/rejected": -68.76921844482422, "loss": 0.6785, "rewards/accuracies": 0.59375, "rewards/chosen": -0.13044258952140808, "rewards/margins": 0.03327161446213722, "rewards/rejected": -0.1637142151594162, "step": 3280 }, { "epoch": 0.5668504479669194, "grad_norm": 3.0868594646453857, "learning_rate": 9.761133614768454e-08, "logits/chosen": -3.0345520973205566, "logits/rejected": -3.0039210319519043, "logps/chosen": -65.42103576660156, "logps/rejected": -71.00712585449219, "loss": 0.6681, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.12254806607961655, "rewards/margins": 0.05429477244615555, "rewards/rejected": -0.1768428534269333, "step": 3290 }, { "epoch": 0.5685733976567884, "grad_norm": 3.6567270755767822, "learning_rate": 9.758062729806067e-08, "logits/chosen": -2.9109435081481934, "logits/rejected": -2.8896119594573975, "logps/chosen": -68.30950927734375, "logps/rejected": -71.20828247070312, "loss": 0.6743, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1339302361011505, "rewards/margins": 0.04238232970237732, "rewards/rejected": -0.17631253600120544, "step": 3300 }, { "epoch": 0.5685733976567884, "eval_logits/chosen": -3.013587474822998, "eval_logits/rejected": -3.007781982421875, "eval_logps/chosen": -67.0761947631836, "eval_logps/rejected": -73.70807647705078, "eval_loss": 0.6835702657699585, "eval_rewards/accuracies": 0.6022304892539978, "eval_rewards/chosen": -0.08364301174879074, "eval_rewards/margins": 0.02163654938340187, "eval_rewards/rejected": -0.10527956485748291, "eval_runtime": 384.5045, "eval_samples_per_second": 11.194, "eval_steps_per_second": 1.399, "step": 3300 }, { "epoch": 0.5702963473466575, "grad_norm": 3.2779042720794678, "learning_rate": 9.754972720269664e-08, "logits/chosen": -2.849915027618408, "logits/rejected": -2.822054624557495, "logps/chosen": -65.15449523925781, "logps/rejected": -67.6290512084961, "loss": 0.671, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.12055698782205582, "rewards/margins": 0.04874001070857048, "rewards/rejected": -0.169296994805336, "step": 3310 }, { "epoch": 0.5720192970365265, "grad_norm": 3.8480207920074463, "learning_rate": 9.751863598579238e-08, "logits/chosen": -2.906965494155884, "logits/rejected": -2.871720552444458, "logps/chosen": -67.09576416015625, "logps/rejected": -67.40914916992188, "loss": 0.6698, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1338813602924347, "rewards/margins": 0.05075927823781967, "rewards/rejected": -0.18464066088199615, "step": 3320 }, { "epoch": 0.5737422467263956, "grad_norm": 3.519470453262329, "learning_rate": 9.748735377231605e-08, "logits/chosen": -3.0069165229797363, "logits/rejected": -2.9763424396514893, "logps/chosen": -65.80399322509766, "logps/rejected": -70.90829467773438, "loss": 0.6669, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.11590591818094254, "rewards/margins": 0.05641712620854378, "rewards/rejected": -0.1723230481147766, "step": 3330 }, { "epoch": 0.5754651964162646, "grad_norm": 3.386451005935669, "learning_rate": 9.745588068800347e-08, "logits/chosen": -2.94425892829895, "logits/rejected": -2.9264957904815674, "logps/chosen": -69.43077850341797, "logps/rejected": -72.27314758300781, "loss": 0.6717, "rewards/accuracies": 0.625, "rewards/chosen": -0.1351763904094696, "rewards/margins": 0.047761060297489166, "rewards/rejected": -0.18293747305870056, "step": 3340 }, { "epoch": 0.5771881461061337, "grad_norm": 3.2059457302093506, "learning_rate": 9.742421685935769e-08, "logits/chosen": -2.8373639583587646, "logits/rejected": -2.822887897491455, "logps/chosen": -69.8943099975586, "logps/rejected": -74.19965362548828, "loss": 0.6811, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.141156405210495, "rewards/margins": 0.027597282081842422, "rewards/rejected": -0.1687537133693695, "step": 3350 }, { "epoch": 0.5789110957960028, "grad_norm": 3.369702100753784, "learning_rate": 9.739236241364839e-08, "logits/chosen": -2.9559147357940674, "logits/rejected": -2.9220709800720215, "logps/chosen": -68.06131744384766, "logps/rejected": -69.8244857788086, "loss": 0.6735, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1324608325958252, "rewards/margins": 0.043469466269016266, "rewards/rejected": -0.17593030631542206, "step": 3360 }, { "epoch": 0.5806340454858718, "grad_norm": 3.384464740753174, "learning_rate": 9.736031747891145e-08, "logits/chosen": -2.8926303386688232, "logits/rejected": -2.882845401763916, "logps/chosen": -64.24259185791016, "logps/rejected": -71.77339935302734, "loss": 0.6731, "rewards/accuracies": 0.625, "rewards/chosen": -0.13287286460399628, "rewards/margins": 0.04472458362579346, "rewards/rejected": -0.17759746313095093, "step": 3370 }, { "epoch": 0.5823569951757409, "grad_norm": 3.587631940841675, "learning_rate": 9.732808218394841e-08, "logits/chosen": -2.9863932132720947, "logits/rejected": -2.955165386199951, "logps/chosen": -67.76382446289062, "logps/rejected": -67.68370056152344, "loss": 0.6694, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12302947044372559, "rewards/margins": 0.05237020179629326, "rewards/rejected": -0.17539969086647034, "step": 3380 }, { "epoch": 0.5840799448656099, "grad_norm": 3.019854784011841, "learning_rate": 9.729565665832591e-08, "logits/chosen": -2.906973361968994, "logits/rejected": -2.881031036376953, "logps/chosen": -67.75544738769531, "logps/rejected": -69.20933532714844, "loss": 0.6765, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1402776688337326, "rewards/margins": 0.0374799519777298, "rewards/rejected": -0.1777576506137848, "step": 3390 }, { "epoch": 0.585802894555479, "grad_norm": 3.560659170150757, "learning_rate": 9.726304103237522e-08, "logits/chosen": -2.955738067626953, "logits/rejected": -2.916970729827881, "logps/chosen": -65.63565826416016, "logps/rejected": -70.20353698730469, "loss": 0.6653, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.12778934836387634, "rewards/margins": 0.06039997935295105, "rewards/rejected": -0.1881893128156662, "step": 3400 }, { "epoch": 0.585802894555479, "eval_logits/chosen": -3.0048635005950928, "eval_logits/rejected": -2.999053716659546, "eval_logps/chosen": -67.17575073242188, "eval_logps/rejected": -73.867431640625, "eval_loss": 0.6833036541938782, "eval_rewards/accuracies": 0.6010687947273254, "eval_rewards/chosen": -0.08463861048221588, "eval_rewards/margins": 0.022234512493014336, "eval_rewards/rejected": -0.10687312483787537, "eval_runtime": 383.9636, "eval_samples_per_second": 11.209, "eval_steps_per_second": 1.401, "step": 3400 }, { "epoch": 0.587525844245348, "grad_norm": 3.1826560497283936, "learning_rate": 9.723023543719171e-08, "logits/chosen": -2.846552848815918, "logits/rejected": -2.8179874420166016, "logps/chosen": -62.55934524536133, "logps/rejected": -64.88141632080078, "loss": 0.6711, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1298610270023346, "rewards/margins": 0.04827074706554413, "rewards/rejected": -0.17813177406787872, "step": 3410 }, { "epoch": 0.5892487939352171, "grad_norm": 4.114423751831055, "learning_rate": 9.719724000463429e-08, "logits/chosen": -2.8679070472717285, "logits/rejected": -2.8436543941497803, "logps/chosen": -64.84510803222656, "logps/rejected": -70.152587890625, "loss": 0.6714, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.12077782303094864, "rewards/margins": 0.04775964096188545, "rewards/rejected": -0.1685374677181244, "step": 3420 }, { "epoch": 0.5909717436250862, "grad_norm": 3.188581943511963, "learning_rate": 9.716405486732494e-08, "logits/chosen": -2.9511055946350098, "logits/rejected": -2.9307730197906494, "logps/chosen": -64.09410095214844, "logps/rejected": -71.48682403564453, "loss": 0.6745, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12753203511238098, "rewards/margins": 0.04101991653442383, "rewards/rejected": -0.1685519516468048, "step": 3430 }, { "epoch": 0.5926946933149552, "grad_norm": 3.394134044647217, "learning_rate": 9.71306801586481e-08, "logits/chosen": -2.8721954822540283, "logits/rejected": -2.8564352989196777, "logps/chosen": -67.01398468017578, "logps/rejected": -70.92399597167969, "loss": 0.6767, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1444111317396164, "rewards/margins": 0.03681856393814087, "rewards/rejected": -0.18122968077659607, "step": 3440 }, { "epoch": 0.5944176430048242, "grad_norm": 3.4582929611206055, "learning_rate": 9.709711601275018e-08, "logits/chosen": -3.113546133041382, "logits/rejected": -3.0734775066375732, "logps/chosen": -72.65756225585938, "logps/rejected": -72.58638000488281, "loss": 0.6733, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1327807903289795, "rewards/margins": 0.04361308366060257, "rewards/rejected": -0.17639386653900146, "step": 3450 }, { "epoch": 0.5961405926946933, "grad_norm": 3.5773825645446777, "learning_rate": 9.706336256453906e-08, "logits/chosen": -2.8964171409606934, "logits/rejected": -2.883762836456299, "logps/chosen": -64.20597839355469, "logps/rejected": -68.95957946777344, "loss": 0.6786, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12026575952768326, "rewards/margins": 0.03332877531647682, "rewards/rejected": -0.15359455347061157, "step": 3460 }, { "epoch": 0.5978635423845624, "grad_norm": 3.172778606414795, "learning_rate": 9.702941994968345e-08, "logits/chosen": -2.9739644527435303, "logits/rejected": -2.9584555625915527, "logps/chosen": -68.490234375, "logps/rejected": -71.9591293334961, "loss": 0.6717, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1268002688884735, "rewards/margins": 0.0494329109787941, "rewards/rejected": -0.17623315751552582, "step": 3470 }, { "epoch": 0.5995864920744314, "grad_norm": 3.458611011505127, "learning_rate": 9.699528830461241e-08, "logits/chosen": -2.902557849884033, "logits/rejected": -2.871539354324341, "logps/chosen": -67.68953704833984, "logps/rejected": -70.87908172607422, "loss": 0.6653, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.12574946880340576, "rewards/margins": 0.06421028822660446, "rewards/rejected": -0.18995974957942963, "step": 3480 }, { "epoch": 0.6013094417643005, "grad_norm": 3.736043930053711, "learning_rate": 9.69609677665148e-08, "logits/chosen": -2.909066677093506, "logits/rejected": -2.8806378841400146, "logps/chosen": -67.27809143066406, "logps/rejected": -73.0969009399414, "loss": 0.6676, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13008712232112885, "rewards/margins": 0.05565319582819939, "rewards/rejected": -0.18574030697345734, "step": 3490 }, { "epoch": 0.6030323914541695, "grad_norm": 3.375514268875122, "learning_rate": 9.692645847333871e-08, "logits/chosen": -2.856605291366577, "logits/rejected": -2.848818302154541, "logps/chosen": -66.0629653930664, "logps/rejected": -72.42523956298828, "loss": 0.6764, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.14060282707214355, "rewards/margins": 0.03703240305185318, "rewards/rejected": -0.17763522267341614, "step": 3500 }, { "epoch": 0.6030323914541695, "eval_logits/chosen": -2.9970762729644775, "eval_logits/rejected": -2.9912493228912354, "eval_logps/chosen": -67.7069091796875, "eval_logps/rejected": -74.53691864013672, "eval_loss": 0.6827172040939331, "eval_rewards/accuracies": 0.5999070405960083, "eval_rewards/chosen": -0.08995012193918228, "eval_rewards/margins": 0.023617828264832497, "eval_rewards/rejected": -0.11356794834136963, "eval_runtime": 383.9935, "eval_samples_per_second": 11.209, "eval_steps_per_second": 1.401, "step": 3500 }, { "epoch": 0.6047553411440386, "grad_norm": 3.2144839763641357, "learning_rate": 9.689176056379091e-08, "logits/chosen": -2.7975692749023438, "logits/rejected": -2.7731168270111084, "logps/chosen": -68.49657440185547, "logps/rejected": -70.34160614013672, "loss": 0.6739, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1492932140827179, "rewards/margins": 0.04267875477671623, "rewards/rejected": -0.19197198748588562, "step": 3510 }, { "epoch": 0.6064782908339077, "grad_norm": 3.4010651111602783, "learning_rate": 9.68568741773363e-08, "logits/chosen": -2.831744909286499, "logits/rejected": -2.8120248317718506, "logps/chosen": -68.4847412109375, "logps/rejected": -70.20333862304688, "loss": 0.6704, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13150446116924286, "rewards/margins": 0.0498737208545208, "rewards/rejected": -0.18137818574905396, "step": 3520 }, { "epoch": 0.6082012405237767, "grad_norm": 3.531956195831299, "learning_rate": 9.682179945419735e-08, "logits/chosen": -2.993518352508545, "logits/rejected": -2.951322078704834, "logps/chosen": -68.79706573486328, "logps/rejected": -69.9690170288086, "loss": 0.6647, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12839588522911072, "rewards/margins": 0.062154434621334076, "rewards/rejected": -0.1905503123998642, "step": 3530 }, { "epoch": 0.6099241902136457, "grad_norm": 3.6685080528259277, "learning_rate": 9.678653653535353e-08, "logits/chosen": -2.8190901279449463, "logits/rejected": -2.795196294784546, "logps/chosen": -68.33075714111328, "logps/rejected": -72.10498046875, "loss": 0.6748, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.14418013393878937, "rewards/margins": 0.041199199855327606, "rewards/rejected": -0.185379296541214, "step": 3540 }, { "epoch": 0.6116471399035148, "grad_norm": 3.675966501235962, "learning_rate": 9.675108556254073e-08, "logits/chosen": -2.906829833984375, "logits/rejected": -2.897273540496826, "logps/chosen": -70.09661865234375, "logps/rejected": -70.79762268066406, "loss": 0.6817, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.15533465147018433, "rewards/margins": 0.027129217982292175, "rewards/rejected": -0.1824638545513153, "step": 3550 }, { "epoch": 0.6133700895933839, "grad_norm": 3.322143077850342, "learning_rate": 9.67154466782507e-08, "logits/chosen": -2.838775157928467, "logits/rejected": -2.81182599067688, "logps/chosen": -68.0738754272461, "logps/rejected": -72.13358306884766, "loss": 0.6731, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.15173934400081635, "rewards/margins": 0.04411458596587181, "rewards/rejected": -0.19585391879081726, "step": 3560 }, { "epoch": 0.6150930392832529, "grad_norm": 3.7309257984161377, "learning_rate": 9.667962002573053e-08, "logits/chosen": -2.9788858890533447, "logits/rejected": -2.9449925422668457, "logps/chosen": -72.99544525146484, "logps/rejected": -71.87519836425781, "loss": 0.6847, "rewards/accuracies": 0.625, "rewards/chosen": -0.161227285861969, "rewards/margins": 0.025302579626441002, "rewards/rejected": -0.18652985990047455, "step": 3570 }, { "epoch": 0.616815988973122, "grad_norm": 3.5933849811553955, "learning_rate": 9.664360574898196e-08, "logits/chosen": -2.914541721343994, "logits/rejected": -2.892376184463501, "logps/chosen": -72.05098724365234, "logps/rejected": -76.94429016113281, "loss": 0.6678, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1385619193315506, "rewards/margins": 0.05616595596075058, "rewards/rejected": -0.19472786784172058, "step": 3580 }, { "epoch": 0.618538938662991, "grad_norm": 3.436840534210205, "learning_rate": 9.660740399276092e-08, "logits/chosen": -2.9030489921569824, "logits/rejected": -2.887316942214966, "logps/chosen": -70.31509399414062, "logps/rejected": -74.77911376953125, "loss": 0.6715, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15890085697174072, "rewards/margins": 0.04903041571378708, "rewards/rejected": -0.2079312801361084, "step": 3590 }, { "epoch": 0.6202618883528601, "grad_norm": 3.779756546020508, "learning_rate": 9.657101490257689e-08, "logits/chosen": -2.8828721046447754, "logits/rejected": -2.8583245277404785, "logps/chosen": -69.12373352050781, "logps/rejected": -70.43314361572266, "loss": 0.6737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.14476682245731354, "rewards/margins": 0.043956391513347626, "rewards/rejected": -0.18872320652008057, "step": 3600 }, { "epoch": 0.6202618883528601, "eval_logits/chosen": -2.9870545864105225, "eval_logits/rejected": -2.9812123775482178, "eval_logps/chosen": -68.3294906616211, "eval_logps/rejected": -75.25022888183594, "eval_loss": 0.6823411583900452, "eval_rewards/accuracies": 0.6103624701499939, "eval_rewards/chosen": -0.09617592394351959, "eval_rewards/margins": 0.024525169283151627, "eval_rewards/rejected": -0.12070107460021973, "eval_runtime": 384.1503, "eval_samples_per_second": 11.204, "eval_steps_per_second": 1.4, "step": 3600 }, { "epoch": 0.6219848380427292, "grad_norm": 3.3747036457061768, "learning_rate": 9.653443862469226e-08, "logits/chosen": -2.8490610122680664, "logits/rejected": -2.8332598209381104, "logps/chosen": -68.02410888671875, "logps/rejected": -69.45539093017578, "loss": 0.6782, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.1653090864419937, "rewards/margins": 0.034825216978788376, "rewards/rejected": -0.2001343071460724, "step": 3610 }, { "epoch": 0.6237077877325982, "grad_norm": 3.6460654735565186, "learning_rate": 9.64976753061219e-08, "logits/chosen": -2.808055877685547, "logits/rejected": -2.783261299133301, "logps/chosen": -68.2891616821289, "logps/rejected": -72.53072357177734, "loss": 0.6646, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12781301140785217, "rewards/margins": 0.0615234375, "rewards/rejected": -0.18933644890785217, "step": 3620 }, { "epoch": 0.6254307374224672, "grad_norm": 3.360413074493408, "learning_rate": 9.646072509463239e-08, "logits/chosen": -2.961965560913086, "logits/rejected": -2.9553470611572266, "logps/chosen": -65.70463562011719, "logps/rejected": -76.64944458007812, "loss": 0.6637, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.14563927054405212, "rewards/margins": 0.06400388479232788, "rewards/rejected": -0.20964315533638, "step": 3630 }, { "epoch": 0.6271536871123363, "grad_norm": 3.8706581592559814, "learning_rate": 9.642358813874154e-08, "logits/chosen": -2.9124159812927246, "logits/rejected": -2.8981194496154785, "logps/chosen": -68.7247314453125, "logps/rejected": -75.15029907226562, "loss": 0.6693, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.13681621849536896, "rewards/margins": 0.052891455590724945, "rewards/rejected": -0.1897076666355133, "step": 3640 }, { "epoch": 0.6288766368022054, "grad_norm": 3.5387752056121826, "learning_rate": 9.638626458771779e-08, "logits/chosen": -2.863311767578125, "logits/rejected": -2.868051290512085, "logps/chosen": -65.095703125, "logps/rejected": -72.89458465576172, "loss": 0.6736, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1401418298482895, "rewards/margins": 0.0434744767844677, "rewards/rejected": -0.18361631035804749, "step": 3650 }, { "epoch": 0.6305995864920745, "grad_norm": 3.363511562347412, "learning_rate": 9.63487545915795e-08, "logits/chosen": -2.9155075550079346, "logits/rejected": -2.8857743740081787, "logps/chosen": -68.94444274902344, "logps/rejected": -74.99880981445312, "loss": 0.6639, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.15085703134536743, "rewards/margins": 0.06373803317546844, "rewards/rejected": -0.21459504961967468, "step": 3660 }, { "epoch": 0.6323225361819435, "grad_norm": 3.5878331661224365, "learning_rate": 9.631105830109454e-08, "logits/chosen": -2.8715896606445312, "logits/rejected": -2.844346046447754, "logps/chosen": -68.6483383178711, "logps/rejected": -72.62535095214844, "loss": 0.6744, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1579716056585312, "rewards/margins": 0.042805515229701996, "rewards/rejected": -0.20077712833881378, "step": 3670 }, { "epoch": 0.6340454858718125, "grad_norm": 3.7476389408111572, "learning_rate": 9.627317586777947e-08, "logits/chosen": -2.8907763957977295, "logits/rejected": -2.8515639305114746, "logps/chosen": -70.89676666259766, "logps/rejected": -70.56390380859375, "loss": 0.6751, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1622094213962555, "rewards/margins": 0.042420774698257446, "rewards/rejected": -0.20463021099567413, "step": 3680 }, { "epoch": 0.6357684355616816, "grad_norm": 4.531287670135498, "learning_rate": 9.623510744389908e-08, "logits/chosen": -2.827359199523926, "logits/rejected": -2.835468292236328, "logps/chosen": -67.43928527832031, "logps/rejected": -79.85739135742188, "loss": 0.6688, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.15345561504364014, "rewards/margins": 0.0541754886507988, "rewards/rejected": -0.20763111114501953, "step": 3690 }, { "epoch": 0.6374913852515507, "grad_norm": 3.7610597610473633, "learning_rate": 9.619685318246575e-08, "logits/chosen": -2.8696014881134033, "logits/rejected": -2.8439719676971436, "logps/chosen": -72.57151794433594, "logps/rejected": -80.30851745605469, "loss": 0.6664, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.14973986148834229, "rewards/margins": 0.05947665497660637, "rewards/rejected": -0.20921652019023895, "step": 3700 }, { "epoch": 0.6374913852515507, "eval_logits/chosen": -2.975109815597534, "eval_logits/rejected": -2.9692435264587402, "eval_logps/chosen": -69.21776580810547, "eval_logps/rejected": -76.31509399414062, "eval_loss": 0.6815912127494812, "eval_rewards/accuracies": 0.6080390214920044, "eval_rewards/chosen": -0.10505872219800949, "eval_rewards/margins": 0.026290999725461006, "eval_rewards/rejected": -0.13134972751140594, "eval_runtime": 384.2496, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 3700 }, { "epoch": 0.6392143349414197, "grad_norm": 3.9867801666259766, "learning_rate": 9.615841323723878e-08, "logits/chosen": -2.901616334915161, "logits/rejected": -2.869765043258667, "logps/chosen": -69.55196380615234, "logps/rejected": -71.0933609008789, "loss": 0.6735, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.15683287382125854, "rewards/margins": 0.04376794770359993, "rewards/rejected": -0.20060083270072937, "step": 3710 }, { "epoch": 0.6409372846312887, "grad_norm": 3.4861154556274414, "learning_rate": 9.611978776272381e-08, "logits/chosen": -2.914351224899292, "logits/rejected": -2.8966033458709717, "logps/chosen": -66.36556243896484, "logps/rejected": -74.12760925292969, "loss": 0.6645, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.14468802511692047, "rewards/margins": 0.06273495405912399, "rewards/rejected": -0.20742297172546387, "step": 3720 }, { "epoch": 0.6426602343211578, "grad_norm": 4.244858264923096, "learning_rate": 9.608097691417222e-08, "logits/chosen": -2.8747448921203613, "logits/rejected": -2.8519973754882812, "logps/chosen": -69.42605590820312, "logps/rejected": -72.1407470703125, "loss": 0.6638, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.14908427000045776, "rewards/margins": 0.06526695191860199, "rewards/rejected": -0.21435122191905975, "step": 3730 }, { "epoch": 0.6443831840110269, "grad_norm": 4.0733513832092285, "learning_rate": 9.604198084758046e-08, "logits/chosen": -2.8734517097473145, "logits/rejected": -2.8487000465393066, "logps/chosen": -68.9703369140625, "logps/rejected": -77.02600860595703, "loss": 0.6706, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.15535292029380798, "rewards/margins": 0.050812482833862305, "rewards/rejected": -0.2061654031276703, "step": 3740 }, { "epoch": 0.646106133700896, "grad_norm": 3.836585760116577, "learning_rate": 9.600279971968947e-08, "logits/chosen": -2.942082643508911, "logits/rejected": -2.913980722427368, "logps/chosen": -68.1339340209961, "logps/rejected": -73.91707611083984, "loss": 0.6659, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.14815273880958557, "rewards/margins": 0.06022592633962631, "rewards/rejected": -0.20837867259979248, "step": 3750 }, { "epoch": 0.647829083390765, "grad_norm": 4.055958271026611, "learning_rate": 9.5963433687984e-08, "logits/chosen": -2.8938567638397217, "logits/rejected": -2.8905863761901855, "logps/chosen": -72.90452575683594, "logps/rejected": -77.8410873413086, "loss": 0.6725, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.17030875384807587, "rewards/margins": 0.047505687922239304, "rewards/rejected": -0.21781444549560547, "step": 3760 }, { "epoch": 0.649552033080634, "grad_norm": 3.945086717605591, "learning_rate": 9.592388291069204e-08, "logits/chosen": -2.8639397621154785, "logits/rejected": -2.8437116146087646, "logps/chosen": -69.64740753173828, "logps/rejected": -74.47994232177734, "loss": 0.6764, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16105706989765167, "rewards/margins": 0.03824262320995331, "rewards/rejected": -0.19929969310760498, "step": 3770 }, { "epoch": 0.6512749827705031, "grad_norm": 3.452279806137085, "learning_rate": 9.588414754678408e-08, "logits/chosen": -2.910830020904541, "logits/rejected": -2.8731696605682373, "logps/chosen": -70.38741302490234, "logps/rejected": -70.97752380371094, "loss": 0.6696, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.16308808326721191, "rewards/margins": 0.05267634242773056, "rewards/rejected": -0.21576444804668427, "step": 3780 }, { "epoch": 0.6529979324603722, "grad_norm": 3.7020864486694336, "learning_rate": 9.584422775597263e-08, "logits/chosen": -2.864274263381958, "logits/rejected": -2.835930347442627, "logps/chosen": -69.42414093017578, "logps/rejected": -72.6201400756836, "loss": 0.6651, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.15436016023159027, "rewards/margins": 0.06193486973643303, "rewards/rejected": -0.2162950336933136, "step": 3790 }, { "epoch": 0.6547208821502413, "grad_norm": 4.76193380355835, "learning_rate": 9.58041236987114e-08, "logits/chosen": -2.881044864654541, "logits/rejected": -2.8496294021606445, "logps/chosen": -73.31177520751953, "logps/rejected": -74.33292388916016, "loss": 0.6667, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.156185120344162, "rewards/margins": 0.05846773460507393, "rewards/rejected": -0.21465285122394562, "step": 3800 }, { "epoch": 0.6547208821502413, "eval_logits/chosen": -2.965406656265259, "eval_logits/rejected": -2.9595389366149902, "eval_logps/chosen": -70.4286880493164, "eval_logps/rejected": -77.74009704589844, "eval_loss": 0.6807082891464233, "eval_rewards/accuracies": 0.6085036993026733, "eval_rewards/chosen": -0.11716797947883606, "eval_rewards/margins": 0.028431786224246025, "eval_rewards/rejected": -0.14559975266456604, "eval_runtime": 383.8312, "eval_samples_per_second": 11.213, "eval_steps_per_second": 1.402, "step": 3800 }, { "epoch": 0.6564438318401102, "grad_norm": 4.137098789215088, "learning_rate": 9.576383553619479e-08, "logits/chosen": -2.904186248779297, "logits/rejected": -2.8615362644195557, "logps/chosen": -74.40220642089844, "logps/rejected": -75.01345825195312, "loss": 0.6661, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.16934098303318024, "rewards/margins": 0.06015457957983017, "rewards/rejected": -0.229495570063591, "step": 3810 }, { "epoch": 0.6581667815299793, "grad_norm": 4.33560037612915, "learning_rate": 9.572336343035719e-08, "logits/chosen": -2.873201370239258, "logits/rejected": -2.856436252593994, "logps/chosen": -70.38566589355469, "logps/rejected": -74.04071044921875, "loss": 0.6723, "rewards/accuracies": 0.625, "rewards/chosen": -0.16287454962730408, "rewards/margins": 0.04665592685341835, "rewards/rejected": -0.20953047275543213, "step": 3820 }, { "epoch": 0.6598897312198484, "grad_norm": 4.001519203186035, "learning_rate": 9.56827075438723e-08, "logits/chosen": -2.917055606842041, "logits/rejected": -2.8728079795837402, "logps/chosen": -74.32755279541016, "logps/rejected": -73.45980834960938, "loss": 0.6637, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15650448203086853, "rewards/margins": 0.06449047476053238, "rewards/rejected": -0.2209949493408203, "step": 3830 }, { "epoch": 0.6616126809097175, "grad_norm": 4.411248683929443, "learning_rate": 9.564186804015257e-08, "logits/chosen": -2.8436412811279297, "logits/rejected": -2.830352544784546, "logps/chosen": -69.19142150878906, "logps/rejected": -81.37538146972656, "loss": 0.6606, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.15105029940605164, "rewards/margins": 0.0723867267370224, "rewards/rejected": -0.22343704104423523, "step": 3840 }, { "epoch": 0.6633356305995864, "grad_norm": 3.921731948852539, "learning_rate": 9.560084508334842e-08, "logits/chosen": -2.9667470455169678, "logits/rejected": -2.9518978595733643, "logps/chosen": -74.0916519165039, "logps/rejected": -75.1229019165039, "loss": 0.6728, "rewards/accuracies": 0.625, "rewards/chosen": -0.16790547966957092, "rewards/margins": 0.045664455741643906, "rewards/rejected": -0.21356992423534393, "step": 3850 }, { "epoch": 0.6650585802894555, "grad_norm": 4.0796003341674805, "learning_rate": 9.555963883834766e-08, "logits/chosen": -2.972275495529175, "logits/rejected": -2.9467124938964844, "logps/chosen": -73.37123107910156, "logps/rejected": -75.72817993164062, "loss": 0.6783, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.19190803170204163, "rewards/margins": 0.035946134477853775, "rewards/rejected": -0.2278541624546051, "step": 3860 }, { "epoch": 0.6667815299793246, "grad_norm": 4.614365100860596, "learning_rate": 9.551824947077482e-08, "logits/chosen": -2.8627209663391113, "logits/rejected": -2.839452028274536, "logps/chosen": -75.68873596191406, "logps/rejected": -78.63587188720703, "loss": 0.6656, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.16835929453372955, "rewards/margins": 0.061359524726867676, "rewards/rejected": -0.22971877455711365, "step": 3870 }, { "epoch": 0.6685044796691937, "grad_norm": 4.349843502044678, "learning_rate": 9.54766771469905e-08, "logits/chosen": -2.8957715034484863, "logits/rejected": -2.9004740715026855, "logps/chosen": -70.16437530517578, "logps/rejected": -76.49174499511719, "loss": 0.6771, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.1928536593914032, "rewards/margins": 0.03714747726917267, "rewards/rejected": -0.23000116646289825, "step": 3880 }, { "epoch": 0.6702274293590628, "grad_norm": 3.8690977096557617, "learning_rate": 9.54349220340906e-08, "logits/chosen": -2.9396841526031494, "logits/rejected": -2.9125778675079346, "logps/chosen": -72.10147857666016, "logps/rejected": -76.12156677246094, "loss": 0.6661, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.17915073037147522, "rewards/margins": 0.060593824833631516, "rewards/rejected": -0.23974457383155823, "step": 3890 }, { "epoch": 0.6719503790489317, "grad_norm": 4.07612943649292, "learning_rate": 9.539298429990581e-08, "logits/chosen": -2.9170002937316895, "logits/rejected": -2.862642765045166, "logps/chosen": -74.65745544433594, "logps/rejected": -73.78831481933594, "loss": 0.6678, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1782888025045395, "rewards/margins": 0.05697142332792282, "rewards/rejected": -0.2352602183818817, "step": 3900 }, { "epoch": 0.6719503790489317, "eval_logits/chosen": -2.9557979106903076, "eval_logits/rejected": -2.949904203414917, "eval_logps/chosen": -71.69707489013672, "eval_logps/rejected": -79.20467376708984, "eval_loss": 0.6799117922782898, "eval_rewards/accuracies": 0.6092007160186768, "eval_rewards/chosen": -0.12985172867774963, "eval_rewards/margins": 0.03039376623928547, "eval_rewards/rejected": -0.16024549305438995, "eval_runtime": 384.2444, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 3900 }, { "epoch": 0.6736733287388008, "grad_norm": 3.6909968852996826, "learning_rate": 9.535086411300076e-08, "logits/chosen": -2.889756202697754, "logits/rejected": -2.8741090297698975, "logps/chosen": -74.21575927734375, "logps/rejected": -79.89337158203125, "loss": 0.6629, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1857440173625946, "rewards/margins": 0.0671306923031807, "rewards/rejected": -0.2528747022151947, "step": 3910 }, { "epoch": 0.6753962784286699, "grad_norm": 4.169498443603516, "learning_rate": 9.53085616426735e-08, "logits/chosen": -2.9550678730010986, "logits/rejected": -2.9101521968841553, "logps/chosen": -75.76385498046875, "logps/rejected": -74.92377471923828, "loss": 0.6605, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.17094440758228302, "rewards/margins": 0.07381351292133331, "rewards/rejected": -0.24475789070129395, "step": 3920 }, { "epoch": 0.677119228118539, "grad_norm": 4.23738431930542, "learning_rate": 9.526607705895473e-08, "logits/chosen": -2.957963228225708, "logits/rejected": -2.9531877040863037, "logps/chosen": -70.16165924072266, "logps/rejected": -76.8001480102539, "loss": 0.6709, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1890384703874588, "rewards/margins": 0.051141876727342606, "rewards/rejected": -0.2401803433895111, "step": 3930 }, { "epoch": 0.6788421778084079, "grad_norm": 4.149570941925049, "learning_rate": 9.522341053260714e-08, "logits/chosen": -2.785499095916748, "logits/rejected": -2.766923189163208, "logps/chosen": -70.81736755371094, "logps/rejected": -76.40454864501953, "loss": 0.6692, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17317375540733337, "rewards/margins": 0.05389087274670601, "rewards/rejected": -0.22706463932991028, "step": 3940 }, { "epoch": 0.680565127498277, "grad_norm": 4.172847270965576, "learning_rate": 9.51805622351247e-08, "logits/chosen": -2.7846221923828125, "logits/rejected": -2.739029884338379, "logps/chosen": -73.34998321533203, "logps/rejected": -75.7305679321289, "loss": 0.662, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.17876625061035156, "rewards/margins": 0.06907214224338531, "rewards/rejected": -0.24783840775489807, "step": 3950 }, { "epoch": 0.6822880771881461, "grad_norm": 4.17586612701416, "learning_rate": 9.513753233873202e-08, "logits/chosen": -2.888617753982544, "logits/rejected": -2.889319658279419, "logps/chosen": -70.58680725097656, "logps/rejected": -85.16160583496094, "loss": 0.6589, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1788363754749298, "rewards/margins": 0.07883033901453018, "rewards/rejected": -0.2576667368412018, "step": 3960 }, { "epoch": 0.6840110268780152, "grad_norm": 4.24992561340332, "learning_rate": 9.50943210163836e-08, "logits/chosen": -2.8979947566986084, "logits/rejected": -2.8743720054626465, "logps/chosen": -72.25019836425781, "logps/rejected": -77.7898941040039, "loss": 0.6638, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18177303671836853, "rewards/margins": 0.06608869135379791, "rewards/rejected": -0.24786174297332764, "step": 3970 }, { "epoch": 0.6857339765678843, "grad_norm": 3.9834277629852295, "learning_rate": 9.505092844176322e-08, "logits/chosen": -2.776604652404785, "logits/rejected": -2.757155418395996, "logps/chosen": -72.45700073242188, "logps/rejected": -79.39900207519531, "loss": 0.6613, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.19117490947246552, "rewards/margins": 0.07083877176046371, "rewards/rejected": -0.26201367378234863, "step": 3980 }, { "epoch": 0.6874569262577532, "grad_norm": 4.078197479248047, "learning_rate": 9.500735478928307e-08, "logits/chosen": -2.8826117515563965, "logits/rejected": -2.869070529937744, "logps/chosen": -72.52119445800781, "logps/rejected": -79.67607116699219, "loss": 0.6704, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.19304506480693817, "rewards/margins": 0.051048628985881805, "rewards/rejected": -0.24409373104572296, "step": 3990 }, { "epoch": 0.6891798759476223, "grad_norm": 4.409865379333496, "learning_rate": 9.496360023408332e-08, "logits/chosen": -2.9498190879821777, "logits/rejected": -2.915428638458252, "logps/chosen": -76.47032165527344, "logps/rejected": -77.8676986694336, "loss": 0.6671, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1926608383655548, "rewards/margins": 0.05839105695486069, "rewards/rejected": -0.2510519027709961, "step": 4000 }, { "epoch": 0.6891798759476223, "eval_logits/chosen": -2.942638635635376, "eval_logits/rejected": -2.9367516040802, "eval_logps/chosen": -72.79254150390625, "eval_logps/rejected": -80.4742202758789, "eval_loss": 0.6792241334915161, "eval_rewards/accuracies": 0.6078066825866699, "eval_rewards/chosen": -0.1408064365386963, "eval_rewards/margins": 0.0321345329284668, "eval_rewards/rejected": -0.1729409396648407, "eval_runtime": 384.1559, "eval_samples_per_second": 11.204, "eval_steps_per_second": 1.4, "step": 4000 }, { "epoch": 0.6909028256374914, "grad_norm": 3.9781241416931152, "learning_rate": 9.491966495203114e-08, "logits/chosen": -2.8296196460723877, "logits/rejected": -2.8197121620178223, "logps/chosen": -67.74693298339844, "logps/rejected": -79.69145202636719, "loss": 0.6611, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.19727525115013123, "rewards/margins": 0.06946456432342529, "rewards/rejected": -0.26673978567123413, "step": 4010 }, { "epoch": 0.6926257753273605, "grad_norm": 4.76407527923584, "learning_rate": 9.487554911972019e-08, "logits/chosen": -2.837247371673584, "logits/rejected": -2.8239505290985107, "logps/chosen": -71.54335021972656, "logps/rejected": -78.17178344726562, "loss": 0.6679, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.19357839226722717, "rewards/margins": 0.05688212066888809, "rewards/rejected": -0.25046053528785706, "step": 4020 }, { "epoch": 0.6943487250172296, "grad_norm": 4.017004013061523, "learning_rate": 9.483125291446976e-08, "logits/chosen": -2.8376073837280273, "logits/rejected": -2.815830707550049, "logps/chosen": -73.77522277832031, "logps/rejected": -80.86968231201172, "loss": 0.6594, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17721232771873474, "rewards/margins": 0.07356409728527069, "rewards/rejected": -0.250776469707489, "step": 4030 }, { "epoch": 0.6960716747070985, "grad_norm": 4.465822219848633, "learning_rate": 9.478677651432421e-08, "logits/chosen": -2.9229576587677, "logits/rejected": -2.9110991954803467, "logps/chosen": -75.74259948730469, "logps/rejected": -79.6421127319336, "loss": 0.6665, "rewards/accuracies": 0.625, "rewards/chosen": -0.1955038160085678, "rewards/margins": 0.06042812392115593, "rewards/rejected": -0.25593194365501404, "step": 4040 }, { "epoch": 0.6977946243969676, "grad_norm": 4.073046684265137, "learning_rate": 9.47421200980521e-08, "logits/chosen": -2.8303213119506836, "logits/rejected": -2.812896728515625, "logps/chosen": -69.10895538330078, "logps/rejected": -78.77720642089844, "loss": 0.6604, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1854352056980133, "rewards/margins": 0.07410334050655365, "rewards/rejected": -0.25953856110572815, "step": 4050 }, { "epoch": 0.6995175740868367, "grad_norm": 4.1511430740356445, "learning_rate": 9.469728384514561e-08, "logits/chosen": -2.8318750858306885, "logits/rejected": -2.7997710704803467, "logps/chosen": -77.20620727539062, "logps/rejected": -79.17239379882812, "loss": 0.6659, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2084653079509735, "rewards/margins": 0.06185628101229668, "rewards/rejected": -0.2703215479850769, "step": 4060 }, { "epoch": 0.7012405237767058, "grad_norm": 4.208459377288818, "learning_rate": 9.465226793581974e-08, "logits/chosen": -2.8345329761505127, "logits/rejected": -2.812391996383667, "logps/chosen": -74.58561706542969, "logps/rejected": -80.38744354248047, "loss": 0.6628, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.20753717422485352, "rewards/margins": 0.0675569549202919, "rewards/rejected": -0.2750941514968872, "step": 4070 }, { "epoch": 0.7029634734665747, "grad_norm": 4.178997993469238, "learning_rate": 9.460707255101159e-08, "logits/chosen": -2.8007960319519043, "logits/rejected": -2.7930490970611572, "logps/chosen": -71.29439544677734, "logps/rejected": -80.11695861816406, "loss": 0.6632, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1939457356929779, "rewards/margins": 0.06700269132852554, "rewards/rejected": -0.26094844937324524, "step": 4080 }, { "epoch": 0.7046864231564438, "grad_norm": 4.075286865234375, "learning_rate": 9.456169787237962e-08, "logits/chosen": -2.8844974040985107, "logits/rejected": -2.85430908203125, "logps/chosen": -73.63328552246094, "logps/rejected": -79.8232650756836, "loss": 0.6612, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.18156015872955322, "rewards/margins": 0.07141424715518951, "rewards/rejected": -0.25297439098358154, "step": 4090 }, { "epoch": 0.7064093728463129, "grad_norm": 3.9757399559020996, "learning_rate": 9.451614408230299e-08, "logits/chosen": -2.8377492427825928, "logits/rejected": -2.8096656799316406, "logps/chosen": -70.15831756591797, "logps/rejected": -74.91976165771484, "loss": 0.6554, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17314878106117249, "rewards/margins": 0.08360201865434647, "rewards/rejected": -0.25675076246261597, "step": 4100 }, { "epoch": 0.7064093728463129, "eval_logits/chosen": -2.9237635135650635, "eval_logits/rejected": -2.917879343032837, "eval_logps/chosen": -73.29619598388672, "eval_logps/rejected": -81.09249877929688, "eval_loss": 0.678745448589325, "eval_rewards/accuracies": 0.6119888424873352, "eval_rewards/chosen": -0.14584296941757202, "eval_rewards/margins": 0.0332808755338192, "eval_rewards/rejected": -0.17912384867668152, "eval_runtime": 384.2587, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 4100 }, { "epoch": 0.708132322536182, "grad_norm": 4.0176496505737305, "learning_rate": 9.447041136388078e-08, "logits/chosen": -2.805227756500244, "logits/rejected": -2.79392671585083, "logps/chosen": -77.64244842529297, "logps/rejected": -77.17851257324219, "loss": 0.6761, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.21580031514167786, "rewards/margins": 0.04095283895730972, "rewards/rejected": -0.256753146648407, "step": 4110 }, { "epoch": 0.709855272226051, "grad_norm": 4.602128505706787, "learning_rate": 9.442449990093124e-08, "logits/chosen": -2.7617733478546143, "logits/rejected": -2.7641284465789795, "logps/chosen": -71.70056915283203, "logps/rejected": -81.80654907226562, "loss": 0.6707, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.21735677123069763, "rewards/margins": 0.052834171801805496, "rewards/rejected": -0.270190954208374, "step": 4120 }, { "epoch": 0.71157822191592, "grad_norm": 4.171328544616699, "learning_rate": 9.437840987799104e-08, "logits/chosen": -2.861532688140869, "logits/rejected": -2.8389453887939453, "logps/chosen": -72.39353942871094, "logps/rejected": -79.54823303222656, "loss": 0.6537, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.18425598740577698, "rewards/margins": 0.08589592576026917, "rewards/rejected": -0.27015191316604614, "step": 4130 }, { "epoch": 0.7133011716057891, "grad_norm": 3.955057144165039, "learning_rate": 9.433214148031458e-08, "logits/chosen": -2.8611323833465576, "logits/rejected": -2.8340659141540527, "logps/chosen": -77.03531646728516, "logps/rejected": -75.87199401855469, "loss": 0.6784, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.21509671211242676, "rewards/margins": 0.036592431366443634, "rewards/rejected": -0.2516891360282898, "step": 4140 }, { "epoch": 0.7150241212956582, "grad_norm": 4.342238903045654, "learning_rate": 9.428569489387324e-08, "logits/chosen": -2.8826565742492676, "logits/rejected": -2.856748104095459, "logps/chosen": -78.09243774414062, "logps/rejected": -79.34883880615234, "loss": 0.66, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20250776410102844, "rewards/margins": 0.07274194061756134, "rewards/rejected": -0.2752496898174286, "step": 4150 }, { "epoch": 0.7167470709855273, "grad_norm": 4.166910171508789, "learning_rate": 9.423907030535459e-08, "logits/chosen": -2.776376962661743, "logits/rejected": -2.746248483657837, "logps/chosen": -72.65409088134766, "logps/rejected": -76.98197937011719, "loss": 0.6624, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.20975884795188904, "rewards/margins": 0.06894242018461227, "rewards/rejected": -0.2787012755870819, "step": 4160 }, { "epoch": 0.7184700206753962, "grad_norm": 4.668094158172607, "learning_rate": 9.419226790216164e-08, "logits/chosen": -2.828092098236084, "logits/rejected": -2.820021867752075, "logps/chosen": -71.60343933105469, "logps/rejected": -80.28787994384766, "loss": 0.6641, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.20207414031028748, "rewards/margins": 0.06495118886232376, "rewards/rejected": -0.267025351524353, "step": 4170 }, { "epoch": 0.7201929703652653, "grad_norm": 4.634207248687744, "learning_rate": 9.414528787241215e-08, "logits/chosen": -2.8247077465057373, "logits/rejected": -2.8097341060638428, "logps/chosen": -75.35960388183594, "logps/rejected": -82.92457580566406, "loss": 0.6639, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.200140118598938, "rewards/margins": 0.06621621549129486, "rewards/rejected": -0.26635634899139404, "step": 4180 }, { "epoch": 0.7219159200551344, "grad_norm": 4.70818567276001, "learning_rate": 9.409813040493783e-08, "logits/chosen": -2.8063673973083496, "logits/rejected": -2.7928805351257324, "logps/chosen": -72.40957641601562, "logps/rejected": -80.8687515258789, "loss": 0.6617, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21341435611248016, "rewards/margins": 0.07006919384002686, "rewards/rejected": -0.2834835648536682, "step": 4190 }, { "epoch": 0.7236388697450035, "grad_norm": 4.3420634269714355, "learning_rate": 9.405079568928355e-08, "logits/chosen": -2.856837272644043, "logits/rejected": -2.8410205841064453, "logps/chosen": -76.31224822998047, "logps/rejected": -77.55537414550781, "loss": 0.6742, "rewards/accuracies": 0.625, "rewards/chosen": -0.20605771243572235, "rewards/margins": 0.043607138097286224, "rewards/rejected": -0.24966482818126678, "step": 4200 }, { "epoch": 0.7236388697450035, "eval_logits/chosen": -2.910283327102661, "eval_logits/rejected": -2.904351234436035, "eval_logps/chosen": -74.51010131835938, "eval_logps/rejected": -82.50054168701172, "eval_loss": 0.677975594997406, "eval_rewards/accuracies": 0.6126858592033386, "eval_rewards/chosen": -0.15798205137252808, "eval_rewards/margins": 0.035222191363573074, "eval_rewards/rejected": -0.19320423901081085, "eval_runtime": 384.0793, "eval_samples_per_second": 11.206, "eval_steps_per_second": 1.401, "step": 4200 }, { "epoch": 0.7253618194348725, "grad_norm": 4.6862335205078125, "learning_rate": 9.400328391570665e-08, "logits/chosen": -2.8391900062561035, "logits/rejected": -2.817417621612549, "logps/chosen": -77.92698669433594, "logps/rejected": -80.89947509765625, "loss": 0.6699, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2263023406267166, "rewards/margins": 0.05494243651628494, "rewards/rejected": -0.28124481439590454, "step": 4210 }, { "epoch": 0.7270847691247415, "grad_norm": 4.127202987670898, "learning_rate": 9.395559527517611e-08, "logits/chosen": -2.6965038776397705, "logits/rejected": -2.6781580448150635, "logps/chosen": -72.81075286865234, "logps/rejected": -80.09992218017578, "loss": 0.663, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2059207409620285, "rewards/margins": 0.06802737712860107, "rewards/rejected": -0.27394813299179077, "step": 4220 }, { "epoch": 0.7288077188146106, "grad_norm": 4.130259037017822, "learning_rate": 9.390772995937181e-08, "logits/chosen": -2.875835418701172, "logits/rejected": -2.8564863204956055, "logps/chosen": -76.85389709472656, "logps/rejected": -83.16339874267578, "loss": 0.6601, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2154783308506012, "rewards/margins": 0.07515918463468552, "rewards/rejected": -0.2906374931335449, "step": 4230 }, { "epoch": 0.7305306685044797, "grad_norm": 4.457327842712402, "learning_rate": 9.385968816068377e-08, "logits/chosen": -2.7803921699523926, "logits/rejected": -2.7560620307922363, "logps/chosen": -76.09105682373047, "logps/rejected": -85.15662384033203, "loss": 0.6585, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.21346938610076904, "rewards/margins": 0.07855705916881561, "rewards/rejected": -0.29202643036842346, "step": 4240 }, { "epoch": 0.7322536181943488, "grad_norm": 4.516713619232178, "learning_rate": 9.381147007221137e-08, "logits/chosen": -2.7942028045654297, "logits/rejected": -2.778221368789673, "logps/chosen": -76.53819274902344, "logps/rejected": -79.68205261230469, "loss": 0.6698, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.21489393711090088, "rewards/margins": 0.05524193122982979, "rewards/rejected": -0.27013587951660156, "step": 4250 }, { "epoch": 0.7339765678842178, "grad_norm": 4.434290885925293, "learning_rate": 9.376307588776258e-08, "logits/chosen": -2.760439395904541, "logits/rejected": -2.7457115650177, "logps/chosen": -75.36550903320312, "logps/rejected": -82.37860107421875, "loss": 0.6642, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.20837347209453583, "rewards/margins": 0.06687536090612411, "rewards/rejected": -0.27524879574775696, "step": 4260 }, { "epoch": 0.7356995175740868, "grad_norm": 4.595795631408691, "learning_rate": 9.371450580185314e-08, "logits/chosen": -2.7766385078430176, "logits/rejected": -2.751711130142212, "logps/chosen": -70.01823425292969, "logps/rejected": -76.9112319946289, "loss": 0.6573, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.18409791588783264, "rewards/margins": 0.07954321801662445, "rewards/rejected": -0.2636410892009735, "step": 4270 }, { "epoch": 0.7374224672639559, "grad_norm": 4.3390727043151855, "learning_rate": 9.366576000970581e-08, "logits/chosen": -2.7538180351257324, "logits/rejected": -2.7279274463653564, "logps/chosen": -72.8944320678711, "logps/rejected": -80.99021911621094, "loss": 0.6598, "rewards/accuracies": 0.625, "rewards/chosen": -0.21176910400390625, "rewards/margins": 0.07757888734340668, "rewards/rejected": -0.2893480062484741, "step": 4280 }, { "epoch": 0.739145416953825, "grad_norm": 4.207832336425781, "learning_rate": 9.36168387072496e-08, "logits/chosen": -2.793666362762451, "logits/rejected": -2.7704033851623535, "logps/chosen": -77.56449890136719, "logps/rejected": -80.96661376953125, "loss": 0.6632, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22324618697166443, "rewards/margins": 0.07027504593133926, "rewards/rejected": -0.2935212254524231, "step": 4290 }, { "epoch": 0.740868366643694, "grad_norm": 4.246079444885254, "learning_rate": 9.356774209111899e-08, "logits/chosen": -2.8101134300231934, "logits/rejected": -2.7990527153015137, "logps/chosen": -74.55792999267578, "logps/rejected": -81.8268051147461, "loss": 0.6632, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.22001609206199646, "rewards/margins": 0.06760163605213165, "rewards/rejected": -0.2876177430152893, "step": 4300 }, { "epoch": 0.740868366643694, "eval_logits/chosen": -2.8992130756378174, "eval_logits/rejected": -2.893341541290283, "eval_logps/chosen": -75.42848205566406, "eval_logps/rejected": -83.55915832519531, "eval_loss": 0.6774368286132812, "eval_rewards/accuracies": 0.6078066825866699, "eval_rewards/chosen": -0.1671658307313919, "eval_rewards/margins": 0.03662450611591339, "eval_rewards/rejected": -0.20379036664962769, "eval_runtime": 384.3237, "eval_samples_per_second": 11.199, "eval_steps_per_second": 1.4, "step": 4300 }, { "epoch": 0.742591316333563, "grad_norm": 4.788305759429932, "learning_rate": 9.351847035865306e-08, "logits/chosen": -2.7564914226531982, "logits/rejected": -2.7381021976470947, "logps/chosen": -75.6027603149414, "logps/rejected": -80.18602752685547, "loss": 0.6617, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.21550814807415009, "rewards/margins": 0.07047909498214722, "rewards/rejected": -0.2859872281551361, "step": 4310 }, { "epoch": 0.7443142660234321, "grad_norm": 4.23420524597168, "learning_rate": 9.346902370789482e-08, "logits/chosen": -2.8363332748413086, "logits/rejected": -2.8082027435302734, "logps/chosen": -81.27455139160156, "logps/rejected": -87.58678436279297, "loss": 0.6467, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.22632554173469543, "rewards/margins": 0.10277044773101807, "rewards/rejected": -0.3290959596633911, "step": 4320 }, { "epoch": 0.7460372157133012, "grad_norm": 4.526897430419922, "learning_rate": 9.341940233759028e-08, "logits/chosen": -2.7430930137634277, "logits/rejected": -2.7125186920166016, "logps/chosen": -79.66510009765625, "logps/rejected": -79.03765106201172, "loss": 0.6719, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.24174313247203827, "rewards/margins": 0.05070701986551285, "rewards/rejected": -0.2924501299858093, "step": 4330 }, { "epoch": 0.7477601654031703, "grad_norm": 4.141915798187256, "learning_rate": 9.336960644718777e-08, "logits/chosen": -2.7470693588256836, "logits/rejected": -2.728968381881714, "logps/chosen": -71.93878173828125, "logps/rejected": -82.00851440429688, "loss": 0.6557, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.22942455112934113, "rewards/margins": 0.08567793667316437, "rewards/rejected": -0.3151025176048279, "step": 4340 }, { "epoch": 0.7494831150930393, "grad_norm": 4.8447980880737305, "learning_rate": 9.331963623683704e-08, "logits/chosen": -2.7558722496032715, "logits/rejected": -2.7415316104888916, "logps/chosen": -72.0338134765625, "logps/rejected": -81.75230407714844, "loss": 0.6597, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.21023979783058167, "rewards/margins": 0.07532364875078201, "rewards/rejected": -0.28556346893310547, "step": 4350 }, { "epoch": 0.7512060647829083, "grad_norm": 4.503192901611328, "learning_rate": 9.326949190738855e-08, "logits/chosen": -2.7989554405212402, "logits/rejected": -2.7777504920959473, "logps/chosen": -78.94058990478516, "logps/rejected": -83.19453430175781, "loss": 0.6669, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2526894211769104, "rewards/margins": 0.0627552792429924, "rewards/rejected": -0.315444678068161, "step": 4360 }, { "epoch": 0.7529290144727774, "grad_norm": 4.493520736694336, "learning_rate": 9.32191736603926e-08, "logits/chosen": -2.8138620853424072, "logits/rejected": -2.799654722213745, "logps/chosen": -78.1859130859375, "logps/rejected": -85.24605560302734, "loss": 0.6617, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2322731763124466, "rewards/margins": 0.072199746966362, "rewards/rejected": -0.3044729232788086, "step": 4370 }, { "epoch": 0.7546519641626465, "grad_norm": 4.596256256103516, "learning_rate": 9.316868169809851e-08, "logits/chosen": -2.835930824279785, "logits/rejected": -2.8130176067352295, "logps/chosen": -78.02940368652344, "logps/rejected": -80.80790710449219, "loss": 0.6731, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2372584044933319, "rewards/margins": 0.04902643710374832, "rewards/rejected": -0.286284863948822, "step": 4380 }, { "epoch": 0.7563749138525155, "grad_norm": 4.191018104553223, "learning_rate": 9.311801622345386e-08, "logits/chosen": -2.766883134841919, "logits/rejected": -2.753368377685547, "logps/chosen": -76.84660339355469, "logps/rejected": -84.39913177490234, "loss": 0.6607, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.23609964549541473, "rewards/margins": 0.07291743904352188, "rewards/rejected": -0.3090170919895172, "step": 4390 }, { "epoch": 0.7580978635423845, "grad_norm": 4.63446044921875, "learning_rate": 9.306717744010364e-08, "logits/chosen": -2.783541440963745, "logits/rejected": -2.7666807174682617, "logps/chosen": -79.99366760253906, "logps/rejected": -85.1405258178711, "loss": 0.6639, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2308838814496994, "rewards/margins": 0.06631298363208771, "rewards/rejected": -0.2971968650817871, "step": 4400 }, { "epoch": 0.7580978635423845, "eval_logits/chosen": -2.886715888977051, "eval_logits/rejected": -2.8807601928710938, "eval_logps/chosen": -76.96532440185547, "eval_logps/rejected": -85.3311996459961, "eval_loss": 0.6765131950378418, "eval_rewards/accuracies": 0.6064126491546631, "eval_rewards/chosen": -0.18253430724143982, "eval_rewards/margins": 0.03897644206881523, "eval_rewards/rejected": -0.22151076793670654, "eval_runtime": 383.9473, "eval_samples_per_second": 11.21, "eval_steps_per_second": 1.401, "step": 4400 }, { "epoch": 0.7598208132322536, "grad_norm": 4.555375099182129, "learning_rate": 9.301616555238942e-08, "logits/chosen": -2.746675968170166, "logits/rejected": -2.7307605743408203, "logps/chosen": -80.5545654296875, "logps/rejected": -87.68020629882812, "loss": 0.6631, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2566290497779846, "rewards/margins": 0.06986650824546814, "rewards/rejected": -0.32649558782577515, "step": 4410 }, { "epoch": 0.7615437629221227, "grad_norm": 5.5577073097229, "learning_rate": 9.296498076534858e-08, "logits/chosen": -2.8607311248779297, "logits/rejected": -2.818080186843872, "logps/chosen": -79.47749328613281, "logps/rejected": -82.02799987792969, "loss": 0.6641, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.24661484360694885, "rewards/margins": 0.06896611303091049, "rewards/rejected": -0.31558096408843994, "step": 4420 }, { "epoch": 0.7632667126119917, "grad_norm": 3.7632880210876465, "learning_rate": 9.291362328471341e-08, "logits/chosen": -2.752110004425049, "logits/rejected": -2.721153974533081, "logps/chosen": -78.45207214355469, "logps/rejected": -82.0703125, "loss": 0.6681, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.24669401347637177, "rewards/margins": 0.06080010533332825, "rewards/rejected": -0.3074941039085388, "step": 4430 }, { "epoch": 0.7649896623018608, "grad_norm": 5.201669216156006, "learning_rate": 9.286209331691037e-08, "logits/chosen": -2.8158178329467773, "logits/rejected": -2.7852184772491455, "logps/chosen": -85.32269287109375, "logps/rejected": -90.01795959472656, "loss": 0.6531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2592065930366516, "rewards/margins": 0.09074309468269348, "rewards/rejected": -0.3499496877193451, "step": 4440 }, { "epoch": 0.7667126119917298, "grad_norm": 4.5300140380859375, "learning_rate": 9.281039106905916e-08, "logits/chosen": -2.6885523796081543, "logits/rejected": -2.6770710945129395, "logps/chosen": -80.5747299194336, "logps/rejected": -85.92475128173828, "loss": 0.664, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.24859464168548584, "rewards/margins": 0.06878234446048737, "rewards/rejected": -0.3173769414424896, "step": 4450 }, { "epoch": 0.7684355616815989, "grad_norm": 4.886305332183838, "learning_rate": 9.275851674897203e-08, "logits/chosen": -2.802295446395874, "logits/rejected": -2.795485019683838, "logps/chosen": -77.82582092285156, "logps/rejected": -84.43071746826172, "loss": 0.665, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.25093844532966614, "rewards/margins": 0.0667717307806015, "rewards/rejected": -0.3177102208137512, "step": 4460 }, { "epoch": 0.770158511371468, "grad_norm": 4.877972602844238, "learning_rate": 9.270647056515275e-08, "logits/chosen": -2.842092990875244, "logits/rejected": -2.81296443939209, "logps/chosen": -81.16618347167969, "logps/rejected": -83.69815063476562, "loss": 0.6568, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2524419128894806, "rewards/margins": 0.08215296268463135, "rewards/rejected": -0.3345949053764343, "step": 4470 }, { "epoch": 0.771881461061337, "grad_norm": 5.280404090881348, "learning_rate": 9.265425272679596e-08, "logits/chosen": -2.8654842376708984, "logits/rejected": -2.858914613723755, "logps/chosen": -77.5667495727539, "logps/rejected": -84.89839172363281, "loss": 0.6677, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.26555973291397095, "rewards/margins": 0.060958124697208405, "rewards/rejected": -0.32651782035827637, "step": 4480 }, { "epoch": 0.7736044107512061, "grad_norm": 4.828038692474365, "learning_rate": 9.260186344378623e-08, "logits/chosen": -2.7557806968688965, "logits/rejected": -2.7294392585754395, "logps/chosen": -78.39535522460938, "logps/rejected": -83.92191314697266, "loss": 0.6673, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2725452780723572, "rewards/margins": 0.06830044835805893, "rewards/rejected": -0.3408457636833191, "step": 4490 }, { "epoch": 0.7753273604410751, "grad_norm": 5.012576103210449, "learning_rate": 9.254930292669723e-08, "logits/chosen": -2.7692604064941406, "logits/rejected": -2.7539470195770264, "logps/chosen": -82.85484313964844, "logps/rejected": -86.92982482910156, "loss": 0.6617, "rewards/accuracies": 0.65625, "rewards/chosen": -0.265195369720459, "rewards/margins": 0.07446925342082977, "rewards/rejected": -0.33966463804244995, "step": 4500 }, { "epoch": 0.7753273604410751, "eval_logits/chosen": -2.8763012886047363, "eval_logits/rejected": -2.8703606128692627, "eval_logps/chosen": -78.81832885742188, "eval_logps/rejected": -87.49480438232422, "eval_loss": 0.6753003597259521, "eval_rewards/accuracies": 0.6078066825866699, "eval_rewards/chosen": -0.20106428861618042, "eval_rewards/margins": 0.04208245500922203, "eval_rewards/rejected": -0.24314673244953156, "eval_runtime": 383.819, "eval_samples_per_second": 11.214, "eval_steps_per_second": 1.402, "step": 4500 }, { "epoch": 0.7770503101309442, "grad_norm": 5.750226020812988, "learning_rate": 9.249657138679084e-08, "logits/chosen": -2.8761849403381348, "logits/rejected": -2.849595308303833, "logps/chosen": -79.1395034790039, "logps/rejected": -90.27713012695312, "loss": 0.6537, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2570708990097046, "rewards/margins": 0.08952498435974121, "rewards/rejected": -0.3465958535671234, "step": 4510 }, { "epoch": 0.7787732598208132, "grad_norm": 5.236589431762695, "learning_rate": 9.244366903601644e-08, "logits/chosen": -2.8129701614379883, "logits/rejected": -2.792578935623169, "logps/chosen": -80.64488983154297, "logps/rejected": -83.59857177734375, "loss": 0.6711, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.28046828508377075, "rewards/margins": 0.054709434509277344, "rewards/rejected": -0.3351777493953705, "step": 4520 }, { "epoch": 0.7804962095106823, "grad_norm": 5.145355224609375, "learning_rate": 9.239059608700992e-08, "logits/chosen": -2.802945613861084, "logits/rejected": -2.7939629554748535, "logps/chosen": -80.53050231933594, "logps/rejected": -84.86817932128906, "loss": 0.6752, "rewards/accuracies": 0.59375, "rewards/chosen": -0.26756101846694946, "rewards/margins": 0.04630138352513313, "rewards/rejected": -0.3138624131679535, "step": 4530 }, { "epoch": 0.7822191592005513, "grad_norm": 5.845473289489746, "learning_rate": 9.233735275309287e-08, "logits/chosen": -2.715578556060791, "logits/rejected": -2.703730821609497, "logps/chosen": -78.3282241821289, "logps/rejected": -84.03981018066406, "loss": 0.6606, "rewards/accuracies": 0.625, "rewards/chosen": -0.25596433877944946, "rewards/margins": 0.07508457452058792, "rewards/rejected": -0.3310489058494568, "step": 4540 }, { "epoch": 0.7839421088904204, "grad_norm": 5.710216522216797, "learning_rate": 9.228393924827173e-08, "logits/chosen": -2.813201427459717, "logits/rejected": -2.790335178375244, "logps/chosen": -81.75581359863281, "logps/rejected": -87.93441009521484, "loss": 0.6531, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2539733052253723, "rewards/margins": 0.09145532548427582, "rewards/rejected": -0.34542861580848694, "step": 4550 }, { "epoch": 0.7856650585802895, "grad_norm": 4.682338237762451, "learning_rate": 9.223035578723695e-08, "logits/chosen": -2.7215328216552734, "logits/rejected": -2.6849465370178223, "logps/chosen": -80.07984161376953, "logps/rejected": -87.59647369384766, "loss": 0.6457, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.23862795531749725, "rewards/margins": 0.1064465194940567, "rewards/rejected": -0.34507447481155396, "step": 4560 }, { "epoch": 0.7873880082701585, "grad_norm": 5.254011631011963, "learning_rate": 9.217660258536204e-08, "logits/chosen": -2.7420716285705566, "logits/rejected": -2.7178990840911865, "logps/chosen": -78.91129302978516, "logps/rejected": -87.96128845214844, "loss": 0.6581, "rewards/accuracies": 0.6875, "rewards/chosen": -0.278209388256073, "rewards/margins": 0.08387064933776855, "rewards/rejected": -0.36208003759384155, "step": 4570 }, { "epoch": 0.7891109579600276, "grad_norm": 5.115887641906738, "learning_rate": 9.212267985870285e-08, "logits/chosen": -2.71126127243042, "logits/rejected": -2.685075283050537, "logps/chosen": -77.27324676513672, "logps/rejected": -83.13511657714844, "loss": 0.6563, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23013631999492645, "rewards/margins": 0.08359868824481964, "rewards/rejected": -0.31373506784439087, "step": 4580 }, { "epoch": 0.7908339076498966, "grad_norm": 5.144996166229248, "learning_rate": 9.206858782399655e-08, "logits/chosen": -2.80875301361084, "logits/rejected": -2.7820239067077637, "logps/chosen": -82.58045959472656, "logps/rejected": -85.51569366455078, "loss": 0.6709, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2769617736339569, "rewards/margins": 0.056131016463041306, "rewards/rejected": -0.3330928087234497, "step": 4590 }, { "epoch": 0.7925568573397657, "grad_norm": 5.189516067504883, "learning_rate": 9.201432669866086e-08, "logits/chosen": -2.691650390625, "logits/rejected": -2.6723177433013916, "logps/chosen": -81.37686157226562, "logps/rejected": -93.39409637451172, "loss": 0.6446, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.28011757135391235, "rewards/margins": 0.11031317710876465, "rewards/rejected": -0.390430748462677, "step": 4600 }, { "epoch": 0.7925568573397657, "eval_logits/chosen": -2.8663530349731445, "eval_logits/rejected": -2.8603720664978027, "eval_logps/chosen": -80.55084228515625, "eval_logps/rejected": -89.5165023803711, "eval_loss": 0.6741741299629211, "eval_rewards/accuracies": 0.6080390214920044, "eval_rewards/chosen": -0.21838943660259247, "eval_rewards/margins": 0.04497431963682175, "eval_rewards/rejected": -0.2633637487888336, "eval_runtime": 384.2791, "eval_samples_per_second": 11.2, "eval_steps_per_second": 1.4, "step": 4600 }, { "epoch": 0.7942798070296347, "grad_norm": 4.885702133178711, "learning_rate": 9.195989670079314e-08, "logits/chosen": -2.7197561264038086, "logits/rejected": -2.7096469402313232, "logps/chosen": -82.00727844238281, "logps/rejected": -86.39524841308594, "loss": 0.6754, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.3086933493614197, "rewards/margins": 0.04578755795955658, "rewards/rejected": -0.35448089241981506, "step": 4610 }, { "epoch": 0.7960027567195038, "grad_norm": 6.132620811462402, "learning_rate": 9.190529804916952e-08, "logits/chosen": -2.755788564682007, "logits/rejected": -2.732422351837158, "logps/chosen": -82.48219299316406, "logps/rejected": -91.32025146484375, "loss": 0.6536, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2676346004009247, "rewards/margins": 0.09044405072927475, "rewards/rejected": -0.35807862877845764, "step": 4620 }, { "epoch": 0.7977257064093728, "grad_norm": 5.214467525482178, "learning_rate": 9.1850530963244e-08, "logits/chosen": -2.789654016494751, "logits/rejected": -2.756521701812744, "logps/chosen": -85.28939056396484, "logps/rejected": -92.73825073242188, "loss": 0.6575, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2800571024417877, "rewards/margins": 0.08137436211109161, "rewards/rejected": -0.3614314794540405, "step": 4630 }, { "epoch": 0.7994486560992419, "grad_norm": 5.4766459465026855, "learning_rate": 9.179559566314761e-08, "logits/chosen": -2.7899534702301025, "logits/rejected": -2.7753140926361084, "logps/chosen": -87.00694274902344, "logps/rejected": -94.41820526123047, "loss": 0.6661, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3225233852863312, "rewards/margins": 0.06614801287651062, "rewards/rejected": -0.3886713981628418, "step": 4640 }, { "epoch": 0.801171605789111, "grad_norm": 5.328300476074219, "learning_rate": 9.174049236968749e-08, "logits/chosen": -2.790173053741455, "logits/rejected": -2.758429527282715, "logps/chosen": -83.92033386230469, "logps/rejected": -89.0642318725586, "loss": 0.6567, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2976953685283661, "rewards/margins": 0.08515827357769012, "rewards/rejected": -0.3828536570072174, "step": 4650 }, { "epoch": 0.80289455547898, "grad_norm": 5.838435173034668, "learning_rate": 9.168522130434598e-08, "logits/chosen": -2.781099557876587, "logits/rejected": -2.763733148574829, "logps/chosen": -82.22652435302734, "logps/rejected": -90.64422607421875, "loss": 0.6559, "rewards/accuracies": 0.6875, "rewards/chosen": -0.27919262647628784, "rewards/margins": 0.08446256816387177, "rewards/rejected": -0.3636552095413208, "step": 4660 }, { "epoch": 0.8046175051688491, "grad_norm": 4.6828508377075195, "learning_rate": 9.162978268927982e-08, "logits/chosen": -2.772085666656494, "logits/rejected": -2.7481086254119873, "logps/chosen": -80.07984161376953, "logps/rejected": -84.9217529296875, "loss": 0.6576, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2765248417854309, "rewards/margins": 0.08171749860048294, "rewards/rejected": -0.35824230313301086, "step": 4670 }, { "epoch": 0.8063404548587181, "grad_norm": 5.070804595947266, "learning_rate": 9.157417674731917e-08, "logits/chosen": -2.758668899536133, "logits/rejected": -2.729013204574585, "logps/chosen": -81.76396179199219, "logps/rejected": -88.8053207397461, "loss": 0.6605, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2927338480949402, "rewards/margins": 0.07841964066028595, "rewards/rejected": -0.3711535334587097, "step": 4680 }, { "epoch": 0.8080634045485872, "grad_norm": 5.231932163238525, "learning_rate": 9.151840370196677e-08, "logits/chosen": -2.801729440689087, "logits/rejected": -2.7796835899353027, "logps/chosen": -86.05563354492188, "logps/rejected": -95.0568618774414, "loss": 0.6492, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3029250502586365, "rewards/margins": 0.10464175790548325, "rewards/rejected": -0.4075668454170227, "step": 4690 }, { "epoch": 0.8097863542384562, "grad_norm": 5.7257819175720215, "learning_rate": 9.146246377739695e-08, "logits/chosen": -2.7717370986938477, "logits/rejected": -2.763512134552002, "logps/chosen": -82.7789077758789, "logps/rejected": -96.3499526977539, "loss": 0.6536, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.31044572591781616, "rewards/margins": 0.09615585952997208, "rewards/rejected": -0.40660151839256287, "step": 4700 }, { "epoch": 0.8097863542384562, "eval_logits/chosen": -2.8567216396331787, "eval_logits/rejected": -2.8507139682769775, "eval_logps/chosen": -82.17868041992188, "eval_logps/rejected": -91.38946533203125, "eval_loss": 0.6732736825942993, "eval_rewards/accuracies": 0.6078066825866699, "eval_rewards/chosen": -0.2346678078174591, "eval_rewards/margins": 0.04742560535669327, "eval_rewards/rejected": -0.28209343552589417, "eval_runtime": 383.576, "eval_samples_per_second": 11.221, "eval_steps_per_second": 1.403, "step": 4700 }, { "epoch": 0.8115093039283253, "grad_norm": 5.155122756958008, "learning_rate": 9.140635719845486e-08, "logits/chosen": -2.8181588649749756, "logits/rejected": -2.78212308883667, "logps/chosen": -87.6160659790039, "logps/rejected": -92.0344467163086, "loss": 0.6535, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.29668793082237244, "rewards/margins": 0.09982738643884659, "rewards/rejected": -0.3965153098106384, "step": 4710 }, { "epoch": 0.8132322536181944, "grad_norm": 5.167230606079102, "learning_rate": 9.135008419065549e-08, "logits/chosen": -2.6935105323791504, "logits/rejected": -2.664382219314575, "logps/chosen": -85.91453552246094, "logps/rejected": -92.85016632080078, "loss": 0.6448, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.29112231731414795, "rewards/margins": 0.11304112523794174, "rewards/rejected": -0.4041634500026703, "step": 4720 }, { "epoch": 0.8149552033080634, "grad_norm": 5.660296440124512, "learning_rate": 9.129364498018274e-08, "logits/chosen": -2.68711519241333, "logits/rejected": -2.6629486083984375, "logps/chosen": -85.95222473144531, "logps/rejected": -89.82596588134766, "loss": 0.6677, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.32205989956855774, "rewards/margins": 0.06396792829036713, "rewards/rejected": -0.38602781295776367, "step": 4730 }, { "epoch": 0.8166781529979324, "grad_norm": 5.151979446411133, "learning_rate": 9.12370397938886e-08, "logits/chosen": -2.769345760345459, "logits/rejected": -2.7583069801330566, "logps/chosen": -83.08853149414062, "logps/rejected": -91.46418762207031, "loss": 0.6544, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2864697277545929, "rewards/margins": 0.08946539461612701, "rewards/rejected": -0.3759351372718811, "step": 4740 }, { "epoch": 0.8184011026878015, "grad_norm": 5.471572399139404, "learning_rate": 9.118026885929214e-08, "logits/chosen": -2.740246534347534, "logits/rejected": -2.7223544120788574, "logps/chosen": -82.95596313476562, "logps/rejected": -91.61463928222656, "loss": 0.6521, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.27563899755477905, "rewards/margins": 0.09457249939441681, "rewards/rejected": -0.37021148204803467, "step": 4750 }, { "epoch": 0.8201240523776706, "grad_norm": 5.900081157684326, "learning_rate": 9.112333240457866e-08, "logits/chosen": -2.737785577774048, "logits/rejected": -2.716553211212158, "logps/chosen": -83.81209564208984, "logps/rejected": -91.20219421386719, "loss": 0.6573, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.31340035796165466, "rewards/margins": 0.08304157108068466, "rewards/rejected": -0.39644187688827515, "step": 4760 }, { "epoch": 0.8218470020675396, "grad_norm": 5.405079364776611, "learning_rate": 9.106623065859873e-08, "logits/chosen": -2.800844192504883, "logits/rejected": -2.7802295684814453, "logps/chosen": -92.03160095214844, "logps/rejected": -96.30866241455078, "loss": 0.6578, "rewards/accuracies": 0.625, "rewards/chosen": -0.3244148790836334, "rewards/margins": 0.08667997270822525, "rewards/rejected": -0.4110948145389557, "step": 4770 }, { "epoch": 0.8235699517574087, "grad_norm": 6.228945255279541, "learning_rate": 9.100896385086731e-08, "logits/chosen": -2.6770527362823486, "logits/rejected": -2.658221960067749, "logps/chosen": -81.65718841552734, "logps/rejected": -95.7015380859375, "loss": 0.6419, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.3018708825111389, "rewards/margins": 0.11570735275745392, "rewards/rejected": -0.4175782799720764, "step": 4780 }, { "epoch": 0.8252929014472777, "grad_norm": 4.645414352416992, "learning_rate": 9.095153221156283e-08, "logits/chosen": -2.772768259048462, "logits/rejected": -2.7490804195404053, "logps/chosen": -91.31431579589844, "logps/rejected": -90.93443298339844, "loss": 0.6755, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.3352302312850952, "rewards/margins": 0.0462530255317688, "rewards/rejected": -0.381483256816864, "step": 4790 }, { "epoch": 0.8270158511371468, "grad_norm": 5.8339314460754395, "learning_rate": 9.089393597152619e-08, "logits/chosen": -2.700957775115967, "logits/rejected": -2.6856331825256348, "logps/chosen": -82.1672134399414, "logps/rejected": -88.03073120117188, "loss": 0.661, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.30171626806259155, "rewards/margins": 0.0752943903207779, "rewards/rejected": -0.37701067328453064, "step": 4800 }, { "epoch": 0.8270158511371468, "eval_logits/chosen": -2.8470327854156494, "eval_logits/rejected": -2.841031789779663, "eval_logps/chosen": -83.40621185302734, "eval_logps/rejected": -92.85016632080078, "eval_loss": 0.6723343729972839, "eval_rewards/accuracies": 0.6071096658706665, "eval_rewards/chosen": -0.2469431608915329, "eval_rewards/margins": 0.049757279455661774, "eval_rewards/rejected": -0.2967004179954529, "eval_runtime": 383.9646, "eval_samples_per_second": 11.209, "eval_steps_per_second": 1.401, "step": 4800 }, { "epoch": 0.8287388008270159, "grad_norm": 5.596678733825684, "learning_rate": 9.083617536225994e-08, "logits/chosen": -2.7114109992980957, "logits/rejected": -2.678105354309082, "logps/chosen": -88.07744598388672, "logps/rejected": -93.26781463623047, "loss": 0.6505, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.30861157178878784, "rewards/margins": 0.10032536834478378, "rewards/rejected": -0.40893691778182983, "step": 4810 }, { "epoch": 0.8304617505168849, "grad_norm": 5.137798309326172, "learning_rate": 9.077825061592729e-08, "logits/chosen": -2.7468247413635254, "logits/rejected": -2.7376739978790283, "logps/chosen": -82.47352600097656, "logps/rejected": -92.36431884765625, "loss": 0.6545, "rewards/accuracies": 0.6875, "rewards/chosen": -0.31316158175468445, "rewards/margins": 0.09230612218379974, "rewards/rejected": -0.405467689037323, "step": 4820 }, { "epoch": 0.832184700206754, "grad_norm": 4.7232279777526855, "learning_rate": 9.072016196535112e-08, "logits/chosen": -2.7591910362243652, "logits/rejected": -2.737152576446533, "logps/chosen": -86.1485595703125, "logps/rejected": -91.36146545410156, "loss": 0.6692, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3257165849208832, "rewards/margins": 0.06162101775407791, "rewards/rejected": -0.3873376250267029, "step": 4830 }, { "epoch": 0.833907649896623, "grad_norm": 5.128164768218994, "learning_rate": 9.066190964401321e-08, "logits/chosen": -2.7161264419555664, "logits/rejected": -2.6862852573394775, "logps/chosen": -88.64158630371094, "logps/rejected": -95.79473114013672, "loss": 0.647, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3247124254703522, "rewards/margins": 0.10910408198833466, "rewards/rejected": -0.433816522359848, "step": 4840 }, { "epoch": 0.8356305995864921, "grad_norm": 5.817652702331543, "learning_rate": 9.060349388605313e-08, "logits/chosen": -2.781848192214966, "logits/rejected": -2.764552593231201, "logps/chosen": -85.15306091308594, "logps/rejected": -94.81077575683594, "loss": 0.6505, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.3101899027824402, "rewards/margins": 0.1017962247133255, "rewards/rejected": -0.41198617219924927, "step": 4850 }, { "epoch": 0.8373535492763611, "grad_norm": 5.9934983253479, "learning_rate": 9.054491492626736e-08, "logits/chosen": -2.7893447875976562, "logits/rejected": -2.757189989089966, "logps/chosen": -91.19413757324219, "logps/rejected": -90.49979400634766, "loss": 0.6637, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3273699879646301, "rewards/margins": 0.07186584919691086, "rewards/rejected": -0.39923587441444397, "step": 4860 }, { "epoch": 0.8390764989662302, "grad_norm": 6.126730918884277, "learning_rate": 9.048617300010839e-08, "logits/chosen": -2.8017003536224365, "logits/rejected": -2.7763314247131348, "logps/chosen": -90.9378433227539, "logps/rejected": -94.17574310302734, "loss": 0.6547, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.33468252420425415, "rewards/margins": 0.09215408563613892, "rewards/rejected": -0.42683663964271545, "step": 4870 }, { "epoch": 0.8407994486560992, "grad_norm": 5.502596378326416, "learning_rate": 9.042726834368372e-08, "logits/chosen": -2.6938109397888184, "logits/rejected": -2.6608924865722656, "logps/chosen": -84.77705383300781, "logps/rejected": -90.72183227539062, "loss": 0.6572, "rewards/accuracies": 0.65625, "rewards/chosen": -0.31596246361732483, "rewards/margins": 0.08504683524370193, "rewards/rejected": -0.4010092616081238, "step": 4880 }, { "epoch": 0.8425223983459683, "grad_norm": 5.71688985824585, "learning_rate": 9.036820119375494e-08, "logits/chosen": -2.7481417655944824, "logits/rejected": -2.7266898155212402, "logps/chosen": -88.00614166259766, "logps/rejected": -96.5954818725586, "loss": 0.6479, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.3258554935455322, "rewards/margins": 0.10476219654083252, "rewards/rejected": -0.43061771988868713, "step": 4890 }, { "epoch": 0.8442453480358374, "grad_norm": 5.429107666015625, "learning_rate": 9.030897178773676e-08, "logits/chosen": -2.7230172157287598, "logits/rejected": -2.6916260719299316, "logps/chosen": -87.25579833984375, "logps/rejected": -92.33116149902344, "loss": 0.6655, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3405340313911438, "rewards/margins": 0.0803399458527565, "rewards/rejected": -0.4208739697933197, "step": 4900 }, { "epoch": 0.8442453480358374, "eval_logits/chosen": -2.8362016677856445, "eval_logits/rejected": -2.830209970474243, "eval_logps/chosen": -84.934814453125, "eval_logps/rejected": -94.62076568603516, "eval_loss": 0.6714351177215576, "eval_rewards/accuracies": 0.6059479713439941, "eval_rewards/chosen": -0.26222923398017883, "eval_rewards/margins": 0.05217716842889786, "eval_rewards/rejected": -0.3144063949584961, "eval_runtime": 384.114, "eval_samples_per_second": 11.205, "eval_steps_per_second": 1.401, "step": 4900 }, { "epoch": 0.8459682977257064, "grad_norm": 6.0869855880737305, "learning_rate": 9.024958036369604e-08, "logits/chosen": -2.8492271900177, "logits/rejected": -2.8188436031341553, "logps/chosen": -89.55564880371094, "logps/rejected": -94.36383056640625, "loss": 0.6572, "rewards/accuracies": 0.65625, "rewards/chosen": -0.33216413855552673, "rewards/margins": 0.08713052421808243, "rewards/rejected": -0.41929468512535095, "step": 4910 }, { "epoch": 0.8476912474155754, "grad_norm": 5.863268852233887, "learning_rate": 9.019002716035091e-08, "logits/chosen": -2.6868367195129395, "logits/rejected": -2.669393301010132, "logps/chosen": -86.78264617919922, "logps/rejected": -98.21537780761719, "loss": 0.6434, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.32942792773246765, "rewards/margins": 0.11807069927453995, "rewards/rejected": -0.4474986493587494, "step": 4920 }, { "epoch": 0.8494141971054445, "grad_norm": 5.658226490020752, "learning_rate": 9.013031241706971e-08, "logits/chosen": -2.857323408126831, "logits/rejected": -2.8451614379882812, "logps/chosen": -88.69981384277344, "logps/rejected": -102.26609802246094, "loss": 0.6579, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3555667996406555, "rewards/margins": 0.0870615690946579, "rewards/rejected": -0.4426283836364746, "step": 4930 }, { "epoch": 0.8511371467953136, "grad_norm": 5.625889301300049, "learning_rate": 9.007043637387009e-08, "logits/chosen": -2.790069341659546, "logits/rejected": -2.7611613273620605, "logps/chosen": -90.17796325683594, "logps/rejected": -97.74939727783203, "loss": 0.6483, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.34256118535995483, "rewards/margins": 0.10266473144292831, "rewards/rejected": -0.44522589445114136, "step": 4940 }, { "epoch": 0.8528600964851827, "grad_norm": 6.016177177429199, "learning_rate": 9.001039927141802e-08, "logits/chosen": -2.5950121879577637, "logits/rejected": -2.5807113647460938, "logps/chosen": -85.82875061035156, "logps/rejected": -94.74800109863281, "loss": 0.6522, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.31001538038253784, "rewards/margins": 0.09628131985664368, "rewards/rejected": -0.4062967300415039, "step": 4950 }, { "epoch": 0.8545830461750517, "grad_norm": 5.528600692749023, "learning_rate": 8.995020135102685e-08, "logits/chosen": -2.713174343109131, "logits/rejected": -2.7130637168884277, "logps/chosen": -84.0205307006836, "logps/rejected": -98.61932373046875, "loss": 0.648, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.31766754388809204, "rewards/margins": 0.10433481633663177, "rewards/rejected": -0.422002375125885, "step": 4960 }, { "epoch": 0.8563059958649207, "grad_norm": 6.078629970550537, "learning_rate": 8.988984285465631e-08, "logits/chosen": -2.688173532485962, "logits/rejected": -2.6774582862854004, "logps/chosen": -84.82339477539062, "logps/rejected": -96.73113250732422, "loss": 0.6456, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.3344992995262146, "rewards/margins": 0.11425448954105377, "rewards/rejected": -0.4487537741661072, "step": 4970 }, { "epoch": 0.8580289455547898, "grad_norm": 6.011823654174805, "learning_rate": 8.982932402491154e-08, "logits/chosen": -2.754277467727661, "logits/rejected": -2.746687650680542, "logps/chosen": -85.29658508300781, "logps/rejected": -96.35360717773438, "loss": 0.656, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3243906795978546, "rewards/margins": 0.09029241651296616, "rewards/rejected": -0.41468310356140137, "step": 4980 }, { "epoch": 0.8597518952446589, "grad_norm": 5.960501194000244, "learning_rate": 8.976864510504217e-08, "logits/chosen": -2.670903444290161, "logits/rejected": -2.662590503692627, "logps/chosen": -85.5743179321289, "logps/rejected": -97.1070785522461, "loss": 0.6587, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.337899386882782, "rewards/margins": 0.08430057764053345, "rewards/rejected": -0.42219996452331543, "step": 4990 }, { "epoch": 0.8614748449345279, "grad_norm": 6.323968887329102, "learning_rate": 8.970780633894122e-08, "logits/chosen": -2.695253372192383, "logits/rejected": -2.675020217895508, "logps/chosen": -88.87923431396484, "logps/rejected": -96.24919891357422, "loss": 0.65, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3307526707649231, "rewards/margins": 0.10468538850545883, "rewards/rejected": -0.4354380667209625, "step": 5000 }, { "epoch": 0.8614748449345279, "eval_logits/chosen": -2.817227602005005, "eval_logits/rejected": -2.8112363815307617, "eval_logps/chosen": -86.00804138183594, "eval_logps/rejected": -95.91358947753906, "eval_loss": 0.6706132292747498, "eval_rewards/accuracies": 0.595724880695343, "eval_rewards/chosen": -0.27296143770217896, "eval_rewards/margins": 0.054373256862163544, "eval_rewards/rejected": -0.3273346424102783, "eval_runtime": 383.7229, "eval_samples_per_second": 11.216, "eval_steps_per_second": 1.402, "step": 5000 }, { "epoch": 0.8631977946243969, "grad_norm": 7.232612133026123, "learning_rate": 8.964680797114426e-08, "logits/chosen": -2.692059278488159, "logits/rejected": -2.665595531463623, "logps/chosen": -90.13217163085938, "logps/rejected": -97.33879089355469, "loss": 0.655, "rewards/accuracies": 0.625, "rewards/chosen": -0.37081378698349, "rewards/margins": 0.09143435209989548, "rewards/rejected": -0.46224817633628845, "step": 5010 }, { "epoch": 0.864920744314266, "grad_norm": 6.6061248779296875, "learning_rate": 8.958565024682836e-08, "logits/chosen": -2.686988115310669, "logits/rejected": -2.6622235774993896, "logps/chosen": -87.38780975341797, "logps/rejected": -95.67384338378906, "loss": 0.6543, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3388838768005371, "rewards/margins": 0.09789013117551804, "rewards/rejected": -0.43677401542663574, "step": 5020 }, { "epoch": 0.8666436940041351, "grad_norm": 7.315520763397217, "learning_rate": 8.952433341181107e-08, "logits/chosen": -2.6427371501922607, "logits/rejected": -2.6327102184295654, "logps/chosen": -87.11759948730469, "logps/rejected": -97.06698608398438, "loss": 0.6585, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3450040817260742, "rewards/margins": 0.08607922494411469, "rewards/rejected": -0.4310832917690277, "step": 5030 }, { "epoch": 0.8683666436940042, "grad_norm": 6.265617370605469, "learning_rate": 8.946285771254948e-08, "logits/chosen": -2.85229754447937, "logits/rejected": -2.813417911529541, "logps/chosen": -93.10102844238281, "logps/rejected": -94.0805435180664, "loss": 0.6626, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3594356179237366, "rewards/margins": 0.0769016221165657, "rewards/rejected": -0.4363372325897217, "step": 5040 }, { "epoch": 0.8700895933838731, "grad_norm": 6.805141448974609, "learning_rate": 8.940122339613927e-08, "logits/chosen": -2.733008623123169, "logits/rejected": -2.7098228931427, "logps/chosen": -90.53655242919922, "logps/rejected": -99.46127319335938, "loss": 0.6529, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3493223190307617, "rewards/margins": 0.09822383522987366, "rewards/rejected": -0.4475461542606354, "step": 5050 }, { "epoch": 0.8718125430737422, "grad_norm": 5.649402618408203, "learning_rate": 8.933943071031359e-08, "logits/chosen": -2.5962395668029785, "logits/rejected": -2.590247869491577, "logps/chosen": -86.31114196777344, "logps/rejected": -95.39735412597656, "loss": 0.6628, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.35839253664016724, "rewards/margins": 0.07522771507501602, "rewards/rejected": -0.4336202144622803, "step": 5060 }, { "epoch": 0.8735354927636113, "grad_norm": 6.9938154220581055, "learning_rate": 8.92774799034422e-08, "logits/chosen": -2.696258544921875, "logits/rejected": -2.6725025177001953, "logps/chosen": -88.53520202636719, "logps/rejected": -94.08182525634766, "loss": 0.6516, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3578687608242035, "rewards/margins": 0.1016777753829956, "rewards/rejected": -0.4595465660095215, "step": 5070 }, { "epoch": 0.8752584424534804, "grad_norm": 5.28748893737793, "learning_rate": 8.921537122453037e-08, "logits/chosen": -2.88200044631958, "logits/rejected": -2.8455650806427, "logps/chosen": -90.97763061523438, "logps/rejected": -95.62336730957031, "loss": 0.6457, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3347325026988983, "rewards/margins": 0.11258021742105484, "rewards/rejected": -0.44731274247169495, "step": 5080 }, { "epoch": 0.8769813921433495, "grad_norm": 6.321465492248535, "learning_rate": 8.915310492321799e-08, "logits/chosen": -2.7534966468811035, "logits/rejected": -2.72301983833313, "logps/chosen": -89.15115356445312, "logps/rejected": -99.37260437011719, "loss": 0.6381, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.34860438108444214, "rewards/margins": 0.13011741638183594, "rewards/rejected": -0.4787217974662781, "step": 5090 }, { "epoch": 0.8787043418332184, "grad_norm": 8.228758811950684, "learning_rate": 8.909068124977839e-08, "logits/chosen": -2.5948662757873535, "logits/rejected": -2.5615808963775635, "logps/chosen": -92.40115356445312, "logps/rejected": -97.37530517578125, "loss": 0.6625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.37334275245666504, "rewards/margins": 0.0771847814321518, "rewards/rejected": -0.45052748918533325, "step": 5100 }, { "epoch": 0.8787043418332184, "eval_logits/chosen": -2.8071084022521973, "eval_logits/rejected": -2.8011698722839355, "eval_logps/chosen": -87.64527893066406, "eval_logps/rejected": -97.8499755859375, "eval_loss": 0.6695210337638855, "eval_rewards/accuracies": 0.5996747016906738, "eval_rewards/chosen": -0.28933385014533997, "eval_rewards/margins": 0.057364702224731445, "eval_rewards/rejected": -0.346698522567749, "eval_runtime": 384.3816, "eval_samples_per_second": 11.197, "eval_steps_per_second": 1.4, "step": 5100 }, { "epoch": 0.8804272915230875, "grad_norm": 7.149771690368652, "learning_rate": 8.902810045511753e-08, "logits/chosen": -2.729835271835327, "logits/rejected": -2.695655584335327, "logps/chosen": -96.39937591552734, "logps/rejected": -100.22602844238281, "loss": 0.6651, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.38526520133018494, "rewards/margins": 0.07859955728054047, "rewards/rejected": -0.4638647437095642, "step": 5110 }, { "epoch": 0.8821502412129566, "grad_norm": 6.199367523193359, "learning_rate": 8.896536279077287e-08, "logits/chosen": -2.7863945960998535, "logits/rejected": -2.757514238357544, "logps/chosen": -92.07826232910156, "logps/rejected": -94.53547668457031, "loss": 0.6693, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3820487856864929, "rewards/margins": 0.06044241040945053, "rewards/rejected": -0.4424911439418793, "step": 5120 }, { "epoch": 0.8838731909028257, "grad_norm": 6.102499008178711, "learning_rate": 8.89024685089124e-08, "logits/chosen": -2.8017592430114746, "logits/rejected": -2.770960569381714, "logps/chosen": -90.97740936279297, "logps/rejected": -96.40296173095703, "loss": 0.6565, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.374112993478775, "rewards/margins": 0.09014596045017242, "rewards/rejected": -0.46425899863243103, "step": 5130 }, { "epoch": 0.8855961405926946, "grad_norm": 5.7928385734558105, "learning_rate": 8.883941786233363e-08, "logits/chosen": -2.721494197845459, "logits/rejected": -2.6873159408569336, "logps/chosen": -93.21794128417969, "logps/rejected": -97.0384750366211, "loss": 0.6573, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3539964556694031, "rewards/margins": 0.08667747676372528, "rewards/rejected": -0.44067391753196716, "step": 5140 }, { "epoch": 0.8873190902825637, "grad_norm": 6.3323163986206055, "learning_rate": 8.877621110446253e-08, "logits/chosen": -2.755678176879883, "logits/rejected": -2.7467668056488037, "logps/chosen": -87.09465026855469, "logps/rejected": -99.27693939208984, "loss": 0.6488, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.33964866399765015, "rewards/margins": 0.10438643395900726, "rewards/rejected": -0.444035142660141, "step": 5150 }, { "epoch": 0.8890420399724328, "grad_norm": 7.0943474769592285, "learning_rate": 8.871284848935256e-08, "logits/chosen": -2.730428695678711, "logits/rejected": -2.696103811264038, "logps/chosen": -86.82064819335938, "logps/rejected": -93.75505065917969, "loss": 0.6506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.35403814911842346, "rewards/margins": 0.1033908948302269, "rewards/rejected": -0.45742902159690857, "step": 5160 }, { "epoch": 0.8907649896623019, "grad_norm": 6.670124530792236, "learning_rate": 8.864933027168367e-08, "logits/chosen": -2.7170047760009766, "logits/rejected": -2.6938488483428955, "logps/chosen": -87.88712310791016, "logps/rejected": -98.9630355834961, "loss": 0.6324, "rewards/accuracies": 0.6875, "rewards/chosen": -0.32590562105178833, "rewards/margins": 0.14525170624256134, "rewards/rejected": -0.4711572527885437, "step": 5170 }, { "epoch": 0.892487939352171, "grad_norm": 7.565772533416748, "learning_rate": 8.858565670676117e-08, "logits/chosen": -2.80495548248291, "logits/rejected": -2.795027017593384, "logps/chosen": -95.3014907836914, "logps/rejected": -102.08565521240234, "loss": 0.6639, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.39898473024368286, "rewards/margins": 0.07678544521331787, "rewards/rejected": -0.47577017545700073, "step": 5180 }, { "epoch": 0.8942108890420399, "grad_norm": 7.428860664367676, "learning_rate": 8.852182805051485e-08, "logits/chosen": -2.6876919269561768, "logits/rejected": -2.676413059234619, "logps/chosen": -90.71916198730469, "logps/rejected": -97.53370666503906, "loss": 0.6726, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.37878960371017456, "rewards/margins": 0.057644765824079514, "rewards/rejected": -0.4364343583583832, "step": 5190 }, { "epoch": 0.895933838731909, "grad_norm": 5.683623313903809, "learning_rate": 8.845784455949778e-08, "logits/chosen": -2.7533655166625977, "logits/rejected": -2.7321879863739014, "logps/chosen": -88.15218353271484, "logps/rejected": -100.27111053466797, "loss": 0.6509, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.36073580384254456, "rewards/margins": 0.10134414583444595, "rewards/rejected": -0.4620800018310547, "step": 5200 }, { "epoch": 0.895933838731909, "eval_logits/chosen": -2.7990663051605225, "eval_logits/rejected": -2.7930634021759033, "eval_logps/chosen": -87.94992065429688, "eval_logps/rejected": -98.30118560791016, "eval_loss": 0.6689584851264954, "eval_rewards/accuracies": 0.5985130071640015, "eval_rewards/chosen": -0.29238027334213257, "eval_rewards/margins": 0.058830372989177704, "eval_rewards/rejected": -0.35121065378189087, "eval_runtime": 384.1767, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 5200 }, { "epoch": 0.8976567884217781, "grad_norm": 6.255385398864746, "learning_rate": 8.839370649088546e-08, "logits/chosen": -2.709812641143799, "logits/rejected": -2.694866180419922, "logps/chosen": -87.79924011230469, "logps/rejected": -94.70648956298828, "loss": 0.6597, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.36159998178482056, "rewards/margins": 0.08545978367328644, "rewards/rejected": -0.4470597207546234, "step": 5210 }, { "epoch": 0.8993797381116472, "grad_norm": 5.670936584472656, "learning_rate": 8.83294141024747e-08, "logits/chosen": -2.7418923377990723, "logits/rejected": -2.7291693687438965, "logps/chosen": -86.82774353027344, "logps/rejected": -96.816650390625, "loss": 0.6684, "rewards/accuracies": 0.625, "rewards/chosen": -0.3421666622161865, "rewards/margins": 0.06618931889533997, "rewards/rejected": -0.4083560109138489, "step": 5220 }, { "epoch": 0.9011026878015161, "grad_norm": 6.936209201812744, "learning_rate": 8.826496765268248e-08, "logits/chosen": -2.6761391162872314, "logits/rejected": -2.6653850078582764, "logps/chosen": -91.12430572509766, "logps/rejected": -105.36338806152344, "loss": 0.6376, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3586147427558899, "rewards/margins": 0.1338968575000763, "rewards/rejected": -0.4925116002559662, "step": 5230 }, { "epoch": 0.9028256374913852, "grad_norm": 6.240992069244385, "learning_rate": 8.820036740054516e-08, "logits/chosen": -2.6710047721862793, "logits/rejected": -2.649723529815674, "logps/chosen": -89.43163299560547, "logps/rejected": -106.2796401977539, "loss": 0.6238, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.36658650636672974, "rewards/margins": 0.16273388266563416, "rewards/rejected": -0.5293203592300415, "step": 5240 }, { "epoch": 0.9045485871812543, "grad_norm": 5.806526184082031, "learning_rate": 8.813561360571715e-08, "logits/chosen": -2.6251959800720215, "logits/rejected": -2.611636161804199, "logps/chosen": -88.37168884277344, "logps/rejected": -97.55949401855469, "loss": 0.6568, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3739396035671234, "rewards/margins": 0.09285987168550491, "rewards/rejected": -0.4667994976043701, "step": 5250 }, { "epoch": 0.9062715368711234, "grad_norm": 8.46798038482666, "learning_rate": 8.807070652847014e-08, "logits/chosen": -2.712798595428467, "logits/rejected": -2.6734073162078857, "logps/chosen": -92.40482330322266, "logps/rejected": -97.30726623535156, "loss": 0.6517, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.35619446635246277, "rewards/margins": 0.10177119076251984, "rewards/rejected": -0.4579657018184662, "step": 5260 }, { "epoch": 0.9079944865609925, "grad_norm": 6.423983573913574, "learning_rate": 8.800564642969182e-08, "logits/chosen": -2.806490182876587, "logits/rejected": -2.7925844192504883, "logps/chosen": -89.00984191894531, "logps/rejected": -99.77445983886719, "loss": 0.6463, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3745562434196472, "rewards/margins": 0.11172924935817719, "rewards/rejected": -0.486285537481308, "step": 5270 }, { "epoch": 0.9097174362508614, "grad_norm": 6.772559642791748, "learning_rate": 8.794043357088501e-08, "logits/chosen": -2.7092785835266113, "logits/rejected": -2.680488348007202, "logps/chosen": -94.13202667236328, "logps/rejected": -99.32430267333984, "loss": 0.6577, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3976258635520935, "rewards/margins": 0.088914655148983, "rewards/rejected": -0.4865404963493347, "step": 5280 }, { "epoch": 0.9114403859407305, "grad_norm": 6.844779968261719, "learning_rate": 8.787506821416648e-08, "logits/chosen": -2.656005382537842, "logits/rejected": -2.6143336296081543, "logps/chosen": -92.70957946777344, "logps/rejected": -98.14231872558594, "loss": 0.6561, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3685385584831238, "rewards/margins": 0.09249154478311539, "rewards/rejected": -0.4610300660133362, "step": 5290 }, { "epoch": 0.9131633356305996, "grad_norm": 6.572747707366943, "learning_rate": 8.780955062226598e-08, "logits/chosen": -2.663505792617798, "logits/rejected": -2.650599479675293, "logps/chosen": -90.8861083984375, "logps/rejected": -102.00794219970703, "loss": 0.6469, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3578792214393616, "rewards/margins": 0.11072443425655365, "rewards/rejected": -0.4686037003993988, "step": 5300 }, { "epoch": 0.9131633356305996, "eval_logits/chosen": -2.788172960281372, "eval_logits/rejected": -2.78218936920166, "eval_logps/chosen": -88.50018310546875, "eval_logps/rejected": -98.94988250732422, "eval_loss": 0.6686086058616638, "eval_rewards/accuracies": 0.597815990447998, "eval_rewards/chosen": -0.2978828549385071, "eval_rewards/margins": 0.05981473624706268, "eval_rewards/rejected": -0.35769757628440857, "eval_runtime": 384.0408, "eval_samples_per_second": 11.207, "eval_steps_per_second": 1.401, "step": 5300 }, { "epoch": 0.9148862853204687, "grad_norm": 5.892764091491699, "learning_rate": 8.774388105852517e-08, "logits/chosen": -2.785391330718994, "logits/rejected": -2.774961233139038, "logps/chosen": -97.1575927734375, "logps/rejected": -99.34544372558594, "loss": 0.6634, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3902384638786316, "rewards/margins": 0.07924894243478775, "rewards/rejected": -0.46948742866516113, "step": 5310 }, { "epoch": 0.9166092350103378, "grad_norm": 6.925797939300537, "learning_rate": 8.767805978689651e-08, "logits/chosen": -2.7012622356414795, "logits/rejected": -2.650627851486206, "logps/chosen": -95.94258880615234, "logps/rejected": -99.46453094482422, "loss": 0.6473, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3838873505592346, "rewards/margins": 0.10817118734121323, "rewards/rejected": -0.49205857515335083, "step": 5320 }, { "epoch": 0.9183321847002067, "grad_norm": 5.307535648345947, "learning_rate": 8.761208707194223e-08, "logits/chosen": -2.6463091373443604, "logits/rejected": -2.6482155323028564, "logps/chosen": -89.02336120605469, "logps/rejected": -102.9623031616211, "loss": 0.6479, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.38169655203819275, "rewards/margins": 0.11367203295230865, "rewards/rejected": -0.495368629693985, "step": 5330 }, { "epoch": 0.9200551343900758, "grad_norm": 6.876779079437256, "learning_rate": 8.754596317883332e-08, "logits/chosen": -2.6829965114593506, "logits/rejected": -2.634899616241455, "logps/chosen": -94.00877380371094, "logps/rejected": -98.99806213378906, "loss": 0.6517, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3764682710170746, "rewards/margins": 0.09976208955049515, "rewards/rejected": -0.4762304425239563, "step": 5340 }, { "epoch": 0.9217780840799449, "grad_norm": 6.224486351013184, "learning_rate": 8.747968837334837e-08, "logits/chosen": -2.6352744102478027, "logits/rejected": -2.6014914512634277, "logps/chosen": -91.38629913330078, "logps/rejected": -100.00788116455078, "loss": 0.6554, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.38743850588798523, "rewards/margins": 0.09292630851268768, "rewards/rejected": -0.4803648591041565, "step": 5350 }, { "epoch": 0.923501033769814, "grad_norm": 7.481403350830078, "learning_rate": 8.741326292187257e-08, "logits/chosen": -2.743380069732666, "logits/rejected": -2.735330104827881, "logps/chosen": -89.44071197509766, "logps/rejected": -106.06976318359375, "loss": 0.6381, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.362265944480896, "rewards/margins": 0.13361617922782898, "rewards/rejected": -0.495882123708725, "step": 5360 }, { "epoch": 0.9252239834596829, "grad_norm": 7.762505531311035, "learning_rate": 8.734668709139663e-08, "logits/chosen": -2.6523056030273438, "logits/rejected": -2.636333465576172, "logps/chosen": -88.76700592041016, "logps/rejected": -97.87422943115234, "loss": 0.6652, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3684227764606476, "rewards/margins": 0.07627833634614944, "rewards/rejected": -0.44470110535621643, "step": 5370 }, { "epoch": 0.926946933149552, "grad_norm": 6.765832424163818, "learning_rate": 8.727996114951566e-08, "logits/chosen": -2.7545881271362305, "logits/rejected": -2.7174041271209717, "logps/chosen": -94.75218200683594, "logps/rejected": -100.50247955322266, "loss": 0.6455, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.373540461063385, "rewards/margins": 0.1151813417673111, "rewards/rejected": -0.4887217879295349, "step": 5380 }, { "epoch": 0.9286698828394211, "grad_norm": 6.449654579162598, "learning_rate": 8.721308536442814e-08, "logits/chosen": -2.6725001335144043, "logits/rejected": -2.6334259510040283, "logps/chosen": -99.27436828613281, "logps/rejected": -99.46378326416016, "loss": 0.6661, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4070706367492676, "rewards/margins": 0.07143436372280121, "rewards/rejected": -0.47850504517555237, "step": 5390 }, { "epoch": 0.9303928325292902, "grad_norm": 7.270046710968018, "learning_rate": 8.714606000493482e-08, "logits/chosen": -2.6885461807250977, "logits/rejected": -2.671773672103882, "logps/chosen": -89.15217590332031, "logps/rejected": -102.94400787353516, "loss": 0.6482, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3804583251476288, "rewards/margins": 0.11006374657154083, "rewards/rejected": -0.4905220568180084, "step": 5400 }, { "epoch": 0.9303928325292902, "eval_logits/chosen": -2.779904842376709, "eval_logits/rejected": -2.7739040851593018, "eval_logps/chosen": -88.95071411132812, "eval_logps/rejected": -99.54953002929688, "eval_loss": 0.6680476665496826, "eval_rewards/accuracies": 0.6038568615913391, "eval_rewards/chosen": -0.3023882508277893, "eval_rewards/margins": 0.061305828392505646, "eval_rewards/rejected": -0.36369404196739197, "eval_runtime": 384.0762, "eval_samples_per_second": 11.206, "eval_steps_per_second": 1.401, "step": 5400 }, { "epoch": 0.9321157822191593, "grad_norm": 7.766871452331543, "learning_rate": 8.707888534043772e-08, "logits/chosen": -2.6998820304870605, "logits/rejected": -2.688086986541748, "logps/chosen": -96.61395263671875, "logps/rejected": -102.01858520507812, "loss": 0.6618, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3997456729412079, "rewards/margins": 0.07956382632255554, "rewards/rejected": -0.4793094992637634, "step": 5410 }, { "epoch": 0.9338387319090282, "grad_norm": 7.364156246185303, "learning_rate": 8.701156164093888e-08, "logits/chosen": -2.7553787231445312, "logits/rejected": -2.735417127609253, "logps/chosen": -92.14382934570312, "logps/rejected": -101.41740417480469, "loss": 0.6565, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.38425585627555847, "rewards/margins": 0.09239666908979416, "rewards/rejected": -0.4766525328159332, "step": 5420 }, { "epoch": 0.9355616815988973, "grad_norm": 8.202524185180664, "learning_rate": 8.694408917703942e-08, "logits/chosen": -2.690218448638916, "logits/rejected": -2.6787049770355225, "logps/chosen": -94.65299987792969, "logps/rejected": -102.63578033447266, "loss": 0.6543, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4105490744113922, "rewards/margins": 0.09830156713724136, "rewards/rejected": -0.508850634098053, "step": 5430 }, { "epoch": 0.9372846312887664, "grad_norm": 6.53749418258667, "learning_rate": 8.68764682199384e-08, "logits/chosen": -2.6908693313598633, "logits/rejected": -2.67452073097229, "logps/chosen": -95.178466796875, "logps/rejected": -107.22066497802734, "loss": 0.6387, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3897704780101776, "rewards/margins": 0.12999257445335388, "rewards/rejected": -0.5197630524635315, "step": 5440 }, { "epoch": 0.9390075809786355, "grad_norm": 8.43091106414795, "learning_rate": 8.680869904143172e-08, "logits/chosen": -2.6179678440093994, "logits/rejected": -2.607678174972534, "logps/chosen": -96.59883117675781, "logps/rejected": -104.86076354980469, "loss": 0.6518, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.41019707918167114, "rewards/margins": 0.10363531112670898, "rewards/rejected": -0.5138323903083801, "step": 5450 }, { "epoch": 0.9407305306685044, "grad_norm": 7.467963695526123, "learning_rate": 8.674078191391108e-08, "logits/chosen": -2.6984951496124268, "logits/rejected": -2.683515787124634, "logps/chosen": -93.39624786376953, "logps/rejected": -99.15972900390625, "loss": 0.664, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.39552855491638184, "rewards/margins": 0.07595936208963394, "rewards/rejected": -0.4714879095554352, "step": 5460 }, { "epoch": 0.9424534803583735, "grad_norm": 7.247677326202393, "learning_rate": 8.66727171103628e-08, "logits/chosen": -2.6561055183410645, "logits/rejected": -2.637718915939331, "logps/chosen": -92.51533508300781, "logps/rejected": -98.60027313232422, "loss": 0.6711, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.41156691312789917, "rewards/margins": 0.0695917159318924, "rewards/rejected": -0.48115864396095276, "step": 5470 }, { "epoch": 0.9441764300482426, "grad_norm": 6.071343421936035, "learning_rate": 8.66045049043668e-08, "logits/chosen": -2.6847033500671387, "logits/rejected": -2.661642551422119, "logps/chosen": -92.6823501586914, "logps/rejected": -104.17033386230469, "loss": 0.641, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.39739149808883667, "rewards/margins": 0.1278773844242096, "rewards/rejected": -0.5252689123153687, "step": 5480 }, { "epoch": 0.9458993797381117, "grad_norm": 7.692708492279053, "learning_rate": 8.653614557009546e-08, "logits/chosen": -2.654143810272217, "logits/rejected": -2.627291440963745, "logps/chosen": -95.40496063232422, "logps/rejected": -101.32283020019531, "loss": 0.6577, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.398398220539093, "rewards/margins": 0.09270858019590378, "rewards/rejected": -0.4911068081855774, "step": 5490 }, { "epoch": 0.9476223294279807, "grad_norm": 8.546768188476562, "learning_rate": 8.646763938231252e-08, "logits/chosen": -2.7309505939483643, "logits/rejected": -2.714759349822998, "logps/chosen": -92.15448760986328, "logps/rejected": -106.44493103027344, "loss": 0.639, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.383436381816864, "rewards/margins": 0.13435621559619904, "rewards/rejected": -0.5177925825119019, "step": 5500 }, { "epoch": 0.9476223294279807, "eval_logits/chosen": -2.7674951553344727, "eval_logits/rejected": -2.7614736557006836, "eval_logps/chosen": -90.17369842529297, "eval_logps/rejected": -100.9876708984375, "eval_loss": 0.6673266291618347, "eval_rewards/accuracies": 0.6066449880599976, "eval_rewards/chosen": -0.31461793184280396, "eval_rewards/margins": 0.06345757842063904, "eval_rewards/rejected": -0.37807556986808777, "eval_runtime": 383.7672, "eval_samples_per_second": 11.215, "eval_steps_per_second": 1.402, "step": 5500 }, { "epoch": 0.9493452791178497, "grad_norm": 7.1926045417785645, "learning_rate": 8.6398986616372e-08, "logits/chosen": -2.6446940898895264, "logits/rejected": -2.629819393157959, "logps/chosen": -95.67147827148438, "logps/rejected": -101.98039245605469, "loss": 0.6676, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4244406819343567, "rewards/margins": 0.06804448366165161, "rewards/rejected": -0.4924851357936859, "step": 5510 }, { "epoch": 0.9510682288077188, "grad_norm": 7.491471290588379, "learning_rate": 8.633018754821704e-08, "logits/chosen": -2.6384119987487793, "logits/rejected": -2.614940643310547, "logps/chosen": -93.56713104248047, "logps/rejected": -101.3330307006836, "loss": 0.6434, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.37742677330970764, "rewards/margins": 0.12046167999505997, "rewards/rejected": -0.4978884756565094, "step": 5520 }, { "epoch": 0.9527911784975879, "grad_norm": 7.954461574554443, "learning_rate": 8.62612424543789e-08, "logits/chosen": -2.6789708137512207, "logits/rejected": -2.646639585494995, "logps/chosen": -100.33576965332031, "logps/rejected": -102.86430358886719, "loss": 0.6565, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4042905867099762, "rewards/margins": 0.09361861646175385, "rewards/rejected": -0.49790921807289124, "step": 5530 }, { "epoch": 0.954514128187457, "grad_norm": 7.99074125289917, "learning_rate": 8.61921516119757e-08, "logits/chosen": -2.6626570224761963, "logits/rejected": -2.6674015522003174, "logps/chosen": -91.5600814819336, "logps/rejected": -108.2236328125, "loss": 0.6427, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.39212900400161743, "rewards/margins": 0.12309417873620987, "rewards/rejected": -0.5152231454849243, "step": 5540 }, { "epoch": 0.956237077877326, "grad_norm": 6.5582075119018555, "learning_rate": 8.612291529871146e-08, "logits/chosen": -2.5428881645202637, "logits/rejected": -2.5276150703430176, "logps/chosen": -94.1728286743164, "logps/rejected": -102.33039855957031, "loss": 0.6694, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4155499041080475, "rewards/margins": 0.0669882521033287, "rewards/rejected": -0.4825381636619568, "step": 5550 }, { "epoch": 0.957960027567195, "grad_norm": 7.766365051269531, "learning_rate": 8.605353379287478e-08, "logits/chosen": -2.6667234897613525, "logits/rejected": -2.641800880432129, "logps/chosen": -90.39002227783203, "logps/rejected": -97.9429702758789, "loss": 0.654, "rewards/accuracies": 0.625, "rewards/chosen": -0.38503286242485046, "rewards/margins": 0.09949497878551483, "rewards/rejected": -0.4845278263092041, "step": 5560 }, { "epoch": 0.9596829772570641, "grad_norm": 7.693308353424072, "learning_rate": 8.5984007373338e-08, "logits/chosen": -2.699014186859131, "logits/rejected": -2.6899383068084717, "logps/chosen": -88.60310363769531, "logps/rejected": -106.08793640136719, "loss": 0.6488, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3793658912181854, "rewards/margins": 0.11214636266231537, "rewards/rejected": -0.4915122389793396, "step": 5570 }, { "epoch": 0.9614059269469332, "grad_norm": 5.970696449279785, "learning_rate": 8.591433631955582e-08, "logits/chosen": -2.580005645751953, "logits/rejected": -2.5667502880096436, "logps/chosen": -94.14044189453125, "logps/rejected": -105.60859680175781, "loss": 0.6488, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.4096287190914154, "rewards/margins": 0.1085970550775528, "rewards/rejected": -0.5182257890701294, "step": 5580 }, { "epoch": 0.9631288766368022, "grad_norm": 9.693364143371582, "learning_rate": 8.584452091156432e-08, "logits/chosen": -2.7441248893737793, "logits/rejected": -2.703463315963745, "logps/chosen": -97.33702087402344, "logps/rejected": -105.43988037109375, "loss": 0.6573, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4050588011741638, "rewards/margins": 0.09781823307275772, "rewards/rejected": -0.5028769969940186, "step": 5590 }, { "epoch": 0.9648518263266712, "grad_norm": 7.37345027923584, "learning_rate": 8.57745614299798e-08, "logits/chosen": -2.7047462463378906, "logits/rejected": -2.7025909423828125, "logps/chosen": -93.438720703125, "logps/rejected": -108.75413513183594, "loss": 0.6515, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.40934810042381287, "rewards/margins": 0.10511176288127899, "rewards/rejected": -0.5144599080085754, "step": 5600 }, { "epoch": 0.9648518263266712, "eval_logits/chosen": -2.760322093963623, "eval_logits/rejected": -2.7542781829833984, "eval_logps/chosen": -89.83961486816406, "eval_logps/rejected": -100.77328491210938, "eval_loss": 0.666799008846283, "eval_rewards/accuracies": 0.6080390214920044, "eval_rewards/chosen": -0.3112771511077881, "eval_rewards/margins": 0.06465443223714828, "eval_rewards/rejected": -0.37593159079551697, "eval_runtime": 384.2448, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 5600 }, { "epoch": 0.9665747760165403, "grad_norm": 6.685023784637451, "learning_rate": 8.570445815599767e-08, "logits/chosen": -2.713970422744751, "logits/rejected": -2.6974878311157227, "logps/chosen": -91.72346496582031, "logps/rejected": -108.36578369140625, "loss": 0.6431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.39322951436042786, "rewards/margins": 0.1248430460691452, "rewards/rejected": -0.5180725455284119, "step": 5610 }, { "epoch": 0.9682977257064094, "grad_norm": 9.166258811950684, "learning_rate": 8.563421137139123e-08, "logits/chosen": -2.6391115188598633, "logits/rejected": -2.6187069416046143, "logps/chosen": -97.88960266113281, "logps/rejected": -102.65364837646484, "loss": 0.6581, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.42869919538497925, "rewards/margins": 0.09258860349655151, "rewards/rejected": -0.5212877988815308, "step": 5620 }, { "epoch": 0.9700206753962785, "grad_norm": 7.245903968811035, "learning_rate": 8.556382135851068e-08, "logits/chosen": -2.7014362812042236, "logits/rejected": -2.6737356185913086, "logps/chosen": -97.61897277832031, "logps/rejected": -99.8261489868164, "loss": 0.6703, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.41711297631263733, "rewards/margins": 0.06960290670394897, "rewards/rejected": -0.4867158830165863, "step": 5630 }, { "epoch": 0.9717436250861475, "grad_norm": 7.634911060333252, "learning_rate": 8.549328840028187e-08, "logits/chosen": -2.665235757827759, "logits/rejected": -2.651763916015625, "logps/chosen": -91.96749877929688, "logps/rejected": -104.59291076660156, "loss": 0.652, "rewards/accuracies": 0.65625, "rewards/chosen": -0.380215585231781, "rewards/margins": 0.10477562248706818, "rewards/rejected": -0.4849912226200104, "step": 5640 }, { "epoch": 0.9734665747760165, "grad_norm": 7.104193210601807, "learning_rate": 8.542261278020524e-08, "logits/chosen": -2.588043689727783, "logits/rejected": -2.580328941345215, "logps/chosen": -91.02288818359375, "logps/rejected": -104.02386474609375, "loss": 0.6457, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3901236653327942, "rewards/margins": 0.12807972729206085, "rewards/rejected": -0.5182033777236938, "step": 5650 }, { "epoch": 0.9751895244658856, "grad_norm": 7.564207553863525, "learning_rate": 8.535179478235461e-08, "logits/chosen": -2.596454620361328, "logits/rejected": -2.592784881591797, "logps/chosen": -92.5943832397461, "logps/rejected": -104.44905090332031, "loss": 0.6441, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4073053002357483, "rewards/margins": 0.1271769106388092, "rewards/rejected": -0.5344822406768799, "step": 5660 }, { "epoch": 0.9769124741557547, "grad_norm": 7.8629150390625, "learning_rate": 8.52808346913761e-08, "logits/chosen": -2.631941556930542, "logits/rejected": -2.612409830093384, "logps/chosen": -91.29837799072266, "logps/rejected": -102.11553955078125, "loss": 0.6438, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.37255948781967163, "rewards/margins": 0.12025030702352524, "rewards/rejected": -0.49280983209609985, "step": 5670 }, { "epoch": 0.9786354238456237, "grad_norm": 7.458493709564209, "learning_rate": 8.520973279248694e-08, "logits/chosen": -2.6747031211853027, "logits/rejected": -2.6367902755737305, "logps/chosen": -96.63722229003906, "logps/rejected": -105.88260650634766, "loss": 0.644, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4225873053073883, "rewards/margins": 0.11975307762622833, "rewards/rejected": -0.5423403382301331, "step": 5680 }, { "epoch": 0.9803583735354927, "grad_norm": 8.17159366607666, "learning_rate": 8.513848937147434e-08, "logits/chosen": -2.7038073539733887, "logits/rejected": -2.6744046211242676, "logps/chosen": -98.38480377197266, "logps/rejected": -104.3926010131836, "loss": 0.6422, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4038626551628113, "rewards/margins": 0.12340279668569565, "rewards/rejected": -0.5272655487060547, "step": 5690 }, { "epoch": 0.9820813232253618, "grad_norm": 9.138307571411133, "learning_rate": 8.506710471469438e-08, "logits/chosen": -2.5220253467559814, "logits/rejected": -2.50627064704895, "logps/chosen": -99.71697998046875, "logps/rejected": -106.26151275634766, "loss": 0.6512, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4193113446235657, "rewards/margins": 0.11023150384426117, "rewards/rejected": -0.5295428037643433, "step": 5700 }, { "epoch": 0.9820813232253618, "eval_logits/chosen": -2.749270439147949, "eval_logits/rejected": -2.7432141304016113, "eval_logps/chosen": -91.73849487304688, "eval_logps/rejected": -103.00379180908203, "eval_loss": 0.6657052636146545, "eval_rewards/accuracies": 0.609433114528656, "eval_rewards/chosen": -0.33026596903800964, "eval_rewards/margins": 0.0679706409573555, "eval_rewards/rejected": -0.39823657274246216, "eval_runtime": 383.8128, "eval_samples_per_second": 11.214, "eval_steps_per_second": 1.402, "step": 5700 }, { "epoch": 0.9838042729152309, "grad_norm": 8.108245849609375, "learning_rate": 8.499557910907078e-08, "logits/chosen": -2.6945862770080566, "logits/rejected": -2.668743848800659, "logps/chosen": -97.47035217285156, "logps/rejected": -108.9496841430664, "loss": 0.6387, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4235713481903076, "rewards/margins": 0.14009717106819153, "rewards/rejected": -0.5636684894561768, "step": 5710 }, { "epoch": 0.9855272226051, "grad_norm": 7.586965084075928, "learning_rate": 8.492391284209383e-08, "logits/chosen": -2.6646227836608887, "logits/rejected": -2.642545223236084, "logps/chosen": -93.4908447265625, "logps/rejected": -106.43135833740234, "loss": 0.6301, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4147884249687195, "rewards/margins": 0.15116877853870392, "rewards/rejected": -0.5659571886062622, "step": 5720 }, { "epoch": 0.987250172294969, "grad_norm": 7.231719017028809, "learning_rate": 8.485210620181915e-08, "logits/chosen": -2.7168619632720947, "logits/rejected": -2.7074432373046875, "logps/chosen": -93.1943359375, "logps/rejected": -104.40936279296875, "loss": 0.6497, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4146973490715027, "rewards/margins": 0.11056436598300934, "rewards/rejected": -0.5252617597579956, "step": 5730 }, { "epoch": 0.988973121984838, "grad_norm": 7.313248634338379, "learning_rate": 8.478015947686664e-08, "logits/chosen": -2.6228625774383545, "logits/rejected": -2.592677354812622, "logps/chosen": -105.03717041015625, "logps/rejected": -111.39955139160156, "loss": 0.6494, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4582861363887787, "rewards/margins": 0.11127295345067978, "rewards/rejected": -0.5695590972900391, "step": 5740 }, { "epoch": 0.9906960716747071, "grad_norm": 7.69813346862793, "learning_rate": 8.470807295641917e-08, "logits/chosen": -2.7569165229797363, "logits/rejected": -2.7294604778289795, "logps/chosen": -99.28314208984375, "logps/rejected": -102.55259704589844, "loss": 0.6556, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.42618900537490845, "rewards/margins": 0.09662672132253647, "rewards/rejected": -0.5228157043457031, "step": 5750 }, { "epoch": 0.9924190213645762, "grad_norm": 7.915000915527344, "learning_rate": 8.463584693022156e-08, "logits/chosen": -2.6352105140686035, "logits/rejected": -2.6096949577331543, "logps/chosen": -100.03324890136719, "logps/rejected": -110.30208587646484, "loss": 0.6532, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.436234712600708, "rewards/margins": 0.11109612137079239, "rewards/rejected": -0.5473308563232422, "step": 5760 }, { "epoch": 0.9941419710544452, "grad_norm": 8.061699867248535, "learning_rate": 8.45634816885794e-08, "logits/chosen": -2.593804121017456, "logits/rejected": -2.574969530105591, "logps/chosen": -90.83858489990234, "logps/rejected": -105.9139633178711, "loss": 0.6407, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.3898393511772156, "rewards/margins": 0.12882938981056213, "rewards/rejected": -0.5186687111854553, "step": 5770 }, { "epoch": 0.9958649207443143, "grad_norm": 7.79086446762085, "learning_rate": 8.449097752235776e-08, "logits/chosen": -2.6222667694091797, "logits/rejected": -2.602914810180664, "logps/chosen": -93.59446716308594, "logps/rejected": -108.5308609008789, "loss": 0.6383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4113089442253113, "rewards/margins": 0.1343192309141159, "rewards/rejected": -0.5456281900405884, "step": 5780 }, { "epoch": 0.9975878704341833, "grad_norm": 7.241783618927002, "learning_rate": 8.441833472298014e-08, "logits/chosen": -2.562025308609009, "logits/rejected": -2.5287890434265137, "logps/chosen": -89.9479751586914, "logps/rejected": -105.0750961303711, "loss": 0.6408, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3847387433052063, "rewards/margins": 0.12667948007583618, "rewards/rejected": -0.5114182233810425, "step": 5790 }, { "epoch": 0.9993108201240524, "grad_norm": 8.019474983215332, "learning_rate": 8.434555358242728e-08, "logits/chosen": -2.646048069000244, "logits/rejected": -2.618975877761841, "logps/chosen": -92.06868743896484, "logps/rejected": -108.29134368896484, "loss": 0.6323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.413178026676178, "rewards/margins": 0.14786291122436523, "rewards/rejected": -0.561040997505188, "step": 5800 }, { "epoch": 0.9993108201240524, "eval_logits/chosen": -2.731804370880127, "eval_logits/rejected": -2.7257096767425537, "eval_logps/chosen": -94.23040008544922, "eval_logps/rejected": -105.85838317871094, "eval_loss": 0.6645354628562927, "eval_rewards/accuracies": 0.6078066825866699, "eval_rewards/chosen": -0.3551850914955139, "eval_rewards/margins": 0.07159748673439026, "eval_rewards/rejected": -0.42678260803222656, "eval_runtime": 383.7476, "eval_samples_per_second": 11.216, "eval_steps_per_second": 1.402, "step": 5800 }, { "epoch": 1.0010337698139213, "grad_norm": 7.8858513832092285, "learning_rate": 8.427263439323593e-08, "logits/chosen": -2.7004971504211426, "logits/rejected": -2.6849634647369385, "logps/chosen": -97.8449478149414, "logps/rejected": -110.03662109375, "loss": 0.6421, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4377388060092926, "rewards/margins": 0.1266471892595291, "rewards/rejected": -0.5643860697746277, "step": 5810 }, { "epoch": 1.0027567195037905, "grad_norm": 6.8778276443481445, "learning_rate": 8.419957744849773e-08, "logits/chosen": -2.6962661743164062, "logits/rejected": -2.663400173187256, "logps/chosen": -97.3134765625, "logps/rejected": -110.54541015625, "loss": 0.6237, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.44287461042404175, "rewards/margins": 0.16928058862686157, "rewards/rejected": -0.6121551394462585, "step": 5820 }, { "epoch": 1.0044796691936595, "grad_norm": 8.916372299194336, "learning_rate": 8.412638304185805e-08, "logits/chosen": -2.5855021476745605, "logits/rejected": -2.5558366775512695, "logps/chosen": -96.43072509765625, "logps/rejected": -108.48006439208984, "loss": 0.6437, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.4293416142463684, "rewards/margins": 0.12681038677692413, "rewards/rejected": -0.5561521053314209, "step": 5830 }, { "epoch": 1.0062026188835287, "grad_norm": 6.786613941192627, "learning_rate": 8.405305146751472e-08, "logits/chosen": -2.6135430335998535, "logits/rejected": -2.5935873985290527, "logps/chosen": -94.95622253417969, "logps/rejected": -113.15129089355469, "loss": 0.6301, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4256817698478699, "rewards/margins": 0.16012795269489288, "rewards/rejected": -0.5858098268508911, "step": 5840 }, { "epoch": 1.0079255685733977, "grad_norm": 8.19491195678711, "learning_rate": 8.397958302021695e-08, "logits/chosen": -2.6712794303894043, "logits/rejected": -2.6515040397644043, "logps/chosen": -94.15184783935547, "logps/rejected": -115.56861877441406, "loss": 0.6198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.432370662689209, "rewards/margins": 0.1783347725868225, "rewards/rejected": -0.6107054352760315, "step": 5850 }, { "epoch": 1.0096485182632666, "grad_norm": 7.014339923858643, "learning_rate": 8.390597799526404e-08, "logits/chosen": -2.53800106048584, "logits/rejected": -2.528062343597412, "logps/chosen": -97.08416748046875, "logps/rejected": -115.81413269042969, "loss": 0.629, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4531080722808838, "rewards/margins": 0.17267939448356628, "rewards/rejected": -0.6257874965667725, "step": 5860 }, { "epoch": 1.0113714679531358, "grad_norm": 11.040721893310547, "learning_rate": 8.383223668850433e-08, "logits/chosen": -2.5746819972991943, "logits/rejected": -2.5552728176116943, "logps/chosen": -104.11962890625, "logps/rejected": -118.37178039550781, "loss": 0.6328, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.4793880581855774, "rewards/margins": 0.15406163036823273, "rewards/rejected": -0.6334496736526489, "step": 5870 }, { "epoch": 1.0130944176430048, "grad_norm": 8.925549507141113, "learning_rate": 8.375835939633384e-08, "logits/chosen": -2.64033842086792, "logits/rejected": -2.628830671310425, "logps/chosen": -98.64120483398438, "logps/rejected": -108.58721923828125, "loss": 0.6482, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.43899255990982056, "rewards/margins": 0.11557696759700775, "rewards/rejected": -0.5545695424079895, "step": 5880 }, { "epoch": 1.014817367332874, "grad_norm": 9.15612506866455, "learning_rate": 8.368434641569524e-08, "logits/chosen": -2.664888858795166, "logits/rejected": -2.6511969566345215, "logps/chosen": -100.8013687133789, "logps/rejected": -116.66251373291016, "loss": 0.643, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.45635995268821716, "rewards/margins": 0.12817351520061493, "rewards/rejected": -0.5845334529876709, "step": 5890 }, { "epoch": 1.016540317022743, "grad_norm": 8.083471298217773, "learning_rate": 8.361019804407657e-08, "logits/chosen": -2.5785346031188965, "logits/rejected": -2.5600547790527344, "logps/chosen": -103.21112060546875, "logps/rejected": -117.26570892333984, "loss": 0.632, "rewards/accuracies": 0.6875, "rewards/chosen": -0.47545090317726135, "rewards/margins": 0.15051282942295074, "rewards/rejected": -0.6259638071060181, "step": 5900 }, { "epoch": 1.016540317022743, "eval_logits/chosen": -2.708470344543457, "eval_logits/rejected": -2.702338695526123, "eval_logps/chosen": -97.82315826416016, "eval_logps/rejected": -109.99978637695312, "eval_loss": 0.6629430055618286, "eval_rewards/accuracies": 0.6085036993026733, "eval_rewards/chosen": -0.39111265540122986, "eval_rewards/margins": 0.07708395272493362, "eval_rewards/rejected": -0.4681966304779053, "eval_runtime": 384.8292, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 5900 }, { "epoch": 1.018263266712612, "grad_norm": 8.833287239074707, "learning_rate": 8.353591457951005e-08, "logits/chosen": -2.5715785026550293, "logits/rejected": -2.5744786262512207, "logps/chosen": -98.71360778808594, "logps/rejected": -114.59403991699219, "loss": 0.6514, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4620113968849182, "rewards/margins": 0.10676075518131256, "rewards/rejected": -0.5687721967697144, "step": 5910 }, { "epoch": 1.019986216402481, "grad_norm": 8.468814849853516, "learning_rate": 8.346149632057089e-08, "logits/chosen": -2.6088509559631348, "logits/rejected": -2.591217517852783, "logps/chosen": -98.80986022949219, "logps/rejected": -112.15071105957031, "loss": 0.6493, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.47326168417930603, "rewards/margins": 0.11317290365695953, "rewards/rejected": -0.5864346027374268, "step": 5920 }, { "epoch": 1.02170916609235, "grad_norm": 7.3569183349609375, "learning_rate": 8.338694356637612e-08, "logits/chosen": -2.627300500869751, "logits/rejected": -2.620506763458252, "logps/chosen": -102.12773132324219, "logps/rejected": -114.47517395019531, "loss": 0.6523, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5189951658248901, "rewards/margins": 0.1140846237540245, "rewards/rejected": -0.6330798268318176, "step": 5930 }, { "epoch": 1.0234321157822193, "grad_norm": 7.738087177276611, "learning_rate": 8.331225661658331e-08, "logits/chosen": -2.6179256439208984, "logits/rejected": -2.5811519622802734, "logps/chosen": -99.44174194335938, "logps/rejected": -114.23796081542969, "loss": 0.6291, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4520456790924072, "rewards/margins": 0.15967020392417908, "rewards/rejected": -0.6117158532142639, "step": 5940 }, { "epoch": 1.0251550654720882, "grad_norm": 8.31123161315918, "learning_rate": 8.323743577138949e-08, "logits/chosen": -2.4969635009765625, "logits/rejected": -2.492830991744995, "logps/chosen": -99.90968322753906, "logps/rejected": -111.66524505615234, "loss": 0.6477, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.462044894695282, "rewards/margins": 0.11489604413509369, "rewards/rejected": -0.5769410133361816, "step": 5950 }, { "epoch": 1.0268780151619572, "grad_norm": 7.500054359436035, "learning_rate": 8.316248133152979e-08, "logits/chosen": -2.556940793991089, "logits/rejected": -2.5101706981658936, "logps/chosen": -107.40179443359375, "logps/rejected": -110.4677734375, "loss": 0.6645, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.49078473448753357, "rewards/margins": 0.09588263928890228, "rewards/rejected": -0.5866674184799194, "step": 5960 }, { "epoch": 1.0286009648518264, "grad_norm": 8.492338180541992, "learning_rate": 8.308739359827636e-08, "logits/chosen": -2.539529800415039, "logits/rejected": -2.5230045318603516, "logps/chosen": -98.50740051269531, "logps/rejected": -112.32657623291016, "loss": 0.6321, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4661005139350891, "rewards/margins": 0.15801900625228882, "rewards/rejected": -0.6241195201873779, "step": 5970 }, { "epoch": 1.0303239145416954, "grad_norm": 8.059408187866211, "learning_rate": 8.301217287343709e-08, "logits/chosen": -2.5459558963775635, "logits/rejected": -2.545886278152466, "logps/chosen": -96.28752899169922, "logps/rejected": -116.68046569824219, "loss": 0.6263, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.43717512488365173, "rewards/margins": 0.16448244452476501, "rewards/rejected": -0.6016575694084167, "step": 5980 }, { "epoch": 1.0320468642315643, "grad_norm": 8.546866416931152, "learning_rate": 8.293681945935445e-08, "logits/chosen": -2.6126601696014404, "logits/rejected": -2.5796117782592773, "logps/chosen": -101.33241271972656, "logps/rejected": -111.3437271118164, "loss": 0.6419, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.4625963568687439, "rewards/margins": 0.13088804483413696, "rewards/rejected": -0.5934844017028809, "step": 5990 }, { "epoch": 1.0337698139214335, "grad_norm": 8.398924827575684, "learning_rate": 8.286133365890421e-08, "logits/chosen": -2.5918524265289307, "logits/rejected": -2.575507879257202, "logps/chosen": -100.43080139160156, "logps/rejected": -111.91226959228516, "loss": 0.654, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4832669794559479, "rewards/margins": 0.11311982572078705, "rewards/rejected": -0.5963867902755737, "step": 6000 }, { "epoch": 1.0337698139214335, "eval_logits/chosen": -2.696943998336792, "eval_logits/rejected": -2.6907052993774414, "eval_logps/chosen": -96.78338623046875, "eval_logps/rejected": -108.89262390136719, "eval_loss": 0.663216769695282, "eval_rewards/accuracies": 0.6075743436813354, "eval_rewards/chosen": -0.38071489334106445, "eval_rewards/margins": 0.07641009986400604, "eval_rewards/rejected": -0.4571249783039093, "eval_runtime": 385.0723, "eval_samples_per_second": 11.177, "eval_steps_per_second": 1.397, "step": 6000 }, { "epoch": 1.0354927636113025, "grad_norm": 10.142074584960938, "learning_rate": 8.278571577549425e-08, "logits/chosen": -2.597203254699707, "logits/rejected": -2.5821285247802734, "logps/chosen": -101.98750305175781, "logps/rejected": -109.36756896972656, "loss": 0.665, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.48952287435531616, "rewards/margins": 0.09009354561567307, "rewards/rejected": -0.5796164274215698, "step": 6010 }, { "epoch": 1.0372157133011717, "grad_norm": 8.646759986877441, "learning_rate": 8.270996611306335e-08, "logits/chosen": -2.743859052658081, "logits/rejected": -2.7142465114593506, "logps/chosen": -104.6760025024414, "logps/rejected": -110.75141906738281, "loss": 0.6587, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5028871297836304, "rewards/margins": 0.10563130676746368, "rewards/rejected": -0.6085184812545776, "step": 6020 }, { "epoch": 1.0389386629910407, "grad_norm": 7.222773551940918, "learning_rate": 8.263408497607998e-08, "logits/chosen": -2.4255988597869873, "logits/rejected": -2.413698196411133, "logps/chosen": -98.52910614013672, "logps/rejected": -110.16273498535156, "loss": 0.6539, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4869111180305481, "rewards/margins": 0.11155279725790024, "rewards/rejected": -0.5984638929367065, "step": 6030 }, { "epoch": 1.0406616126809096, "grad_norm": 8.066089630126953, "learning_rate": 8.255807266954104e-08, "logits/chosen": -2.606157064437866, "logits/rejected": -2.588623285293579, "logps/chosen": -99.98893737792969, "logps/rejected": -111.00299072265625, "loss": 0.6422, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.4628068804740906, "rewards/margins": 0.13045981526374817, "rewards/rejected": -0.5932666659355164, "step": 6040 }, { "epoch": 1.0423845623707788, "grad_norm": 8.780072212219238, "learning_rate": 8.248192949897068e-08, "logits/chosen": -2.5141348838806152, "logits/rejected": -2.490442991256714, "logps/chosen": -108.51566314697266, "logps/rejected": -117.81304931640625, "loss": 0.6322, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4917355477809906, "rewards/margins": 0.14940030872821808, "rewards/rejected": -0.6411358118057251, "step": 6050 }, { "epoch": 1.0441075120606478, "grad_norm": 8.989575386047363, "learning_rate": 8.2405655770419e-08, "logits/chosen": -2.573965549468994, "logits/rejected": -2.55254864692688, "logps/chosen": -102.88951110839844, "logps/rejected": -112.5423355102539, "loss": 0.6528, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.49394720792770386, "rewards/margins": 0.11183132976293564, "rewards/rejected": -0.6057785749435425, "step": 6060 }, { "epoch": 1.045830461750517, "grad_norm": 8.59586238861084, "learning_rate": 8.232925179046092e-08, "logits/chosen": -2.5845422744750977, "logits/rejected": -2.563185930252075, "logps/chosen": -100.42613220214844, "logps/rejected": -113.06158447265625, "loss": 0.6332, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.4648379683494568, "rewards/margins": 0.14855217933654785, "rewards/rejected": -0.6133901476860046, "step": 6070 }, { "epoch": 1.047553411440386, "grad_norm": 8.119184494018555, "learning_rate": 8.225271786619485e-08, "logits/chosen": -2.55905818939209, "logits/rejected": -2.540487766265869, "logps/chosen": -106.81014251708984, "logps/rejected": -115.96821594238281, "loss": 0.6355, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.47734540700912476, "rewards/margins": 0.1465647965669632, "rewards/rejected": -0.6239102482795715, "step": 6080 }, { "epoch": 1.049276361130255, "grad_norm": 8.115350723266602, "learning_rate": 8.217605430524151e-08, "logits/chosen": -2.6180989742279053, "logits/rejected": -2.5999579429626465, "logps/chosen": -96.67985534667969, "logps/rejected": -114.14349365234375, "loss": 0.6206, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.45271745324134827, "rewards/margins": 0.17520318925380707, "rewards/rejected": -0.6279206275939941, "step": 6090 }, { "epoch": 1.050999310820124, "grad_norm": 8.631158828735352, "learning_rate": 8.209926141574268e-08, "logits/chosen": -2.5867297649383545, "logits/rejected": -2.582693576812744, "logps/chosen": -107.13438415527344, "logps/rejected": -120.7203598022461, "loss": 0.6293, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5035117864608765, "rewards/margins": 0.1649848371744156, "rewards/rejected": -0.6684966087341309, "step": 6100 }, { "epoch": 1.050999310820124, "eval_logits/chosen": -2.683070182800293, "eval_logits/rejected": -2.6768126487731934, "eval_logps/chosen": -97.87681579589844, "eval_logps/rejected": -110.21141052246094, "eval_loss": 0.662380039691925, "eval_rewards/accuracies": 0.6110594868659973, "eval_rewards/chosen": -0.3916492164134979, "eval_rewards/margins": 0.07866359502077103, "eval_rewards/rejected": -0.4703127443790436, "eval_runtime": 384.7571, "eval_samples_per_second": 11.186, "eval_steps_per_second": 1.398, "step": 6100 }, { "epoch": 1.052722260509993, "grad_norm": 7.67395544052124, "learning_rate": 8.202233950635999e-08, "logits/chosen": -2.585909843444824, "logits/rejected": -2.571315050125122, "logps/chosen": -101.78653717041016, "logps/rejected": -119.69728088378906, "loss": 0.6296, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5008759498596191, "rewards/margins": 0.16121146082878113, "rewards/rejected": -0.6620875000953674, "step": 6110 }, { "epoch": 1.0544452101998623, "grad_norm": 8.022553443908691, "learning_rate": 8.194528888627361e-08, "logits/chosen": -2.6834945678710938, "logits/rejected": -2.6373043060302734, "logps/chosen": -101.11886596679688, "logps/rejected": -120.63822174072266, "loss": 0.6112, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4679715037345886, "rewards/margins": 0.2036353051662445, "rewards/rejected": -0.6716068387031555, "step": 6120 }, { "epoch": 1.0561681598897312, "grad_norm": 8.303075790405273, "learning_rate": 8.186810986518112e-08, "logits/chosen": -2.585390329360962, "logits/rejected": -2.5620946884155273, "logps/chosen": -103.96269226074219, "logps/rejected": -119.4062728881836, "loss": 0.6325, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.4957105219364166, "rewards/margins": 0.15889684855937958, "rewards/rejected": -0.6546074151992798, "step": 6130 }, { "epoch": 1.0578911095796002, "grad_norm": 9.664684295654297, "learning_rate": 8.179080275329606e-08, "logits/chosen": -2.6101136207580566, "logits/rejected": -2.597914934158325, "logps/chosen": -102.56831359863281, "logps/rejected": -117.04487609863281, "loss": 0.6383, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4948934018611908, "rewards/margins": 0.15076062083244324, "rewards/rejected": -0.645654022693634, "step": 6140 }, { "epoch": 1.0596140592694694, "grad_norm": 9.094680786132812, "learning_rate": 8.171336786134699e-08, "logits/chosen": -2.530261278152466, "logits/rejected": -2.5166783332824707, "logps/chosen": -104.24058532714844, "logps/rejected": -113.39418029785156, "loss": 0.6511, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5123268365859985, "rewards/margins": 0.12172627449035645, "rewards/rejected": -0.634053111076355, "step": 6150 }, { "epoch": 1.0613370089593384, "grad_norm": 9.177583694458008, "learning_rate": 8.163580550057596e-08, "logits/chosen": -2.4778246879577637, "logits/rejected": -2.46870493888855, "logps/chosen": -99.19271087646484, "logps/rejected": -113.863037109375, "loss": 0.6439, "rewards/accuracies": 0.625, "rewards/chosen": -0.4821690618991852, "rewards/margins": 0.1309712529182434, "rewards/rejected": -0.613140344619751, "step": 6160 }, { "epoch": 1.0630599586492075, "grad_norm": 7.216751575469971, "learning_rate": 8.155811598273737e-08, "logits/chosen": -2.601728916168213, "logits/rejected": -2.587202310562134, "logps/chosen": -115.6440200805664, "logps/rejected": -126.56358337402344, "loss": 0.6402, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5618337392807007, "rewards/margins": 0.13908319175243378, "rewards/rejected": -0.7009170651435852, "step": 6170 }, { "epoch": 1.0647829083390765, "grad_norm": 10.469382286071777, "learning_rate": 8.148029962009677e-08, "logits/chosen": -2.6094672679901123, "logits/rejected": -2.587141990661621, "logps/chosen": -109.9295654296875, "logps/rejected": -118.58846282958984, "loss": 0.6432, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5177440643310547, "rewards/margins": 0.13834531605243683, "rewards/rejected": -0.6560893654823303, "step": 6180 }, { "epoch": 1.0665058580289455, "grad_norm": 9.211211204528809, "learning_rate": 8.140235672542951e-08, "logits/chosen": -2.585750102996826, "logits/rejected": -2.562568187713623, "logps/chosen": -109.43128967285156, "logps/rejected": -122.4117431640625, "loss": 0.6279, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5125532150268555, "rewards/margins": 0.18153855204582214, "rewards/rejected": -0.6940917372703552, "step": 6190 }, { "epoch": 1.0682288077188147, "grad_norm": 8.883831024169922, "learning_rate": 8.132428761201953e-08, "logits/chosen": -2.4775471687316895, "logits/rejected": -2.4578866958618164, "logps/chosen": -105.61055755615234, "logps/rejected": -122.1505355834961, "loss": 0.6314, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5109606981277466, "rewards/margins": 0.15817254781723022, "rewards/rejected": -0.6691333055496216, "step": 6200 }, { "epoch": 1.0682288077188147, "eval_logits/chosen": -2.6697161197662354, "eval_logits/rejected": -2.6634604930877686, "eval_logps/chosen": -100.99473571777344, "eval_logps/rejected": -113.78132629394531, "eval_loss": 0.6610652208328247, "eval_rewards/accuracies": 0.6119888424873352, "eval_rewards/chosen": -0.4228283166885376, "eval_rewards/margins": 0.08318374305963516, "eval_rewards/rejected": -0.5060120820999146, "eval_runtime": 384.8283, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 6200 }, { "epoch": 1.0699517574086836, "grad_norm": 9.150965690612793, "learning_rate": 8.124609259365812e-08, "logits/chosen": -2.5794265270233154, "logits/rejected": -2.5522186756134033, "logps/chosen": -109.03935241699219, "logps/rejected": -122.3956069946289, "loss": 0.6352, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5421409606933594, "rewards/margins": 0.15285472571849823, "rewards/rejected": -0.6949957013130188, "step": 6210 }, { "epoch": 1.0716747070985528, "grad_norm": 8.55938720703125, "learning_rate": 8.116777198464257e-08, "logits/chosen": -2.5671470165252686, "logits/rejected": -2.548713207244873, "logps/chosen": -103.78585052490234, "logps/rejected": -118.60389709472656, "loss": 0.6366, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5164681673049927, "rewards/margins": 0.14410123229026794, "rewards/rejected": -0.6605693101882935, "step": 6220 }, { "epoch": 1.0733976567884218, "grad_norm": 7.881919860839844, "learning_rate": 8.108932609977504e-08, "logits/chosen": -2.701103448867798, "logits/rejected": -2.6826889514923096, "logps/chosen": -107.50831604003906, "logps/rejected": -129.65963745117188, "loss": 0.5966, "rewards/accuracies": 0.71875, "rewards/chosen": -0.48675599694252014, "rewards/margins": 0.24431733787059784, "rewards/rejected": -0.731073260307312, "step": 6230 }, { "epoch": 1.0751206064782908, "grad_norm": 9.516952514648438, "learning_rate": 8.101075525436121e-08, "logits/chosen": -2.5149810314178467, "logits/rejected": -2.4864091873168945, "logps/chosen": -108.58879089355469, "logps/rejected": -121.03299713134766, "loss": 0.6375, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5220197439193726, "rewards/margins": 0.15128469467163086, "rewards/rejected": -0.6733046174049377, "step": 6240 }, { "epoch": 1.07684355616816, "grad_norm": 16.308387756347656, "learning_rate": 8.093205976420896e-08, "logits/chosen": -2.600565195083618, "logits/rejected": -2.5785927772521973, "logps/chosen": -107.17005920410156, "logps/rejected": -115.45458984375, "loss": 0.6598, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5144227147102356, "rewards/margins": 0.10117814689874649, "rewards/rejected": -0.6156008839607239, "step": 6250 }, { "epoch": 1.078566505858029, "grad_norm": 9.17185115814209, "learning_rate": 8.085323994562727e-08, "logits/chosen": -2.4759507179260254, "logits/rejected": -2.4563660621643066, "logps/chosen": -110.73283386230469, "logps/rejected": -119.28691101074219, "loss": 0.6499, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5680841207504272, "rewards/margins": 0.1209753155708313, "rewards/rejected": -0.6890594363212585, "step": 6260 }, { "epoch": 1.080289455547898, "grad_norm": 8.293094635009766, "learning_rate": 8.077429611542476e-08, "logits/chosen": -2.7337212562561035, "logits/rejected": -2.7329859733581543, "logps/chosen": -107.32122802734375, "logps/rejected": -123.2201919555664, "loss": 0.6401, "rewards/accuracies": 0.625, "rewards/chosen": -0.5442054271697998, "rewards/margins": 0.1416146159172058, "rewards/rejected": -0.6858199834823608, "step": 6270 }, { "epoch": 1.082012405237767, "grad_norm": 12.452573776245117, "learning_rate": 8.069522859090856e-08, "logits/chosen": -2.428518772125244, "logits/rejected": -2.4068827629089355, "logps/chosen": -108.6520767211914, "logps/rejected": -119.85020446777344, "loss": 0.6463, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5423984527587891, "rewards/margins": 0.12967154383659363, "rewards/rejected": -0.6720700263977051, "step": 6280 }, { "epoch": 1.083735354927636, "grad_norm": 8.479166984558105, "learning_rate": 8.061603768988294e-08, "logits/chosen": -2.511026620864868, "logits/rejected": -2.4835691452026367, "logps/chosen": -102.7119369506836, "logps/rejected": -120.36119079589844, "loss": 0.6294, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5133351683616638, "rewards/margins": 0.1665147989988327, "rewards/rejected": -0.6798499822616577, "step": 6290 }, { "epoch": 1.0854583046175053, "grad_norm": 10.123178482055664, "learning_rate": 8.053672373064811e-08, "logits/chosen": -2.5713682174682617, "logits/rejected": -2.5471014976501465, "logps/chosen": -110.4374771118164, "logps/rejected": -121.81849670410156, "loss": 0.6526, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.580012321472168, "rewards/margins": 0.11772135645151138, "rewards/rejected": -0.6977337002754211, "step": 6300 }, { "epoch": 1.0854583046175053, "eval_logits/chosen": -2.659346342086792, "eval_logits/rejected": -2.652965784072876, "eval_logps/chosen": -102.64820861816406, "eval_logps/rejected": -115.80354309082031, "eval_loss": 0.6599147915840149, "eval_rewards/accuracies": 0.6145446300506592, "eval_rewards/chosen": -0.4393632113933563, "eval_rewards/margins": 0.08687097579240799, "eval_rewards/rejected": -0.5262341499328613, "eval_runtime": 384.7477, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 6300 }, { "epoch": 1.0871812543073742, "grad_norm": 10.9315185546875, "learning_rate": 8.045728703199885e-08, "logits/chosen": -2.554922580718994, "logits/rejected": -2.527207851409912, "logps/chosen": -102.4748306274414, "logps/rejected": -114.82203674316406, "loss": 0.6545, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.520179271697998, "rewards/margins": 0.11252252757549286, "rewards/rejected": -0.6327018141746521, "step": 6310 }, { "epoch": 1.0889042039972432, "grad_norm": 9.028841018676758, "learning_rate": 8.037772791322331e-08, "logits/chosen": -2.515536069869995, "logits/rejected": -2.4959588050842285, "logps/chosen": -110.16471862792969, "logps/rejected": -122.38230895996094, "loss": 0.644, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5544276833534241, "rewards/margins": 0.13615959882736206, "rewards/rejected": -0.6905871629714966, "step": 6320 }, { "epoch": 1.0906271536871124, "grad_norm": 9.55019760131836, "learning_rate": 8.029804669410171e-08, "logits/chosen": -2.5275192260742188, "logits/rejected": -2.506976366043091, "logps/chosen": -104.4784927368164, "logps/rejected": -127.18218994140625, "loss": 0.604, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5118848085403442, "rewards/margins": 0.2238554060459137, "rewards/rejected": -0.7357402443885803, "step": 6330 }, { "epoch": 1.0923501033769814, "grad_norm": 8.612353324890137, "learning_rate": 8.0218243694905e-08, "logits/chosen": -2.5484752655029297, "logits/rejected": -2.527956247329712, "logps/chosen": -107.1147232055664, "logps/rejected": -118.55269622802734, "loss": 0.6393, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5152736306190491, "rewards/margins": 0.14164672791957855, "rewards/rejected": -0.6569203734397888, "step": 6340 }, { "epoch": 1.0940730530668505, "grad_norm": 8.498324394226074, "learning_rate": 8.013831923639363e-08, "logits/chosen": -2.511718511581421, "logits/rejected": -2.504115104675293, "logps/chosen": -107.2218017578125, "logps/rejected": -124.30360412597656, "loss": 0.63, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5170698761940002, "rewards/margins": 0.1629563570022583, "rewards/rejected": -0.6800262928009033, "step": 6350 }, { "epoch": 1.0957960027567195, "grad_norm": 10.749839782714844, "learning_rate": 8.005827363981626e-08, "logits/chosen": -2.533294916152954, "logits/rejected": -2.529582977294922, "logps/chosen": -103.89247131347656, "logps/rejected": -121.03326416015625, "loss": 0.6324, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5042386651039124, "rewards/margins": 0.16025623679161072, "rewards/rejected": -0.6644949316978455, "step": 6360 }, { "epoch": 1.0975189524465885, "grad_norm": 10.076329231262207, "learning_rate": 7.997810722690845e-08, "logits/chosen": -2.5595498085021973, "logits/rejected": -2.557542324066162, "logps/chosen": -105.4200668334961, "logps/rejected": -119.04295349121094, "loss": 0.6518, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5227575302124023, "rewards/margins": 0.12325102090835571, "rewards/rejected": -0.6460086107254028, "step": 6370 }, { "epoch": 1.0992419021364577, "grad_norm": 10.629008293151855, "learning_rate": 7.989782031989135e-08, "logits/chosen": -2.5922961235046387, "logits/rejected": -2.580679416656494, "logps/chosen": -108.01606750488281, "logps/rejected": -127.7287826538086, "loss": 0.6325, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5528152585029602, "rewards/margins": 0.15926948189735413, "rewards/rejected": -0.7120847105979919, "step": 6380 }, { "epoch": 1.1009648518263266, "grad_norm": 8.468249320983887, "learning_rate": 7.981741324147043e-08, "logits/chosen": -2.6312594413757324, "logits/rejected": -2.5986592769622803, "logps/chosen": -108.42940521240234, "logps/rejected": -120.1337890625, "loss": 0.6321, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5234563946723938, "rewards/margins": 0.16786018013954163, "rewards/rejected": -0.6913164854049683, "step": 6390 }, { "epoch": 1.1026878015161956, "grad_norm": 9.646410942077637, "learning_rate": 7.973688631483421e-08, "logits/chosen": -2.567979335784912, "logits/rejected": -2.545125961303711, "logps/chosen": -109.34147644042969, "logps/rejected": -123.15412902832031, "loss": 0.6347, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5164808034896851, "rewards/margins": 0.14966382086277008, "rewards/rejected": -0.6661446690559387, "step": 6400 }, { "epoch": 1.1026878015161956, "eval_logits/chosen": -2.6499454975128174, "eval_logits/rejected": -2.6435108184814453, "eval_logps/chosen": -102.65230560302734, "eval_logps/rejected": -115.96501922607422, "eval_loss": 0.6592952013015747, "eval_rewards/accuracies": 0.6180297136306763, "eval_rewards/chosen": -0.43940412998199463, "eval_rewards/margins": 0.08844489604234695, "eval_rewards/rejected": -0.527849018573761, "eval_runtime": 384.9143, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 6400 }, { "epoch": 1.1044107512060648, "grad_norm": 10.946127891540527, "learning_rate": 7.965623986365286e-08, "logits/chosen": -2.651127576828003, "logits/rejected": -2.6272809505462646, "logps/chosen": -109.9125747680664, "logps/rejected": -122.58573913574219, "loss": 0.6443, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5554816126823425, "rewards/margins": 0.15297892689704895, "rewards/rejected": -0.7084606289863586, "step": 6410 }, { "epoch": 1.1061337008959338, "grad_norm": 9.975909233093262, "learning_rate": 7.957547421207705e-08, "logits/chosen": -2.5911028385162354, "logits/rejected": -2.574113368988037, "logps/chosen": -110.55873107910156, "logps/rejected": -122.85896301269531, "loss": 0.6464, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5576825737953186, "rewards/margins": 0.13165812194347382, "rewards/rejected": -0.6893406510353088, "step": 6420 }, { "epoch": 1.107856650585803, "grad_norm": 10.608201026916504, "learning_rate": 7.949458968473649e-08, "logits/chosen": -2.5030407905578613, "logits/rejected": -2.489295721054077, "logps/chosen": -102.52950286865234, "logps/rejected": -108.9786605834961, "loss": 0.6648, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.5213794708251953, "rewards/margins": 0.0860557034611702, "rewards/rejected": -0.6074351668357849, "step": 6430 }, { "epoch": 1.109579600275672, "grad_norm": 9.387601852416992, "learning_rate": 7.941358660673876e-08, "logits/chosen": -2.5138039588928223, "logits/rejected": -2.497356653213501, "logps/chosen": -109.52412414550781, "logps/rejected": -121.5171127319336, "loss": 0.654, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5626505017280579, "rewards/margins": 0.12059362977743149, "rewards/rejected": -0.6832441091537476, "step": 6440 }, { "epoch": 1.111302549965541, "grad_norm": 9.151935577392578, "learning_rate": 7.933246530366788e-08, "logits/chosen": -2.527308225631714, "logits/rejected": -2.4958131313323975, "logps/chosen": -108.6585464477539, "logps/rejected": -122.76898193359375, "loss": 0.6279, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5233389139175415, "rewards/margins": 0.18206921219825745, "rewards/rejected": -0.7054081559181213, "step": 6450 }, { "epoch": 1.11302549965541, "grad_norm": 10.981696128845215, "learning_rate": 7.925122610158315e-08, "logits/chosen": -2.4835329055786133, "logits/rejected": -2.4842262268066406, "logps/chosen": -105.540771484375, "logps/rejected": -135.99533081054688, "loss": 0.5967, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5366493463516235, "rewards/margins": 0.24407310783863068, "rewards/rejected": -0.780722439289093, "step": 6460 }, { "epoch": 1.114748449345279, "grad_norm": 8.487412452697754, "learning_rate": 7.916986932701766e-08, "logits/chosen": -2.455601692199707, "logits/rejected": -2.434135675430298, "logps/chosen": -104.88807678222656, "logps/rejected": -118.6629638671875, "loss": 0.6421, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5212485790252686, "rewards/margins": 0.1477835327386856, "rewards/rejected": -0.6690321564674377, "step": 6470 }, { "epoch": 1.1164713990351482, "grad_norm": 10.79360294342041, "learning_rate": 7.908839530697713e-08, "logits/chosen": -2.5401716232299805, "logits/rejected": -2.5057597160339355, "logps/chosen": -106.2930679321289, "logps/rejected": -115.5504379272461, "loss": 0.6353, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.49724236130714417, "rewards/margins": 0.15368565917015076, "rewards/rejected": -0.6509280204772949, "step": 6480 }, { "epoch": 1.1181943487250172, "grad_norm": 9.632614135742188, "learning_rate": 7.900680436893852e-08, "logits/chosen": -2.7056424617767334, "logits/rejected": -2.6940414905548096, "logps/chosen": -110.35343170166016, "logps/rejected": -124.7533950805664, "loss": 0.6427, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5673316717147827, "rewards/margins": 0.13890740275382996, "rewards/rejected": -0.7062389850616455, "step": 6490 }, { "epoch": 1.1199172984148862, "grad_norm": 10.136536598205566, "learning_rate": 7.892509684084874e-08, "logits/chosen": -2.5503296852111816, "logits/rejected": -2.5431060791015625, "logps/chosen": -111.2696762084961, "logps/rejected": -122.97188568115234, "loss": 0.6393, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5506294369697571, "rewards/margins": 0.15197066962718964, "rewards/rejected": -0.7026001214981079, "step": 6500 }, { "epoch": 1.1199172984148862, "eval_logits/chosen": -2.6354048252105713, "eval_logits/rejected": -2.62886643409729, "eval_logps/chosen": -103.39319610595703, "eval_logps/rejected": -116.8753890991211, "eval_loss": 0.6587897539138794, "eval_rewards/accuracies": 0.6238383054733276, "eval_rewards/chosen": -0.44681301712989807, "eval_rewards/margins": 0.09013961255550385, "eval_rewards/rejected": -0.5369526743888855, "eval_runtime": 384.2181, "eval_samples_per_second": 11.202, "eval_steps_per_second": 1.4, "step": 6500 }, { "epoch": 1.1216402481047554, "grad_norm": 9.954007148742676, "learning_rate": 7.884327305112332e-08, "logits/chosen": -2.5616657733917236, "logits/rejected": -2.5188605785369873, "logps/chosen": -110.81083679199219, "logps/rejected": -122.8641586303711, "loss": 0.6338, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5402653813362122, "rewards/margins": 0.15828846395015717, "rewards/rejected": -0.6985538601875305, "step": 6510 }, { "epoch": 1.1233631977946243, "grad_norm": 11.675725936889648, "learning_rate": 7.876133332864505e-08, "logits/chosen": -2.5257201194763184, "logits/rejected": -2.5030903816223145, "logps/chosen": -104.782470703125, "logps/rejected": -115.98095703125, "loss": 0.6443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5117102861404419, "rewards/margins": 0.13352252542972565, "rewards/rejected": -0.6452327370643616, "step": 6520 }, { "epoch": 1.1250861474844935, "grad_norm": 10.296143531799316, "learning_rate": 7.86792780027628e-08, "logits/chosen": -2.4846608638763428, "logits/rejected": -2.457932710647583, "logps/chosen": -107.0318832397461, "logps/rejected": -123.57948303222656, "loss": 0.6209, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5340791940689087, "rewards/margins": 0.18727423250675201, "rewards/rejected": -0.7213534116744995, "step": 6530 }, { "epoch": 1.1268090971743625, "grad_norm": 7.869551181793213, "learning_rate": 7.859710740328998e-08, "logits/chosen": -2.5307869911193848, "logits/rejected": -2.5049257278442383, "logps/chosen": -117.2274398803711, "logps/rejected": -129.9869384765625, "loss": 0.6452, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5985310077667236, "rewards/margins": 0.13828882575035095, "rewards/rejected": -0.7368198037147522, "step": 6540 }, { "epoch": 1.1285320468642315, "grad_norm": 10.557644844055176, "learning_rate": 7.85148218605034e-08, "logits/chosen": -2.4480504989624023, "logits/rejected": -2.4262728691101074, "logps/chosen": -106.24981689453125, "logps/rejected": -119.52680969238281, "loss": 0.6498, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5470387935638428, "rewards/margins": 0.1312585175037384, "rewards/rejected": -0.6782972812652588, "step": 6550 }, { "epoch": 1.1302549965541007, "grad_norm": 10.761880874633789, "learning_rate": 7.843242170514187e-08, "logits/chosen": -2.5584206581115723, "logits/rejected": -2.520704984664917, "logps/chosen": -109.3178939819336, "logps/rejected": -124.63825988769531, "loss": 0.6199, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5331367254257202, "rewards/margins": 0.19460919499397278, "rewards/rejected": -0.7277458906173706, "step": 6560 }, { "epoch": 1.1319779462439696, "grad_norm": 10.68869686126709, "learning_rate": 7.834990726840485e-08, "logits/chosen": -2.552407741546631, "logits/rejected": -2.5239217281341553, "logps/chosen": -107.8479232788086, "logps/rejected": -123.52923583984375, "loss": 0.6292, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5428390502929688, "rewards/margins": 0.16548021137714386, "rewards/rejected": -0.7083193063735962, "step": 6570 }, { "epoch": 1.1337008959338388, "grad_norm": 9.134474754333496, "learning_rate": 7.826727888195118e-08, "logits/chosen": -2.5608503818511963, "logits/rejected": -2.525477409362793, "logps/chosen": -109.8907241821289, "logps/rejected": -116.34696960449219, "loss": 0.6521, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5486241579055786, "rewards/margins": 0.11482731997966766, "rewards/rejected": -0.6634514331817627, "step": 6580 }, { "epoch": 1.1354238456237078, "grad_norm": 11.286917686462402, "learning_rate": 7.818453687789766e-08, "logits/chosen": -2.5129857063293457, "logits/rejected": -2.491657018661499, "logps/chosen": -105.63932800292969, "logps/rejected": -121.3639144897461, "loss": 0.6369, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5279938578605652, "rewards/margins": 0.15083569288253784, "rewards/rejected": -0.678829550743103, "step": 6590 }, { "epoch": 1.1371467953135768, "grad_norm": 10.29550838470459, "learning_rate": 7.81016815888178e-08, "logits/chosen": -2.6027557849884033, "logits/rejected": -2.5872511863708496, "logps/chosen": -108.52693939208984, "logps/rejected": -121.42665100097656, "loss": 0.6374, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5289459824562073, "rewards/margins": 0.14944204688072205, "rewards/rejected": -0.6783880591392517, "step": 6600 }, { "epoch": 1.1371467953135768, "eval_logits/chosen": -2.628943681716919, "eval_logits/rejected": -2.6224958896636963, "eval_logps/chosen": -103.72373962402344, "eval_logps/rejected": -117.20513153076172, "eval_loss": 0.6590405702590942, "eval_rewards/accuracies": 0.6166356801986694, "eval_rewards/chosen": -0.45011845231056213, "eval_rewards/margins": 0.09013160318136215, "eval_rewards/rejected": -0.5402500629425049, "eval_runtime": 384.8596, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 6600 }, { "epoch": 1.138869745003446, "grad_norm": 9.680298805236816, "learning_rate": 7.801871334774045e-08, "logits/chosen": -2.5576517581939697, "logits/rejected": -2.5401768684387207, "logps/chosen": -105.10823822021484, "logps/rejected": -122.4234390258789, "loss": 0.6249, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5314202308654785, "rewards/margins": 0.17527267336845398, "rewards/rejected": -0.7066928744316101, "step": 6610 }, { "epoch": 1.140592694693315, "grad_norm": 10.470937728881836, "learning_rate": 7.793563248814843e-08, "logits/chosen": -2.520463705062866, "logits/rejected": -2.4994704723358154, "logps/chosen": -112.21296691894531, "logps/rejected": -125.2126235961914, "loss": 0.6491, "rewards/accuracies": 0.625, "rewards/chosen": -0.5982210636138916, "rewards/margins": 0.1323176473379135, "rewards/rejected": -0.7305387854576111, "step": 6620 }, { "epoch": 1.1423156443831841, "grad_norm": 9.155440330505371, "learning_rate": 7.785243934397725e-08, "logits/chosen": -2.493478536605835, "logits/rejected": -2.4693892002105713, "logps/chosen": -107.62077331542969, "logps/rejected": -114.66853332519531, "loss": 0.6543, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5430929064750671, "rewards/margins": 0.11166892200708389, "rewards/rejected": -0.654761791229248, "step": 6630 }, { "epoch": 1.144038594073053, "grad_norm": 8.96811580657959, "learning_rate": 7.776913424961374e-08, "logits/chosen": -2.5499300956726074, "logits/rejected": -2.5163116455078125, "logps/chosen": -110.06022644042969, "logps/rejected": -119.46247863769531, "loss": 0.6492, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5695438385009766, "rewards/margins": 0.12909965217113495, "rewards/rejected": -0.6986435055732727, "step": 6640 }, { "epoch": 1.145761543762922, "grad_norm": 11.784492492675781, "learning_rate": 7.768571753989465e-08, "logits/chosen": -2.590853214263916, "logits/rejected": -2.5694775581359863, "logps/chosen": -105.77310943603516, "logps/rejected": -126.59233093261719, "loss": 0.6203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5365957617759705, "rewards/margins": 0.19579224288463593, "rewards/rejected": -0.7323880791664124, "step": 6650 }, { "epoch": 1.1474844934527912, "grad_norm": 9.328380584716797, "learning_rate": 7.760218955010542e-08, "logits/chosen": -2.6106936931610107, "logits/rejected": -2.5931646823883057, "logps/chosen": -104.96159362792969, "logps/rejected": -125.2098617553711, "loss": 0.6238, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5320087671279907, "rewards/margins": 0.1784505844116211, "rewards/rejected": -0.7104593515396118, "step": 6660 }, { "epoch": 1.1492074431426602, "grad_norm": 10.923369407653809, "learning_rate": 7.751855061597875e-08, "logits/chosen": -2.5037078857421875, "logits/rejected": -2.504495620727539, "logps/chosen": -107.7574462890625, "logps/rejected": -136.79908752441406, "loss": 0.6026, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5785056948661804, "rewards/margins": 0.2365662157535553, "rewards/rejected": -0.8150718808174133, "step": 6670 }, { "epoch": 1.1509303928325294, "grad_norm": 11.105286598205566, "learning_rate": 7.743480107369324e-08, "logits/chosen": -2.5104610919952393, "logits/rejected": -2.4818921089172363, "logps/chosen": -109.00919342041016, "logps/rejected": -121.95619201660156, "loss": 0.6388, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5518396496772766, "rewards/margins": 0.14706751704216003, "rewards/rejected": -0.6989072561264038, "step": 6680 }, { "epoch": 1.1526533425223984, "grad_norm": 9.123054504394531, "learning_rate": 7.735094125987214e-08, "logits/chosen": -2.515434980392456, "logits/rejected": -2.495396375656128, "logps/chosen": -108.50569152832031, "logps/rejected": -129.08914184570312, "loss": 0.6054, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5134601593017578, "rewards/margins": 0.21908187866210938, "rewards/rejected": -0.7325420379638672, "step": 6690 }, { "epoch": 1.1543762922122673, "grad_norm": 11.098625183105469, "learning_rate": 7.726697151158183e-08, "logits/chosen": -2.5144548416137695, "logits/rejected": -2.5041587352752686, "logps/chosen": -113.06412506103516, "logps/rejected": -128.99073791503906, "loss": 0.6359, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5693588256835938, "rewards/margins": 0.1615733802318573, "rewards/rejected": -0.7309322357177734, "step": 6700 }, { "epoch": 1.1543762922122673, "eval_logits/chosen": -2.6122851371765137, "eval_logits/rejected": -2.605778455734253, "eval_logps/chosen": -105.39387512207031, "eval_logps/rejected": -119.2261734008789, "eval_loss": 0.6580765247344971, "eval_rewards/accuracies": 0.6189591288566589, "eval_rewards/chosen": -0.46681979298591614, "eval_rewards/margins": 0.09364067763090134, "eval_rewards/rejected": -0.5604605078697205, "eval_runtime": 384.4979, "eval_samples_per_second": 11.194, "eval_steps_per_second": 1.399, "step": 6700 }, { "epoch": 1.1560992419021365, "grad_norm": 7.834842205047607, "learning_rate": 7.718289216633063e-08, "logits/chosen": -2.507887363433838, "logits/rejected": -2.4832241535186768, "logps/chosen": -112.9707260131836, "logps/rejected": -130.0009002685547, "loss": 0.625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5604552030563354, "rewards/margins": 0.20407012104988098, "rewards/rejected": -0.764525294303894, "step": 6710 }, { "epoch": 1.1578221915920055, "grad_norm": 10.156352043151855, "learning_rate": 7.709870356206736e-08, "logits/chosen": -2.496110677719116, "logits/rejected": -2.481423854827881, "logps/chosen": -105.4066390991211, "logps/rejected": -127.34352111816406, "loss": 0.6111, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5399174690246582, "rewards/margins": 0.2180265188217163, "rewards/rejected": -0.7579440474510193, "step": 6720 }, { "epoch": 1.1595451412818747, "grad_norm": 11.109990119934082, "learning_rate": 7.701440603718e-08, "logits/chosen": -2.456390857696533, "logits/rejected": -2.441084384918213, "logps/chosen": -113.7673110961914, "logps/rejected": -126.12129211425781, "loss": 0.645, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5706645250320435, "rewards/margins": 0.13778047263622284, "rewards/rejected": -0.7084449529647827, "step": 6730 }, { "epoch": 1.1612680909717437, "grad_norm": 13.250123977661133, "learning_rate": 7.692999993049429e-08, "logits/chosen": -2.5170092582702637, "logits/rejected": -2.508409023284912, "logps/chosen": -110.7908706665039, "logps/rejected": -132.03346252441406, "loss": 0.6173, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5635769963264465, "rewards/margins": 0.19584734737873077, "rewards/rejected": -0.7594243884086609, "step": 6740 }, { "epoch": 1.1629910406616126, "grad_norm": 10.663148880004883, "learning_rate": 7.684548558127247e-08, "logits/chosen": -2.550809144973755, "logits/rejected": -2.536221742630005, "logps/chosen": -112.7601547241211, "logps/rejected": -133.10244750976562, "loss": 0.6243, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6030198335647583, "rewards/margins": 0.19112975895404816, "rewards/rejected": -0.7941495776176453, "step": 6750 }, { "epoch": 1.1647139903514818, "grad_norm": 11.121389389038086, "learning_rate": 7.676086332921176e-08, "logits/chosen": -2.4977035522460938, "logits/rejected": -2.472046375274658, "logps/chosen": -109.84000396728516, "logps/rejected": -123.44218444824219, "loss": 0.6353, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5778656601905823, "rewards/margins": 0.16918626427650452, "rewards/rejected": -0.7470518946647644, "step": 6760 }, { "epoch": 1.1664369400413508, "grad_norm": 10.351583480834961, "learning_rate": 7.667613351444318e-08, "logits/chosen": -2.514172077178955, "logits/rejected": -2.506831645965576, "logps/chosen": -109.45332336425781, "logps/rejected": -130.4525909423828, "loss": 0.6194, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5561839938163757, "rewards/margins": 0.19203853607177734, "rewards/rejected": -0.7482224702835083, "step": 6770 }, { "epoch": 1.1681598897312198, "grad_norm": 12.578771591186523, "learning_rate": 7.659129647753002e-08, "logits/chosen": -2.502161741256714, "logits/rejected": -2.4726710319519043, "logps/chosen": -118.43953704833984, "logps/rejected": -128.36976623535156, "loss": 0.6512, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6193763017654419, "rewards/margins": 0.13105718791484833, "rewards/rejected": -0.7504335641860962, "step": 6780 }, { "epoch": 1.169882839421089, "grad_norm": 10.060388565063477, "learning_rate": 7.650635255946658e-08, "logits/chosen": -2.5180985927581787, "logits/rejected": -2.502481460571289, "logps/chosen": -106.8270492553711, "logps/rejected": -129.9012908935547, "loss": 0.6153, "rewards/accuracies": 0.75, "rewards/chosen": -0.5630310773849487, "rewards/margins": 0.20478780567646027, "rewards/rejected": -0.7678189277648926, "step": 6790 }, { "epoch": 1.171605789110958, "grad_norm": 7.966916084289551, "learning_rate": 7.642130210167673e-08, "logits/chosen": -2.432468891143799, "logits/rejected": -2.4055893421173096, "logps/chosen": -110.31768798828125, "logps/rejected": -128.74017333984375, "loss": 0.6146, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5789932012557983, "rewards/margins": 0.20744936168193817, "rewards/rejected": -0.7864425182342529, "step": 6800 }, { "epoch": 1.171605789110958, "eval_logits/chosen": -2.5937321186065674, "eval_logits/rejected": -2.5870144367218018, "eval_logps/chosen": -108.64956665039062, "eval_logps/rejected": -122.98480987548828, "eval_loss": 0.656653642654419, "eval_rewards/accuracies": 0.6173326969146729, "eval_rewards/chosen": -0.4993767738342285, "eval_rewards/margins": 0.09867008030414581, "eval_rewards/rejected": -0.5980467796325684, "eval_runtime": 384.4346, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.399, "step": 6800 }, { "epoch": 1.173328738800827, "grad_norm": 9.390850067138672, "learning_rate": 7.633614544601257e-08, "logits/chosen": -2.486985921859741, "logits/rejected": -2.469143867492676, "logps/chosen": -114.038818359375, "logps/rejected": -131.0915985107422, "loss": 0.618, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5833514332771301, "rewards/margins": 0.21009714901447296, "rewards/rejected": -0.7934485673904419, "step": 6810 }, { "epoch": 1.175051688490696, "grad_norm": 12.063318252563477, "learning_rate": 7.625088293475308e-08, "logits/chosen": -2.580454111099243, "logits/rejected": -2.5531787872314453, "logps/chosen": -116.632568359375, "logps/rejected": -129.03273010253906, "loss": 0.6416, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.616090714931488, "rewards/margins": 0.14406165480613708, "rewards/rejected": -0.7601524591445923, "step": 6820 }, { "epoch": 1.176774638180565, "grad_norm": 10.357357025146484, "learning_rate": 7.61655149106027e-08, "logits/chosen": -2.5899417400360107, "logits/rejected": -2.5826337337493896, "logps/chosen": -111.27880859375, "logps/rejected": -126.9948959350586, "loss": 0.6373, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5737534761428833, "rewards/margins": 0.16606983542442322, "rewards/rejected": -0.7398232817649841, "step": 6830 }, { "epoch": 1.1784975878704342, "grad_norm": 10.03347396850586, "learning_rate": 7.608004171668994e-08, "logits/chosen": -2.542518377304077, "logits/rejected": -2.519183874130249, "logps/chosen": -118.90779113769531, "logps/rejected": -134.30470275878906, "loss": 0.6401, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6197718977928162, "rewards/margins": 0.1692858189344406, "rewards/rejected": -0.789057731628418, "step": 6840 }, { "epoch": 1.1802205375603032, "grad_norm": 9.831806182861328, "learning_rate": 7.599446369656608e-08, "logits/chosen": -2.4035212993621826, "logits/rejected": -2.374490976333618, "logps/chosen": -113.41793060302734, "logps/rejected": -131.79440307617188, "loss": 0.626, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5925357937812805, "rewards/margins": 0.18899916112422943, "rewards/rejected": -0.7815349698066711, "step": 6850 }, { "epoch": 1.1819434872501722, "grad_norm": 11.192511558532715, "learning_rate": 7.59087811942037e-08, "logits/chosen": -2.5101075172424316, "logits/rejected": -2.4764935970306396, "logps/chosen": -122.56880187988281, "logps/rejected": -133.31004333496094, "loss": 0.6291, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6345411539077759, "rewards/margins": 0.17534251511096954, "rewards/rejected": -0.8098835945129395, "step": 6860 }, { "epoch": 1.1836664369400414, "grad_norm": 11.095505714416504, "learning_rate": 7.582299455399536e-08, "logits/chosen": -2.4207568168640137, "logits/rejected": -2.4096591472625732, "logps/chosen": -114.78059387207031, "logps/rejected": -132.53634643554688, "loss": 0.6308, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6369515657424927, "rewards/margins": 0.17071498930454254, "rewards/rejected": -0.8076664805412292, "step": 6870 }, { "epoch": 1.1853893866299103, "grad_norm": 10.605700492858887, "learning_rate": 7.573710412075218e-08, "logits/chosen": -2.5224595069885254, "logits/rejected": -2.4933252334594727, "logps/chosen": -118.11952209472656, "logps/rejected": -133.68777465820312, "loss": 0.6266, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6385660171508789, "rewards/margins": 0.18669869005680084, "rewards/rejected": -0.8252647519111633, "step": 6880 }, { "epoch": 1.1871123363197795, "grad_norm": 9.551141738891602, "learning_rate": 7.565111023970246e-08, "logits/chosen": -2.449681043624878, "logits/rejected": -2.4260666370391846, "logps/chosen": -110.6463851928711, "logps/rejected": -130.5448455810547, "loss": 0.6253, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5833606719970703, "rewards/margins": 0.19819436967372894, "rewards/rejected": -0.7815549969673157, "step": 6890 }, { "epoch": 1.1888352860096485, "grad_norm": 10.671778678894043, "learning_rate": 7.556501325649031e-08, "logits/chosen": -2.5089802742004395, "logits/rejected": -2.4887733459472656, "logps/chosen": -114.85318756103516, "logps/rejected": -131.3336639404297, "loss": 0.6367, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6129860281944275, "rewards/margins": 0.1659851223230362, "rewards/rejected": -0.7789711952209473, "step": 6900 }, { "epoch": 1.1888352860096485, "eval_logits/chosen": -2.582038640975952, "eval_logits/rejected": -2.5752716064453125, "eval_logps/chosen": -109.63968658447266, "eval_logps/rejected": -124.18799591064453, "eval_loss": 0.6560820937156677, "eval_rewards/accuracies": 0.6226765513420105, "eval_rewards/chosen": -0.5092779397964478, "eval_rewards/margins": 0.10080081969499588, "eval_rewards/rejected": -0.6100786924362183, "eval_runtime": 384.7967, "eval_samples_per_second": 11.185, "eval_steps_per_second": 1.398, "step": 6900 }, { "epoch": 1.1905582356995175, "grad_norm": 10.926170349121094, "learning_rate": 7.547881351717425e-08, "logits/chosen": -2.533719062805176, "logits/rejected": -2.510187864303589, "logps/chosen": -115.92164611816406, "logps/rejected": -137.22987365722656, "loss": 0.6079, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5903561115264893, "rewards/margins": 0.22987723350524902, "rewards/rejected": -0.8202333450317383, "step": 6910 }, { "epoch": 1.1922811853893867, "grad_norm": 11.420799255371094, "learning_rate": 7.539251136822582e-08, "logits/chosen": -2.585358142852783, "logits/rejected": -2.556082010269165, "logps/chosen": -120.77299499511719, "logps/rejected": -133.7059783935547, "loss": 0.6461, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6466901898384094, "rewards/margins": 0.152668297290802, "rewards/rejected": -0.7993584275245667, "step": 6920 }, { "epoch": 1.1940041350792556, "grad_norm": 10.871424674987793, "learning_rate": 7.530610715652816e-08, "logits/chosen": -2.4130659103393555, "logits/rejected": -2.386399269104004, "logps/chosen": -117.57035064697266, "logps/rejected": -136.52706909179688, "loss": 0.6113, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5993648767471313, "rewards/margins": 0.22338667511940002, "rewards/rejected": -0.822751522064209, "step": 6930 }, { "epoch": 1.1957270847691248, "grad_norm": 12.682785034179688, "learning_rate": 7.521960122937469e-08, "logits/chosen": -2.4112255573272705, "logits/rejected": -2.369144916534424, "logps/chosen": -115.7467269897461, "logps/rejected": -128.5149383544922, "loss": 0.6217, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5778379440307617, "rewards/margins": 0.20140111446380615, "rewards/rejected": -0.7792390584945679, "step": 6940 }, { "epoch": 1.1974500344589938, "grad_norm": 10.227527618408203, "learning_rate": 7.513299393446761e-08, "logits/chosen": -2.461789608001709, "logits/rejected": -2.4458439350128174, "logps/chosen": -113.69105529785156, "logps/rejected": -137.35147094726562, "loss": 0.6019, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5983427166938782, "rewards/margins": 0.2457897663116455, "rewards/rejected": -0.8441325426101685, "step": 6950 }, { "epoch": 1.1991729841488628, "grad_norm": 11.420378684997559, "learning_rate": 7.504628561991661e-08, "logits/chosen": -2.6247124671936035, "logits/rejected": -2.5894503593444824, "logps/chosen": -120.89681243896484, "logps/rejected": -127.87425231933594, "loss": 0.6431, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6557838916778564, "rewards/margins": 0.1451510190963745, "rewards/rejected": -0.800934910774231, "step": 6960 }, { "epoch": 1.200895933838732, "grad_norm": 14.405696868896484, "learning_rate": 7.495947663423736e-08, "logits/chosen": -2.536498546600342, "logits/rejected": -2.505995750427246, "logps/chosen": -115.50186920166016, "logps/rejected": -127.437744140625, "loss": 0.638, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6096006631851196, "rewards/margins": 0.1608787477016449, "rewards/rejected": -0.7704793214797974, "step": 6970 }, { "epoch": 1.202618883528601, "grad_norm": 10.212309837341309, "learning_rate": 7.487256732635024e-08, "logits/chosen": -2.447810411453247, "logits/rejected": -2.4197306632995605, "logps/chosen": -116.3142318725586, "logps/rejected": -138.9977569580078, "loss": 0.6058, "rewards/accuracies": 0.71875, "rewards/chosen": -0.620888888835907, "rewards/margins": 0.2390161007642746, "rewards/rejected": -0.8599050641059875, "step": 6980 }, { "epoch": 1.20434183321847, "grad_norm": 12.747852325439453, "learning_rate": 7.478555804557881e-08, "logits/chosen": -2.3737339973449707, "logits/rejected": -2.3627359867095947, "logps/chosen": -121.1943130493164, "logps/rejected": -136.52676391601562, "loss": 0.6327, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6421090364456177, "rewards/margins": 0.18053218722343445, "rewards/rejected": -0.8226412534713745, "step": 6990 }, { "epoch": 1.206064782908339, "grad_norm": 11.638018608093262, "learning_rate": 7.469844914164847e-08, "logits/chosen": -2.639355182647705, "logits/rejected": -2.6122663021087646, "logps/chosen": -127.60298156738281, "logps/rejected": -146.56190490722656, "loss": 0.6185, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7130860090255737, "rewards/margins": 0.2056291550397873, "rewards/rejected": -0.9187151789665222, "step": 7000 }, { "epoch": 1.206064782908339, "eval_logits/chosen": -2.570643186569214, "eval_logits/rejected": -2.5638396739959717, "eval_logps/chosen": -112.77349853515625, "eval_logps/rejected": -127.83332061767578, "eval_loss": 0.654921293258667, "eval_rewards/accuracies": 0.615938663482666, "eval_rewards/chosen": -0.540615975856781, "eval_rewards/margins": 0.10591594129800797, "eval_rewards/rejected": -0.6465319991111755, "eval_runtime": 385.0305, "eval_samples_per_second": 11.178, "eval_steps_per_second": 1.397, "step": 7000 }, { "epoch": 1.207787732598208, "grad_norm": 8.9988431930542, "learning_rate": 7.461124096468505e-08, "logits/chosen": -2.4724316596984863, "logits/rejected": -2.447662591934204, "logps/chosen": -122.56915283203125, "logps/rejected": -140.09811401367188, "loss": 0.6184, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.661439061164856, "rewards/margins": 0.21091802418231964, "rewards/rejected": -0.872357189655304, "step": 7010 }, { "epoch": 1.2095106822880772, "grad_norm": 11.723347663879395, "learning_rate": 7.45239338652134e-08, "logits/chosen": -2.4519741535186768, "logits/rejected": -2.427361011505127, "logps/chosen": -115.8144760131836, "logps/rejected": -134.84031677246094, "loss": 0.6275, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6447967886924744, "rewards/margins": 0.18002037703990936, "rewards/rejected": -0.8248171806335449, "step": 7020 }, { "epoch": 1.2112336319779462, "grad_norm": 12.742324829101562, "learning_rate": 7.443652819415603e-08, "logits/chosen": -2.495712995529175, "logits/rejected": -2.4790539741516113, "logps/chosen": -120.02806091308594, "logps/rejected": -140.04800415039062, "loss": 0.6333, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6692787408828735, "rewards/margins": 0.1800619661808014, "rewards/rejected": -0.8493407368659973, "step": 7030 }, { "epoch": 1.2129565816678154, "grad_norm": 16.276948928833008, "learning_rate": 7.434902430283154e-08, "logits/chosen": -2.4384918212890625, "logits/rejected": -2.4184823036193848, "logps/chosen": -122.02296447753906, "logps/rejected": -142.19448852539062, "loss": 0.6196, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6594618558883667, "rewards/margins": 0.1975087672472, "rewards/rejected": -0.8569706082344055, "step": 7040 }, { "epoch": 1.2146795313576844, "grad_norm": 12.45302963256836, "learning_rate": 7.426142254295343e-08, "logits/chosen": -2.4147567749023438, "logits/rejected": -2.396167755126953, "logps/chosen": -115.85501861572266, "logps/rejected": -136.3661346435547, "loss": 0.6283, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6439922451972961, "rewards/margins": 0.18588688969612122, "rewards/rejected": -0.8298791646957397, "step": 7050 }, { "epoch": 1.2164024810475533, "grad_norm": 12.847715377807617, "learning_rate": 7.417372326662845e-08, "logits/chosen": -2.522223472595215, "logits/rejected": -2.509165048599243, "logps/chosen": -123.62113952636719, "logps/rejected": -136.920166015625, "loss": 0.6512, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7041088342666626, "rewards/margins": 0.1359659731388092, "rewards/rejected": -0.8400747179985046, "step": 7060 }, { "epoch": 1.2181254307374225, "grad_norm": 14.532912254333496, "learning_rate": 7.408592682635546e-08, "logits/chosen": -2.4729976654052734, "logits/rejected": -2.449955701828003, "logps/chosen": -124.8624496459961, "logps/rejected": -133.46981811523438, "loss": 0.6754, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.7025941014289856, "rewards/margins": 0.09962520748376846, "rewards/rejected": -0.8022192716598511, "step": 7070 }, { "epoch": 1.2198483804272915, "grad_norm": 10.264069557189941, "learning_rate": 7.399803357502372e-08, "logits/chosen": -2.5410983562469482, "logits/rejected": -2.5128250122070312, "logps/chosen": -116.9005355834961, "logps/rejected": -135.8470001220703, "loss": 0.6236, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6343238949775696, "rewards/margins": 0.18449579179286957, "rewards/rejected": -0.8188197016716003, "step": 7080 }, { "epoch": 1.2215713301171607, "grad_norm": 10.06944465637207, "learning_rate": 7.391004386591171e-08, "logits/chosen": -2.5722641944885254, "logits/rejected": -2.5609748363494873, "logps/chosen": -120.7652359008789, "logps/rejected": -139.71237182617188, "loss": 0.6218, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6724736094474792, "rewards/margins": 0.20255060493946075, "rewards/rejected": -0.8750241994857788, "step": 7090 }, { "epoch": 1.2232942798070296, "grad_norm": 10.163381576538086, "learning_rate": 7.382195805268555e-08, "logits/chosen": -2.4425318241119385, "logits/rejected": -2.4224209785461426, "logps/chosen": -120.17268371582031, "logps/rejected": -139.46719360351562, "loss": 0.6226, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6309939622879028, "rewards/margins": 0.19519367814064026, "rewards/rejected": -0.8261876106262207, "step": 7100 }, { "epoch": 1.2232942798070296, "eval_logits/chosen": -2.5650668144226074, "eval_logits/rejected": -2.558187484741211, "eval_logps/chosen": -110.5578842163086, "eval_logps/rejected": -125.31090545654297, "eval_loss": 0.6558101177215576, "eval_rewards/accuracies": 0.6180297136306763, "eval_rewards/chosen": -0.5184599757194519, "eval_rewards/margins": 0.10284788906574249, "eval_rewards/rejected": -0.6213078498840332, "eval_runtime": 384.9063, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 7100 }, { "epoch": 1.2250172294968986, "grad_norm": 11.83450984954834, "learning_rate": 7.373377648939768e-08, "logits/chosen": -2.4836411476135254, "logits/rejected": -2.4502947330474854, "logps/chosen": -117.56773376464844, "logps/rejected": -125.57755279541016, "loss": 0.6629, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.640949010848999, "rewards/margins": 0.10499922186136246, "rewards/rejected": -0.7459481954574585, "step": 7110 }, { "epoch": 1.2267401791867678, "grad_norm": 13.437517166137695, "learning_rate": 7.364549953048537e-08, "logits/chosen": -2.473374843597412, "logits/rejected": -2.435924530029297, "logps/chosen": -125.98585510253906, "logps/rejected": -130.25930786132812, "loss": 0.6501, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6399961709976196, "rewards/margins": 0.13699987530708313, "rewards/rejected": -0.7769960761070251, "step": 7120 }, { "epoch": 1.2284631288766368, "grad_norm": 11.831195831298828, "learning_rate": 7.355712753076936e-08, "logits/chosen": -2.36934757232666, "logits/rejected": -2.34602952003479, "logps/chosen": -115.50764465332031, "logps/rejected": -130.68206787109375, "loss": 0.6298, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5916638970375061, "rewards/margins": 0.18337228894233704, "rewards/rejected": -0.775036096572876, "step": 7130 }, { "epoch": 1.230186078566506, "grad_norm": 10.17710018157959, "learning_rate": 7.346866084545236e-08, "logits/chosen": -2.466310977935791, "logits/rejected": -2.4604392051696777, "logps/chosen": -112.48873138427734, "logps/rejected": -131.60763549804688, "loss": 0.6309, "rewards/accuracies": 0.625, "rewards/chosen": -0.6117105484008789, "rewards/margins": 0.184920072555542, "rewards/rejected": -0.7966305613517761, "step": 7140 }, { "epoch": 1.231909028256375, "grad_norm": 12.351106643676758, "learning_rate": 7.338009983011769e-08, "logits/chosen": -2.5419418811798096, "logits/rejected": -2.520542621612549, "logps/chosen": -128.04127502441406, "logps/rejected": -141.21267700195312, "loss": 0.6477, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7165376543998718, "rewards/margins": 0.14469008147716522, "rewards/rejected": -0.861227810382843, "step": 7150 }, { "epoch": 1.233631977946244, "grad_norm": 11.851210594177246, "learning_rate": 7.329144484072778e-08, "logits/chosen": -2.459324359893799, "logits/rejected": -2.4257044792175293, "logps/chosen": -115.08448791503906, "logps/rejected": -135.9889678955078, "loss": 0.6183, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6142273545265198, "rewards/margins": 0.20488572120666504, "rewards/rejected": -0.8191129565238953, "step": 7160 }, { "epoch": 1.235354927636113, "grad_norm": 11.759284019470215, "learning_rate": 7.320269623362282e-08, "logits/chosen": -2.425865650177002, "logits/rejected": -2.399968147277832, "logps/chosen": -119.05157470703125, "logps/rejected": -134.5637969970703, "loss": 0.6395, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6348527669906616, "rewards/margins": 0.17269684374332428, "rewards/rejected": -0.8075494766235352, "step": 7170 }, { "epoch": 1.237077877325982, "grad_norm": 10.9734468460083, "learning_rate": 7.311385436551928e-08, "logits/chosen": -2.5507378578186035, "logits/rejected": -2.5361576080322266, "logps/chosen": -115.3531265258789, "logps/rejected": -133.952392578125, "loss": 0.6211, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6005090475082397, "rewards/margins": 0.19429241120815277, "rewards/rejected": -0.7948015928268433, "step": 7180 }, { "epoch": 1.2388008270158513, "grad_norm": 10.070911407470703, "learning_rate": 7.302491959350846e-08, "logits/chosen": -2.4009807109832764, "logits/rejected": -2.376857280731201, "logps/chosen": -114.873291015625, "logps/rejected": -136.6298828125, "loss": 0.6173, "rewards/accuracies": 0.6875, "rewards/chosen": -0.622027575969696, "rewards/margins": 0.20805366337299347, "rewards/rejected": -0.8300812840461731, "step": 7190 }, { "epoch": 1.2405237767057202, "grad_norm": 12.647407531738281, "learning_rate": 7.293589227505511e-08, "logits/chosen": -2.4105372428894043, "logits/rejected": -2.3920445442199707, "logps/chosen": -118.8134765625, "logps/rejected": -143.93020629882812, "loss": 0.6173, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6571611166000366, "rewards/margins": 0.2204003781080246, "rewards/rejected": -0.8775615692138672, "step": 7200 }, { "epoch": 1.2405237767057202, "eval_logits/chosen": -2.555715560913086, "eval_logits/rejected": -2.5487794876098633, "eval_logps/chosen": -111.71888732910156, "eval_logps/rejected": -126.75550079345703, "eval_loss": 0.6549857258796692, "eval_rewards/accuracies": 0.6161710023880005, "eval_rewards/chosen": -0.5300700068473816, "eval_rewards/margins": 0.10568376630544662, "eval_rewards/rejected": -0.6357537508010864, "eval_runtime": 384.6239, "eval_samples_per_second": 11.19, "eval_steps_per_second": 1.399, "step": 7200 }, { "epoch": 1.2422467263955892, "grad_norm": 12.543858528137207, "learning_rate": 7.284677276799593e-08, "logits/chosen": -2.519188642501831, "logits/rejected": -2.50075101852417, "logps/chosen": -127.06645202636719, "logps/rejected": -134.30532836914062, "loss": 0.6598, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7003774046897888, "rewards/margins": 0.11237999051809311, "rewards/rejected": -0.8127573728561401, "step": 7210 }, { "epoch": 1.2439696760854584, "grad_norm": 11.850963592529297, "learning_rate": 7.275756143053821e-08, "logits/chosen": -2.3988280296325684, "logits/rejected": -2.36625337600708, "logps/chosen": -120.89958190917969, "logps/rejected": -133.55238342285156, "loss": 0.6481, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6636959314346313, "rewards/margins": 0.1551842838525772, "rewards/rejected": -0.8188802003860474, "step": 7220 }, { "epoch": 1.2456926257753274, "grad_norm": 13.022244453430176, "learning_rate": 7.266825862125827e-08, "logits/chosen": -2.434527635574341, "logits/rejected": -2.4163756370544434, "logps/chosen": -121.3495101928711, "logps/rejected": -131.59068298339844, "loss": 0.6588, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6712813377380371, "rewards/margins": 0.11482469737529755, "rewards/rejected": -0.7861061096191406, "step": 7230 }, { "epoch": 1.2474155754651963, "grad_norm": 12.103440284729004, "learning_rate": 7.257886469910018e-08, "logits/chosen": -2.4661643505096436, "logits/rejected": -2.455108404159546, "logps/chosen": -122.23663330078125, "logps/rejected": -136.91957092285156, "loss": 0.6334, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6478984951972961, "rewards/margins": 0.1735813319683075, "rewards/rejected": -0.8214799165725708, "step": 7240 }, { "epoch": 1.2491385251550655, "grad_norm": 11.90157413482666, "learning_rate": 7.248938002337412e-08, "logits/chosen": -2.5145223140716553, "logits/rejected": -2.4867358207702637, "logps/chosen": -120.96724700927734, "logps/rejected": -132.59451293945312, "loss": 0.6513, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6639958620071411, "rewards/margins": 0.13340340554714203, "rewards/rejected": -0.7973993420600891, "step": 7250 }, { "epoch": 1.2508614748449345, "grad_norm": 10.555716514587402, "learning_rate": 7.239980495375518e-08, "logits/chosen": -2.5021207332611084, "logits/rejected": -2.4729690551757812, "logps/chosen": -116.6308822631836, "logps/rejected": -135.07986450195312, "loss": 0.6197, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6294509768486023, "rewards/margins": 0.1976228654384613, "rewards/rejected": -0.827073872089386, "step": 7260 }, { "epoch": 1.2525844245348035, "grad_norm": 10.42241096496582, "learning_rate": 7.231013985028168e-08, "logits/chosen": -2.5166265964508057, "logits/rejected": -2.488889455795288, "logps/chosen": -114.82917785644531, "logps/rejected": -129.36495971679688, "loss": 0.6349, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6051352024078369, "rewards/margins": 0.16216349601745605, "rewards/rejected": -0.767298698425293, "step": 7270 }, { "epoch": 1.2543073742246726, "grad_norm": 11.546476364135742, "learning_rate": 7.222038507335384e-08, "logits/chosen": -2.55112886428833, "logits/rejected": -2.518460988998413, "logps/chosen": -114.93461608886719, "logps/rejected": -131.73526000976562, "loss": 0.6166, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5836488008499146, "rewards/margins": 0.20460018515586853, "rewards/rejected": -0.7882489562034607, "step": 7280 }, { "epoch": 1.2560303239145416, "grad_norm": 11.173965454101562, "learning_rate": 7.213054098373232e-08, "logits/chosen": -2.355006694793701, "logits/rejected": -2.3428616523742676, "logps/chosen": -118.236083984375, "logps/rejected": -131.27276611328125, "loss": 0.6518, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6344476938247681, "rewards/margins": 0.11964921653270721, "rewards/rejected": -0.7540968656539917, "step": 7290 }, { "epoch": 1.2577532736044108, "grad_norm": 13.32795524597168, "learning_rate": 7.204060794253679e-08, "logits/chosen": -2.3103222846984863, "logits/rejected": -2.294548511505127, "logps/chosen": -109.36665344238281, "logps/rejected": -125.37540435791016, "loss": 0.6472, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5910590887069702, "rewards/margins": 0.14262047410011292, "rewards/rejected": -0.733679473400116, "step": 7300 }, { "epoch": 1.2577532736044108, "eval_logits/chosen": -2.5543453693389893, "eval_logits/rejected": -2.5474047660827637, "eval_logps/chosen": -108.9138412475586, "eval_logps/rejected": -123.72216033935547, "eval_loss": 0.6552937626838684, "eval_rewards/accuracies": 0.6196561455726624, "eval_rewards/chosen": -0.5020194053649902, "eval_rewards/margins": 0.10340093821287155, "eval_rewards/rejected": -0.6054202914237976, "eval_runtime": 384.5318, "eval_samples_per_second": 11.193, "eval_steps_per_second": 1.399, "step": 7300 }, { "epoch": 1.2594762232942798, "grad_norm": 11.045195579528809, "learning_rate": 7.195058631124443e-08, "logits/chosen": -2.4966492652893066, "logits/rejected": -2.4731640815734863, "logps/chosen": -118.0832290649414, "logps/rejected": -133.9816436767578, "loss": 0.6278, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6302419900894165, "rewards/margins": 0.1907169669866562, "rewards/rejected": -0.8209589719772339, "step": 7310 }, { "epoch": 1.2611991729841487, "grad_norm": 10.066198348999023, "learning_rate": 7.186047645168849e-08, "logits/chosen": -2.506199598312378, "logits/rejected": -2.4831595420837402, "logps/chosen": -114.11676025390625, "logps/rejected": -125.86553955078125, "loss": 0.6434, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5976320505142212, "rewards/margins": 0.14946410059928894, "rewards/rejected": -0.7470961809158325, "step": 7320 }, { "epoch": 1.262922122674018, "grad_norm": 10.430079460144043, "learning_rate": 7.177027872605686e-08, "logits/chosen": -2.4035518169403076, "logits/rejected": -2.3770573139190674, "logps/chosen": -111.19134521484375, "logps/rejected": -133.9873504638672, "loss": 0.6074, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5812875628471375, "rewards/margins": 0.24287056922912598, "rewards/rejected": -0.8241580724716187, "step": 7330 }, { "epoch": 1.264645072363887, "grad_norm": 10.417167663574219, "learning_rate": 7.167999349689062e-08, "logits/chosen": -2.474705696105957, "logits/rejected": -2.4504990577697754, "logps/chosen": -118.43119049072266, "logps/rejected": -132.00022888183594, "loss": 0.6498, "rewards/accuracies": 0.625, "rewards/chosen": -0.6305009722709656, "rewards/margins": 0.15034613013267517, "rewards/rejected": -0.7808471918106079, "step": 7340 }, { "epoch": 1.266368022053756, "grad_norm": 11.125683784484863, "learning_rate": 7.158962112708247e-08, "logits/chosen": -2.527188539505005, "logits/rejected": -2.500272274017334, "logps/chosen": -115.2213363647461, "logps/rejected": -129.96823120117188, "loss": 0.6225, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5830197930335999, "rewards/margins": 0.19897225499153137, "rewards/rejected": -0.7819920778274536, "step": 7350 }, { "epoch": 1.268090971743625, "grad_norm": 12.217016220092773, "learning_rate": 7.14991619798755e-08, "logits/chosen": -2.4353315830230713, "logits/rejected": -2.416670560836792, "logps/chosen": -116.96650695800781, "logps/rejected": -131.20217895507812, "loss": 0.6359, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6266859769821167, "rewards/margins": 0.16348426043987274, "rewards/rejected": -0.7901702523231506, "step": 7360 }, { "epoch": 1.269813921433494, "grad_norm": 11.18005084991455, "learning_rate": 7.140861641886148e-08, "logits/chosen": -2.3502328395843506, "logits/rejected": -2.3303933143615723, "logps/chosen": -112.33821868896484, "logps/rejected": -123.03433990478516, "loss": 0.6551, "rewards/accuracies": 0.65625, "rewards/chosen": -0.595317006111145, "rewards/margins": 0.12261603027582169, "rewards/rejected": -0.7179330587387085, "step": 7370 }, { "epoch": 1.2715368711233632, "grad_norm": 10.379005432128906, "learning_rate": 7.131798480797957e-08, "logits/chosen": -2.4382312297821045, "logits/rejected": -2.418172597885132, "logps/chosen": -113.05488586425781, "logps/rejected": -134.56137084960938, "loss": 0.6157, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5988710522651672, "rewards/margins": 0.20987263321876526, "rewards/rejected": -0.8087435960769653, "step": 7380 }, { "epoch": 1.2732598208132322, "grad_norm": 12.197848320007324, "learning_rate": 7.12272675115148e-08, "logits/chosen": -2.4099926948547363, "logits/rejected": -2.390993595123291, "logps/chosen": -112.275634765625, "logps/rejected": -130.8508758544922, "loss": 0.6311, "rewards/accuracies": 0.625, "rewards/chosen": -0.6006190180778503, "rewards/margins": 0.1776985228061676, "rewards/rejected": -0.7783175110816956, "step": 7390 }, { "epoch": 1.2749827705031014, "grad_norm": 10.072972297668457, "learning_rate": 7.113646489409654e-08, "logits/chosen": -2.407961368560791, "logits/rejected": -2.3715322017669678, "logps/chosen": -120.93656921386719, "logps/rejected": -131.6440887451172, "loss": 0.6388, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6548551321029663, "rewards/margins": 0.1558782309293747, "rewards/rejected": -0.8107333183288574, "step": 7400 }, { "epoch": 1.2749827705031014, "eval_logits/chosen": -2.548861265182495, "eval_logits/rejected": -2.541849136352539, "eval_logps/chosen": -108.55355834960938, "eval_logps/rejected": -123.39374542236328, "eval_loss": 0.6552349925041199, "eval_rewards/accuracies": 0.6205855011940002, "eval_rewards/chosen": -0.4984167218208313, "eval_rewards/margins": 0.10371941328048706, "eval_rewards/rejected": -0.6021361351013184, "eval_runtime": 385.1206, "eval_samples_per_second": 11.176, "eval_steps_per_second": 1.397, "step": 7400 }, { "epoch": 1.2767057201929704, "grad_norm": 13.564207077026367, "learning_rate": 7.104557732069722e-08, "logits/chosen": -2.385396718978882, "logits/rejected": -2.356330156326294, "logps/chosen": -113.54402160644531, "logps/rejected": -129.4213104248047, "loss": 0.6357, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5978765487670898, "rewards/margins": 0.1661193072795868, "rewards/rejected": -0.763995885848999, "step": 7410 }, { "epoch": 1.2784286698828393, "grad_norm": 10.676708221435547, "learning_rate": 7.09546051566306e-08, "logits/chosen": -2.3253355026245117, "logits/rejected": -2.3032877445220947, "logps/chosen": -113.17464447021484, "logps/rejected": -125.61772155761719, "loss": 0.6386, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5700669884681702, "rewards/margins": 0.15543384850025177, "rewards/rejected": -0.7255008816719055, "step": 7420 }, { "epoch": 1.2801516195727085, "grad_norm": 12.811087608337402, "learning_rate": 7.086354876755058e-08, "logits/chosen": -2.375821590423584, "logits/rejected": -2.348492383956909, "logps/chosen": -118.81611633300781, "logps/rejected": -135.79129028320312, "loss": 0.6283, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.654118537902832, "rewards/margins": 0.18502214550971985, "rewards/rejected": -0.8391407132148743, "step": 7430 }, { "epoch": 1.2818745692625775, "grad_norm": 10.5494384765625, "learning_rate": 7.07724085194495e-08, "logits/chosen": -2.4655237197875977, "logits/rejected": -2.4369113445281982, "logps/chosen": -124.45361328125, "logps/rejected": -138.9933624267578, "loss": 0.6364, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6844878196716309, "rewards/margins": 0.17180296778678894, "rewards/rejected": -0.8562908172607422, "step": 7440 }, { "epoch": 1.2835975189524467, "grad_norm": 11.717615127563477, "learning_rate": 7.068118477865677e-08, "logits/chosen": -2.541541337966919, "logits/rejected": -2.515986204147339, "logps/chosen": -117.92220306396484, "logps/rejected": -131.0966033935547, "loss": 0.6397, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6199640035629272, "rewards/margins": 0.16493521630764008, "rewards/rejected": -0.7848992347717285, "step": 7450 }, { "epoch": 1.2853204686423156, "grad_norm": 10.50747299194336, "learning_rate": 7.058987791183744e-08, "logits/chosen": -2.3575809001922607, "logits/rejected": -2.359462261199951, "logps/chosen": -109.51753234863281, "logps/rejected": -131.36456298828125, "loss": 0.6255, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5935528874397278, "rewards/margins": 0.19020214676856995, "rewards/rejected": -0.7837550640106201, "step": 7460 }, { "epoch": 1.2870434183321846, "grad_norm": 12.218599319458008, "learning_rate": 7.049848828599064e-08, "logits/chosen": -2.445612668991089, "logits/rejected": -2.4296865463256836, "logps/chosen": -115.34220123291016, "logps/rejected": -126.57014465332031, "loss": 0.6476, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6068757176399231, "rewards/margins": 0.1456916332244873, "rewards/rejected": -0.7525673508644104, "step": 7470 }, { "epoch": 1.2887663680220538, "grad_norm": 10.959244728088379, "learning_rate": 7.040701626844819e-08, "logits/chosen": -2.4036409854888916, "logits/rejected": -2.378356456756592, "logps/chosen": -113.02032470703125, "logps/rejected": -122.0857162475586, "loss": 0.6498, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5838802456855774, "rewards/margins": 0.144999697804451, "rewards/rejected": -0.7288798689842224, "step": 7480 }, { "epoch": 1.2904893177119228, "grad_norm": 11.797114372253418, "learning_rate": 7.031546222687296e-08, "logits/chosen": -2.3692786693573, "logits/rejected": -2.355942726135254, "logps/chosen": -115.99617767333984, "logps/rejected": -134.8970947265625, "loss": 0.6233, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6343568563461304, "rewards/margins": 0.18313702940940857, "rewards/rejected": -0.8174939155578613, "step": 7490 }, { "epoch": 1.292212267401792, "grad_norm": 10.20092487335205, "learning_rate": 7.022382652925766e-08, "logits/chosen": -2.3900606632232666, "logits/rejected": -2.3664214611053467, "logps/chosen": -112.93302917480469, "logps/rejected": -129.85130310058594, "loss": 0.641, "rewards/accuracies": 0.625, "rewards/chosen": -0.6145877838134766, "rewards/margins": 0.16414333879947662, "rewards/rejected": -0.7787311673164368, "step": 7500 }, { "epoch": 1.292212267401792, "eval_logits/chosen": -2.5403659343719482, "eval_logits/rejected": -2.5331685543060303, "eval_logps/chosen": -108.91471099853516, "eval_logps/rejected": -123.96131134033203, "eval_loss": 0.6543384194374084, "eval_rewards/accuracies": 0.6226765513420105, "eval_rewards/chosen": -0.5020280480384827, "eval_rewards/margins": 0.1057838574051857, "eval_rewards/rejected": -0.6078119277954102, "eval_runtime": 384.9736, "eval_samples_per_second": 11.18, "eval_steps_per_second": 1.397, "step": 7500 }, { "epoch": 1.293935217091661, "grad_norm": 11.690505027770996, "learning_rate": 7.01321095439231e-08, "logits/chosen": -2.420456647872925, "logits/rejected": -2.3924272060394287, "logps/chosen": -122.78355407714844, "logps/rejected": -127.64799499511719, "loss": 0.6574, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.655224621295929, "rewards/margins": 0.10904522240161896, "rewards/rejected": -0.7642698287963867, "step": 7510 }, { "epoch": 1.29565816678153, "grad_norm": 15.149540901184082, "learning_rate": 7.004031163951686e-08, "logits/chosen": -2.402318000793457, "logits/rejected": -2.3823370933532715, "logps/chosen": -110.96612548828125, "logps/rejected": -133.81349182128906, "loss": 0.6118, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5803836584091187, "rewards/margins": 0.2191765308380127, "rewards/rejected": -0.7995601892471313, "step": 7520 }, { "epoch": 1.297381116471399, "grad_norm": 13.324836730957031, "learning_rate": 6.994843318501175e-08, "logits/chosen": -2.3401191234588623, "logits/rejected": -2.3351352214813232, "logps/chosen": -111.37525939941406, "logps/rejected": -132.28050231933594, "loss": 0.6221, "rewards/accuracies": 0.625, "rewards/chosen": -0.5915793180465698, "rewards/margins": 0.1968701332807541, "rewards/rejected": -0.7884494066238403, "step": 7530 }, { "epoch": 1.299104066161268, "grad_norm": 10.166213035583496, "learning_rate": 6.985647454970436e-08, "logits/chosen": -2.498976469039917, "logits/rejected": -2.4916722774505615, "logps/chosen": -106.17533111572266, "logps/rejected": -129.9593048095703, "loss": 0.6058, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.546347975730896, "rewards/margins": 0.22648802399635315, "rewards/rejected": -0.7728360295295715, "step": 7540 }, { "epoch": 1.3008270158511372, "grad_norm": 11.016257286071777, "learning_rate": 6.976443610321355e-08, "logits/chosen": -2.4137473106384277, "logits/rejected": -2.395265579223633, "logps/chosen": -110.7983169555664, "logps/rejected": -132.8076171875, "loss": 0.613, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.590075671672821, "rewards/margins": 0.21986837685108185, "rewards/rejected": -0.8099439740180969, "step": 7550 }, { "epoch": 1.3025499655410062, "grad_norm": 10.684555053710938, "learning_rate": 6.9672318215479e-08, "logits/chosen": -2.5015499591827393, "logits/rejected": -2.480408191680908, "logps/chosen": -108.003173828125, "logps/rejected": -135.09716796875, "loss": 0.5994, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5542820692062378, "rewards/margins": 0.24306043982505798, "rewards/rejected": -0.7973425984382629, "step": 7560 }, { "epoch": 1.3042729152308752, "grad_norm": 11.643546104431152, "learning_rate": 6.958012125675961e-08, "logits/chosen": -2.482957363128662, "logits/rejected": -2.4652748107910156, "logps/chosen": -121.36790466308594, "logps/rejected": -137.83099365234375, "loss": 0.6165, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.645708441734314, "rewards/margins": 0.2100670039653778, "rewards/rejected": -0.8557754755020142, "step": 7570 }, { "epoch": 1.3059958649207444, "grad_norm": 11.516812324523926, "learning_rate": 6.948784559763221e-08, "logits/chosen": -2.4341461658477783, "logits/rejected": -2.4167208671569824, "logps/chosen": -111.94975280761719, "logps/rejected": -133.61476135253906, "loss": 0.6061, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5593603253364563, "rewards/margins": 0.2302013337612152, "rewards/rejected": -0.7895616292953491, "step": 7580 }, { "epoch": 1.3077188146106133, "grad_norm": 11.02752685546875, "learning_rate": 6.93954916089899e-08, "logits/chosen": -2.4580235481262207, "logits/rejected": -2.415940284729004, "logps/chosen": -126.49107360839844, "logps/rejected": -140.5691375732422, "loss": 0.6219, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6816823482513428, "rewards/margins": 0.20988862216472626, "rewards/rejected": -0.8915708661079407, "step": 7590 }, { "epoch": 1.3094417643004825, "grad_norm": 17.487403869628906, "learning_rate": 6.930305966204059e-08, "logits/chosen": -2.430729389190674, "logits/rejected": -2.4047584533691406, "logps/chosen": -122.69892883300781, "logps/rejected": -127.8052978515625, "loss": 0.6721, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7016171216964722, "rewards/margins": 0.09047193080186844, "rewards/rejected": -0.7920891046524048, "step": 7600 }, { "epoch": 1.3094417643004825, "eval_logits/chosen": -2.5224409103393555, "eval_logits/rejected": -2.5152130126953125, "eval_logps/chosen": -111.57231903076172, "eval_logps/rejected": -127.06050872802734, "eval_loss": 0.653099536895752, "eval_rewards/accuracies": 0.6229089498519897, "eval_rewards/chosen": -0.5286041498184204, "eval_rewards/margins": 0.1101997047662735, "eval_rewards/rejected": -0.6388038396835327, "eval_runtime": 384.8073, "eval_samples_per_second": 11.185, "eval_steps_per_second": 1.398, "step": 7600 }, { "epoch": 1.3111647139903515, "grad_norm": 9.191173553466797, "learning_rate": 6.921055012830563e-08, "logits/chosen": -2.3860549926757812, "logits/rejected": -2.3581323623657227, "logps/chosen": -115.19822692871094, "logps/rejected": -131.35890197753906, "loss": 0.6285, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6228688359260559, "rewards/margins": 0.1884930431842804, "rewards/rejected": -0.8113619089126587, "step": 7610 }, { "epoch": 1.3128876636802205, "grad_norm": 11.324193954467773, "learning_rate": 6.911796337961813e-08, "logits/chosen": -2.4442169666290283, "logits/rejected": -2.4186041355133057, "logps/chosen": -116.85527038574219, "logps/rejected": -128.7539825439453, "loss": 0.6391, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5934191942214966, "rewards/margins": 0.156190425157547, "rewards/rejected": -0.749609649181366, "step": 7620 }, { "epoch": 1.3146106133700897, "grad_norm": 12.662644386291504, "learning_rate": 6.902529978812159e-08, "logits/chosen": -2.374581813812256, "logits/rejected": -2.380875825881958, "logps/chosen": -107.82472229003906, "logps/rejected": -132.85824584960938, "loss": 0.6149, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5799170136451721, "rewards/margins": 0.20830205082893372, "rewards/rejected": -0.7882190942764282, "step": 7630 }, { "epoch": 1.3163335630599586, "grad_norm": 16.593372344970703, "learning_rate": 6.893255972626838e-08, "logits/chosen": -2.3311426639556885, "logits/rejected": -2.3094396591186523, "logps/chosen": -121.75987243652344, "logps/rejected": -139.49305725097656, "loss": 0.6076, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6384223699569702, "rewards/margins": 0.23241691291332245, "rewards/rejected": -0.8708394169807434, "step": 7640 }, { "epoch": 1.3180565127498278, "grad_norm": 10.331745147705078, "learning_rate": 6.883974356681823e-08, "logits/chosen": -2.4536070823669434, "logits/rejected": -2.4292032718658447, "logps/chosen": -127.8359146118164, "logps/rejected": -144.3247833251953, "loss": 0.628, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6943959593772888, "rewards/margins": 0.1910410076379776, "rewards/rejected": -0.88543701171875, "step": 7650 }, { "epoch": 1.3197794624396968, "grad_norm": 12.339012145996094, "learning_rate": 6.874685168283675e-08, "logits/chosen": -2.4251656532287598, "logits/rejected": -2.3903884887695312, "logps/chosen": -123.54502868652344, "logps/rejected": -140.67086791992188, "loss": 0.6265, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6651470065116882, "rewards/margins": 0.1938408464193344, "rewards/rejected": -0.8589879274368286, "step": 7660 }, { "epoch": 1.3215024121295658, "grad_norm": 20.665760040283203, "learning_rate": 6.865388444769388e-08, "logits/chosen": -2.3742659091949463, "logits/rejected": -2.3489937782287598, "logps/chosen": -121.5265884399414, "logps/rejected": -134.6374053955078, "loss": 0.6322, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6398226618766785, "rewards/margins": 0.17717327177524567, "rewards/rejected": -0.8169959187507629, "step": 7670 }, { "epoch": 1.323225361819435, "grad_norm": 10.402544021606445, "learning_rate": 6.856084223506247e-08, "logits/chosen": -2.4360992908477783, "logits/rejected": -2.418099880218506, "logps/chosen": -115.1739730834961, "logps/rejected": -134.08407592773438, "loss": 0.6179, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6453762054443359, "rewards/margins": 0.19838663935661316, "rewards/rejected": -0.8437628746032715, "step": 7680 }, { "epoch": 1.324948311509304, "grad_norm": 10.437387466430664, "learning_rate": 6.84677254189167e-08, "logits/chosen": -2.5248842239379883, "logits/rejected": -2.4808261394500732, "logps/chosen": -112.14786529541016, "logps/rejected": -129.3573760986328, "loss": 0.6082, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5716339349746704, "rewards/margins": 0.23138833045959473, "rewards/rejected": -0.8030222654342651, "step": 7690 }, { "epoch": 1.3266712611991731, "grad_norm": 9.979451179504395, "learning_rate": 6.837453437353064e-08, "logits/chosen": -2.3928892612457275, "logits/rejected": -2.3641133308410645, "logps/chosen": -117.5234146118164, "logps/rejected": -135.61276245117188, "loss": 0.6262, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6381297707557678, "rewards/margins": 0.18097563087940216, "rewards/rejected": -0.8191054463386536, "step": 7700 }, { "epoch": 1.3266712611991731, "eval_logits/chosen": -2.5058393478393555, "eval_logits/rejected": -2.4986164569854736, "eval_logps/chosen": -113.11471557617188, "eval_logps/rejected": -128.85545349121094, "eval_loss": 0.6528422236442566, "eval_rewards/accuracies": 0.6198884844779968, "eval_rewards/chosen": -0.544028103351593, "eval_rewards/margins": 0.11272517591714859, "eval_rewards/rejected": -0.6567533016204834, "eval_runtime": 385.2324, "eval_samples_per_second": 11.172, "eval_steps_per_second": 1.397, "step": 7700 }, { "epoch": 1.328394210889042, "grad_norm": 11.084187507629395, "learning_rate": 6.82812694734767e-08, "logits/chosen": -2.433046817779541, "logits/rejected": -2.418241500854492, "logps/chosen": -124.80726623535156, "logps/rejected": -139.28591918945312, "loss": 0.6453, "rewards/accuracies": 0.625, "rewards/chosen": -0.7323471307754517, "rewards/margins": 0.151248499751091, "rewards/rejected": -0.8835956454277039, "step": 7710 }, { "epoch": 1.330117160578911, "grad_norm": 11.728679656982422, "learning_rate": 6.818793109362416e-08, "logits/chosen": -2.407993793487549, "logits/rejected": -2.3756182193756104, "logps/chosen": -125.52972412109375, "logps/rejected": -138.90325927734375, "loss": 0.6249, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.693871796131134, "rewards/margins": 0.18704429268836975, "rewards/rejected": -0.8809159994125366, "step": 7720 }, { "epoch": 1.33184011026878, "grad_norm": 11.717412948608398, "learning_rate": 6.80945196091376e-08, "logits/chosen": -2.369554042816162, "logits/rejected": -2.348620891571045, "logps/chosen": -116.68568420410156, "logps/rejected": -134.80941772460938, "loss": 0.6193, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6276334524154663, "rewards/margins": 0.19776475429534912, "rewards/rejected": -0.8253981471061707, "step": 7730 }, { "epoch": 1.3335630599586492, "grad_norm": 11.634387969970703, "learning_rate": 6.800103539547548e-08, "logits/chosen": -2.360583782196045, "logits/rejected": -2.3449316024780273, "logps/chosen": -116.9215087890625, "logps/rejected": -142.9410858154297, "loss": 0.6056, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.652165949344635, "rewards/margins": 0.2394404113292694, "rewards/rejected": -0.891606330871582, "step": 7740 }, { "epoch": 1.3352860096485184, "grad_norm": 12.765972137451172, "learning_rate": 6.790747882838859e-08, "logits/chosen": -2.3764195442199707, "logits/rejected": -2.3468713760375977, "logps/chosen": -124.39608001708984, "logps/rejected": -145.98439025878906, "loss": 0.6159, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7045329809188843, "rewards/margins": 0.23335511982440948, "rewards/rejected": -0.9378880262374878, "step": 7750 }, { "epoch": 1.3370089593383874, "grad_norm": 11.26970100402832, "learning_rate": 6.781385028391851e-08, "logits/chosen": -2.279305934906006, "logits/rejected": -2.2602498531341553, "logps/chosen": -115.32120513916016, "logps/rejected": -141.68618774414062, "loss": 0.602, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6481447219848633, "rewards/margins": 0.2518182396888733, "rewards/rejected": -0.8999630212783813, "step": 7760 }, { "epoch": 1.3387319090282563, "grad_norm": 12.088203430175781, "learning_rate": 6.772015013839616e-08, "logits/chosen": -2.3379874229431152, "logits/rejected": -2.313225269317627, "logps/chosen": -117.2423095703125, "logps/rejected": -137.68634033203125, "loss": 0.6183, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6512611508369446, "rewards/margins": 0.20047590136528015, "rewards/rejected": -0.8517370223999023, "step": 7770 }, { "epoch": 1.3404548587181253, "grad_norm": 13.161063194274902, "learning_rate": 6.762637876844021e-08, "logits/chosen": -2.475210666656494, "logits/rejected": -2.4619956016540527, "logps/chosen": -123.12931823730469, "logps/rejected": -141.87379455566406, "loss": 0.644, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7159032821655273, "rewards/margins": 0.16397185623645782, "rewards/rejected": -0.8798750638961792, "step": 7780 }, { "epoch": 1.3421778084079945, "grad_norm": 13.581364631652832, "learning_rate": 6.753253655095565e-08, "logits/chosen": -2.4097869396209717, "logits/rejected": -2.3954663276672363, "logps/chosen": -120.80511474609375, "logps/rejected": -143.2562255859375, "loss": 0.6186, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6923469305038452, "rewards/margins": 0.22124560177326202, "rewards/rejected": -0.9135926365852356, "step": 7790 }, { "epoch": 1.3439007580978635, "grad_norm": 12.732964515686035, "learning_rate": 6.743862386313219e-08, "logits/chosen": -2.4334893226623535, "logits/rejected": -2.426264524459839, "logps/chosen": -124.7326431274414, "logps/rejected": -149.80714416503906, "loss": 0.6077, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7075215578079224, "rewards/margins": 0.2432362139225006, "rewards/rejected": -0.9507578015327454, "step": 7800 }, { "epoch": 1.3439007580978635, "eval_logits/chosen": -2.4897849559783936, "eval_logits/rejected": -2.4824118614196777, "eval_logps/chosen": -116.00698852539062, "eval_logps/rejected": -132.1913299560547, "eval_loss": 0.6519642472267151, "eval_rewards/accuracies": 0.6231412887573242, "eval_rewards/chosen": -0.5729508996009827, "eval_rewards/margins": 0.11716112494468689, "eval_rewards/rejected": -0.6901120543479919, "eval_runtime": 385.0102, "eval_samples_per_second": 11.179, "eval_steps_per_second": 1.397, "step": 7800 }, { "epoch": 1.3456237077877327, "grad_norm": 13.366806983947754, "learning_rate": 6.734464108244285e-08, "logits/chosen": -2.4520998001098633, "logits/rejected": -2.4238152503967285, "logps/chosen": -125.62376403808594, "logps/rejected": -140.81715393066406, "loss": 0.6252, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6660617589950562, "rewards/margins": 0.19454911351203918, "rewards/rejected": -0.8606107831001282, "step": 7810 }, { "epoch": 1.3473466574776016, "grad_norm": 9.556560516357422, "learning_rate": 6.725058858664234e-08, "logits/chosen": -2.4230477809906006, "logits/rejected": -2.39285945892334, "logps/chosen": -119.57745361328125, "logps/rejected": -143.25059509277344, "loss": 0.6111, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6321195960044861, "rewards/margins": 0.24726004898548126, "rewards/rejected": -0.8793797492980957, "step": 7820 }, { "epoch": 1.3490696071674706, "grad_norm": 13.65351390838623, "learning_rate": 6.715646675376557e-08, "logits/chosen": -2.3405568599700928, "logits/rejected": -2.328528642654419, "logps/chosen": -121.41092681884766, "logps/rejected": -146.38955688476562, "loss": 0.6198, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7161086797714233, "rewards/margins": 0.21350769698619843, "rewards/rejected": -0.9296164512634277, "step": 7830 }, { "epoch": 1.3507925568573398, "grad_norm": 10.71297836303711, "learning_rate": 6.70622759621262e-08, "logits/chosen": -2.3130688667297363, "logits/rejected": -2.2909352779388428, "logps/chosen": -123.2457275390625, "logps/rejected": -145.99148559570312, "loss": 0.6171, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6967824101448059, "rewards/margins": 0.2208057940006256, "rewards/rejected": -0.9175881147384644, "step": 7840 }, { "epoch": 1.3525155065472088, "grad_norm": 13.573526382446289, "learning_rate": 6.6968016590315e-08, "logits/chosen": -2.3070931434631348, "logits/rejected": -2.2753803730010986, "logps/chosen": -126.95500183105469, "logps/rejected": -137.7808380126953, "loss": 0.6349, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7137793302536011, "rewards/margins": 0.18026357889175415, "rewards/rejected": -0.8940428495407104, "step": 7850 }, { "epoch": 1.354238456237078, "grad_norm": 10.482673645019531, "learning_rate": 6.687368901719843e-08, "logits/chosen": -2.339378833770752, "logits/rejected": -2.304893970489502, "logps/chosen": -122.9904556274414, "logps/rejected": -149.17758178710938, "loss": 0.6005, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7009402513504028, "rewards/margins": 0.26430749893188477, "rewards/rejected": -0.9652477502822876, "step": 7860 }, { "epoch": 1.355961405926947, "grad_norm": 12.306436538696289, "learning_rate": 6.677929362191708e-08, "logits/chosen": -2.359908103942871, "logits/rejected": -2.343132734298706, "logps/chosen": -129.54879760742188, "logps/rejected": -149.89306640625, "loss": 0.6203, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7387486100196838, "rewards/margins": 0.21654868125915527, "rewards/rejected": -0.9552972912788391, "step": 7870 }, { "epoch": 1.3576843556168159, "grad_norm": 14.739594459533691, "learning_rate": 6.668483078388411e-08, "logits/chosen": -2.4242472648620605, "logits/rejected": -2.4089794158935547, "logps/chosen": -124.3792495727539, "logps/rejected": -142.53152465820312, "loss": 0.6336, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7126563191413879, "rewards/margins": 0.18753044307231903, "rewards/rejected": -0.9001868367195129, "step": 7880 }, { "epoch": 1.359407305306685, "grad_norm": 12.834400177001953, "learning_rate": 6.659030088278378e-08, "logits/chosen": -2.3692946434020996, "logits/rejected": -2.3493106365203857, "logps/chosen": -123.4371566772461, "logps/rejected": -146.53465270996094, "loss": 0.6169, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6906110048294067, "rewards/margins": 0.22222641110420227, "rewards/rejected": -0.9128373861312866, "step": 7890 }, { "epoch": 1.361130254996554, "grad_norm": 13.229230880737305, "learning_rate": 6.649570429856992e-08, "logits/chosen": -2.359175205230713, "logits/rejected": -2.355050563812256, "logps/chosen": -124.67671203613281, "logps/rejected": -145.9084014892578, "loss": 0.6293, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7250050902366638, "rewards/margins": 0.19758859276771545, "rewards/rejected": -0.9225937128067017, "step": 7900 }, { "epoch": 1.361130254996554, "eval_logits/chosen": -2.482358455657959, "eval_logits/rejected": -2.4748923778533936, "eval_logps/chosen": -117.40166473388672, "eval_logps/rejected": -133.91432189941406, "eval_loss": 0.6511279940605164, "eval_rewards/accuracies": 0.6233736276626587, "eval_rewards/chosen": -0.5868976712226868, "eval_rewards/margins": 0.1204444169998169, "eval_rewards/rejected": -0.7073420286178589, "eval_runtime": 384.8087, "eval_samples_per_second": 11.185, "eval_steps_per_second": 1.398, "step": 7900 }, { "epoch": 1.3628532046864232, "grad_norm": 12.742846488952637, "learning_rate": 6.640104141146439e-08, "logits/chosen": -2.361497402191162, "logits/rejected": -2.3384346961975098, "logps/chosen": -129.8594970703125, "logps/rejected": -147.49058532714844, "loss": 0.6284, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.730360746383667, "rewards/margins": 0.20804348587989807, "rewards/rejected": -0.9384042620658875, "step": 7910 }, { "epoch": 1.3645761543762922, "grad_norm": 12.77011775970459, "learning_rate": 6.630631260195548e-08, "logits/chosen": -2.3626182079315186, "logits/rejected": -2.3410229682922363, "logps/chosen": -125.41242980957031, "logps/rejected": -143.36672973632812, "loss": 0.618, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7059171795845032, "rewards/margins": 0.21313908696174622, "rewards/rejected": -0.9190564155578613, "step": 7920 }, { "epoch": 1.3662991040661612, "grad_norm": 16.295289993286133, "learning_rate": 6.621151825079657e-08, "logits/chosen": -2.409039258956909, "logits/rejected": -2.3817052841186523, "logps/chosen": -133.2943572998047, "logps/rejected": -148.67955017089844, "loss": 0.6208, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7495981454849243, "rewards/margins": 0.213208869099617, "rewards/rejected": -0.9628068208694458, "step": 7930 }, { "epoch": 1.3680220537560304, "grad_norm": 14.971701622009277, "learning_rate": 6.611665873900434e-08, "logits/chosen": -2.3166592121124268, "logits/rejected": -2.299065113067627, "logps/chosen": -131.53468322753906, "logps/rejected": -153.00302124023438, "loss": 0.6169, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7297437787055969, "rewards/margins": 0.24808356165885925, "rewards/rejected": -0.9778273701667786, "step": 7940 }, { "epoch": 1.3697450034458993, "grad_norm": 11.8698091506958, "learning_rate": 6.602173444785747e-08, "logits/chosen": -2.276665449142456, "logits/rejected": -2.266939401626587, "logps/chosen": -121.47349548339844, "logps/rejected": -146.16220092773438, "loss": 0.6206, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6996745467185974, "rewards/margins": 0.2292627990245819, "rewards/rejected": -0.9289374351501465, "step": 7950 }, { "epoch": 1.3714679531357685, "grad_norm": 11.140199661254883, "learning_rate": 6.5926745758895e-08, "logits/chosen": -2.2959096431732178, "logits/rejected": -2.2703421115875244, "logps/chosen": -122.07315826416016, "logps/rejected": -138.4790802001953, "loss": 0.6598, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7159412503242493, "rewards/margins": 0.14399950206279755, "rewards/rejected": -0.8599408864974976, "step": 7960 }, { "epoch": 1.3731909028256375, "grad_norm": 10.78872013092041, "learning_rate": 6.583169305391479e-08, "logits/chosen": -2.382143020629883, "logits/rejected": -2.353829860687256, "logps/chosen": -132.39743041992188, "logps/rejected": -145.7576904296875, "loss": 0.6497, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.7700007557868958, "rewards/margins": 0.16685865819454193, "rewards/rejected": -0.9368594288825989, "step": 7970 }, { "epoch": 1.3749138525155065, "grad_norm": 14.980616569519043, "learning_rate": 6.5736576714972e-08, "logits/chosen": -2.4327292442321777, "logits/rejected": -2.41796875, "logps/chosen": -126.8401870727539, "logps/rejected": -147.38278198242188, "loss": 0.6214, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6996058225631714, "rewards/margins": 0.21316389739513397, "rewards/rejected": -0.912769615650177, "step": 7980 }, { "epoch": 1.3766368022053757, "grad_norm": 12.41934871673584, "learning_rate": 6.564139712437761e-08, "logits/chosen": -2.4434096813201904, "logits/rejected": -2.429826021194458, "logps/chosen": -120.55796813964844, "logps/rejected": -144.46975708007812, "loss": 0.6174, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6989076733589172, "rewards/margins": 0.22920051217079163, "rewards/rejected": -0.9281080365180969, "step": 7990 }, { "epoch": 1.3783597518952446, "grad_norm": 12.667075157165527, "learning_rate": 6.554615466469677e-08, "logits/chosen": -2.31193470954895, "logits/rejected": -2.3006598949432373, "logps/chosen": -122.08732604980469, "logps/rejected": -148.1156768798828, "loss": 0.6065, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6756981611251831, "rewards/margins": 0.24303464591503143, "rewards/rejected": -0.9187329411506653, "step": 8000 }, { "epoch": 1.3783597518952446, "eval_logits/chosen": -2.4742701053619385, "eval_logits/rejected": -2.4666762351989746, "eval_logps/chosen": -118.02407836914062, "eval_logps/rejected": -134.84156799316406, "eval_loss": 0.6501578092575073, "eval_rewards/accuracies": 0.6236059665679932, "eval_rewards/chosen": -0.5931217670440674, "eval_rewards/margins": 0.12349271029233932, "eval_rewards/rejected": -0.7166144847869873, "eval_runtime": 384.3801, "eval_samples_per_second": 11.197, "eval_steps_per_second": 1.4, "step": 8000 }, { "epoch": 1.3800827015851138, "grad_norm": 12.12289047241211, "learning_rate": 6.545084971874738e-08, "logits/chosen": -2.3441965579986572, "logits/rejected": -2.321648120880127, "logps/chosen": -127.4809799194336, "logps/rejected": -140.70608520507812, "loss": 0.6295, "rewards/accuracies": 0.625, "rewards/chosen": -0.7032114267349243, "rewards/margins": 0.1796160191297531, "rewards/rejected": -0.8828274011611938, "step": 8010 }, { "epoch": 1.3818056512749828, "grad_norm": 13.993814468383789, "learning_rate": 6.535548266959845e-08, "logits/chosen": -2.357900381088257, "logits/rejected": -2.319896697998047, "logps/chosen": -138.2122802734375, "logps/rejected": -153.4851531982422, "loss": 0.6207, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7797445058822632, "rewards/margins": 0.20767898857593536, "rewards/rejected": -0.9874235391616821, "step": 8020 }, { "epoch": 1.3835286009648518, "grad_norm": 12.521170616149902, "learning_rate": 6.526005390056863e-08, "logits/chosen": -2.348358392715454, "logits/rejected": -2.332282066345215, "logps/chosen": -120.7018051147461, "logps/rejected": -146.21417236328125, "loss": 0.6144, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6955846548080444, "rewards/margins": 0.23238544166088104, "rewards/rejected": -0.9279701113700867, "step": 8030 }, { "epoch": 1.385251550654721, "grad_norm": 11.842948913574219, "learning_rate": 6.516456379522468e-08, "logits/chosen": -2.3112587928771973, "logits/rejected": -2.2816693782806396, "logps/chosen": -132.21774291992188, "logps/rejected": -151.22940063476562, "loss": 0.6294, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7779108285903931, "rewards/margins": 0.2141016721725464, "rewards/rejected": -0.9920123815536499, "step": 8040 }, { "epoch": 1.38697450034459, "grad_norm": 11.105095863342285, "learning_rate": 6.506901273737985e-08, "logits/chosen": -2.3775672912597656, "logits/rejected": -2.357567310333252, "logps/chosen": -126.63232421875, "logps/rejected": -151.7419891357422, "loss": 0.6118, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7152479887008667, "rewards/margins": 0.24371013045310974, "rewards/rejected": -0.9589580297470093, "step": 8050 }, { "epoch": 1.388697450034459, "grad_norm": 13.90049934387207, "learning_rate": 6.497340111109239e-08, "logits/chosen": -2.4168243408203125, "logits/rejected": -2.383380174636841, "logps/chosen": -132.6084442138672, "logps/rejected": -143.3878631591797, "loss": 0.6509, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7726985216140747, "rewards/margins": 0.16030731797218323, "rewards/rejected": -0.9330059289932251, "step": 8060 }, { "epoch": 1.390420399724328, "grad_norm": 12.728414535522461, "learning_rate": 6.4877729300664e-08, "logits/chosen": -2.3291175365448, "logits/rejected": -2.299983024597168, "logps/chosen": -128.56222534179688, "logps/rejected": -145.14614868164062, "loss": 0.6268, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7510396242141724, "rewards/margins": 0.20077459514141083, "rewards/rejected": -0.9518140554428101, "step": 8070 }, { "epoch": 1.392143349414197, "grad_norm": 11.73464298248291, "learning_rate": 6.478199769063833e-08, "logits/chosen": -2.3173067569732666, "logits/rejected": -2.310159683227539, "logps/chosen": -119.56437683105469, "logps/rejected": -149.2445831298828, "loss": 0.6054, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6657792329788208, "rewards/margins": 0.247043177485466, "rewards/rejected": -0.912822425365448, "step": 8080 }, { "epoch": 1.3938662991040662, "grad_norm": 10.838653564453125, "learning_rate": 6.468620666579927e-08, "logits/chosen": -2.296538829803467, "logits/rejected": -2.2700247764587402, "logps/chosen": -125.75146484375, "logps/rejected": -147.97433471679688, "loss": 0.6148, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7263309955596924, "rewards/margins": 0.23887792229652405, "rewards/rejected": -0.9652088284492493, "step": 8090 }, { "epoch": 1.3955892487939352, "grad_norm": 11.62055492401123, "learning_rate": 6.459035661116967e-08, "logits/chosen": -2.347628355026245, "logits/rejected": -2.33947491645813, "logps/chosen": -129.45924377441406, "logps/rejected": -152.16795349121094, "loss": 0.6328, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7537738084793091, "rewards/margins": 0.18697968125343323, "rewards/rejected": -0.9407534599304199, "step": 8100 }, { "epoch": 1.3955892487939352, "eval_logits/chosen": -2.463512659072876, "eval_logits/rejected": -2.4558422565460205, "eval_logps/chosen": -119.2177734375, "eval_logps/rejected": -136.2457275390625, "eval_loss": 0.6498830318450928, "eval_rewards/accuracies": 0.625464677810669, "eval_rewards/chosen": -0.6050586700439453, "eval_rewards/margins": 0.12559719383716583, "eval_rewards/rejected": -0.7306559085845947, "eval_runtime": 384.7739, "eval_samples_per_second": 11.186, "eval_steps_per_second": 1.398, "step": 8100 }, { "epoch": 1.3973121984838044, "grad_norm": 14.124561309814453, "learning_rate": 6.449444791200956e-08, "logits/chosen": -2.3686397075653076, "logits/rejected": -2.3351588249206543, "logps/chosen": -129.67294311523438, "logps/rejected": -143.42198181152344, "loss": 0.6305, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7361140251159668, "rewards/margins": 0.18766313791275024, "rewards/rejected": -0.9237771034240723, "step": 8110 }, { "epoch": 1.3990351481736734, "grad_norm": 13.09841251373291, "learning_rate": 6.43984809538147e-08, "logits/chosen": -2.3339853286743164, "logits/rejected": -2.314669132232666, "logps/chosen": -123.64161682128906, "logps/rejected": -142.0753936767578, "loss": 0.6276, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6953922510147095, "rewards/margins": 0.19653202593326569, "rewards/rejected": -0.8919242024421692, "step": 8120 }, { "epoch": 1.4007580978635423, "grad_norm": 12.324987411499023, "learning_rate": 6.430245612231501e-08, "logits/chosen": -2.3438968658447266, "logits/rejected": -2.328284740447998, "logps/chosen": -123.36360931396484, "logps/rejected": -140.27740478515625, "loss": 0.6233, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6854349374771118, "rewards/margins": 0.2113173007965088, "rewards/rejected": -0.8967521786689758, "step": 8130 }, { "epoch": 1.4024810475534115, "grad_norm": 13.211545944213867, "learning_rate": 6.420637380347304e-08, "logits/chosen": -2.3109488487243652, "logits/rejected": -2.287367343902588, "logps/chosen": -127.21601867675781, "logps/rejected": -149.110595703125, "loss": 0.6248, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7448909878730774, "rewards/margins": 0.21813924610614777, "rewards/rejected": -0.963030219078064, "step": 8140 }, { "epoch": 1.4042039972432805, "grad_norm": 12.219706535339355, "learning_rate": 6.41102343834824e-08, "logits/chosen": -2.3817532062530518, "logits/rejected": -2.361610174179077, "logps/chosen": -126.07899475097656, "logps/rejected": -148.6870880126953, "loss": 0.6296, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7393119931221008, "rewards/margins": 0.21682056784629822, "rewards/rejected": -0.9561325907707214, "step": 8150 }, { "epoch": 1.4059269469331497, "grad_norm": 10.935731887817383, "learning_rate": 6.40140382487662e-08, "logits/chosen": -2.3210196495056152, "logits/rejected": -2.3000967502593994, "logps/chosen": -128.22439575195312, "logps/rejected": -152.42929077148438, "loss": 0.5899, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.722113311290741, "rewards/margins": 0.27929848432540894, "rewards/rejected": -1.0014116764068604, "step": 8160 }, { "epoch": 1.4076498966230186, "grad_norm": 11.593744277954102, "learning_rate": 6.391778578597555e-08, "logits/chosen": -2.3887534141540527, "logits/rejected": -2.362985372543335, "logps/chosen": -129.02682495117188, "logps/rejected": -143.5245361328125, "loss": 0.6131, "rewards/accuracies": 0.625, "rewards/chosen": -0.6931599974632263, "rewards/margins": 0.22256305813789368, "rewards/rejected": -0.9157230257987976, "step": 8170 }, { "epoch": 1.4093728463128876, "grad_norm": 12.986204147338867, "learning_rate": 6.38214773819879e-08, "logits/chosen": -2.3697712421417236, "logits/rejected": -2.3512022495269775, "logps/chosen": -134.36557006835938, "logps/rejected": -148.2884063720703, "loss": 0.6457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7807331681251526, "rewards/margins": 0.1520390808582306, "rewards/rejected": -0.9327722787857056, "step": 8180 }, { "epoch": 1.4110957960027566, "grad_norm": 13.900282859802246, "learning_rate": 6.37251134239056e-08, "logits/chosen": -2.332498550415039, "logits/rejected": -2.309671640396118, "logps/chosen": -134.37588500976562, "logps/rejected": -147.74203491210938, "loss": 0.6498, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7639726996421814, "rewards/margins": 0.17482450604438782, "rewards/rejected": -0.9387971758842468, "step": 8190 }, { "epoch": 1.4128187456926258, "grad_norm": 16.447162628173828, "learning_rate": 6.362869429905431e-08, "logits/chosen": -2.3573856353759766, "logits/rejected": -2.326807975769043, "logps/chosen": -133.86642456054688, "logps/rejected": -147.4443817138672, "loss": 0.646, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7887129783630371, "rewards/margins": 0.16021788120269775, "rewards/rejected": -0.9489308595657349, "step": 8200 }, { "epoch": 1.4128187456926258, "eval_logits/chosen": -2.4600377082824707, "eval_logits/rejected": -2.452349901199341, "eval_logps/chosen": -118.73445892333984, "eval_logps/rejected": -135.823486328125, "eval_loss": 0.6494462490081787, "eval_rewards/accuracies": 0.6231412887573242, "eval_rewards/chosen": -0.6002256274223328, "eval_rewards/margins": 0.12620803713798523, "eval_rewards/rejected": -0.7264336943626404, "eval_runtime": 384.8988, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 8200 }, { "epoch": 1.414541695382495, "grad_norm": 15.098549842834473, "learning_rate": 6.353222039498136e-08, "logits/chosen": -2.246654510498047, "logits/rejected": -2.2284741401672363, "logps/chosen": -130.2065887451172, "logps/rejected": -152.4451141357422, "loss": 0.6213, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7510167956352234, "rewards/margins": 0.21188810467720032, "rewards/rejected": -0.9629047513008118, "step": 8210 }, { "epoch": 1.416264645072364, "grad_norm": 13.643304824829102, "learning_rate": 6.343569209945431e-08, "logits/chosen": -2.366328239440918, "logits/rejected": -2.3396661281585693, "logps/chosen": -124.73356628417969, "logps/rejected": -146.63670349121094, "loss": 0.6184, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7164047360420227, "rewards/margins": 0.21949002146720886, "rewards/rejected": -0.9358948469161987, "step": 8220 }, { "epoch": 1.417987594762233, "grad_norm": 11.130935668945312, "learning_rate": 6.333910980045932e-08, "logits/chosen": -2.334264039993286, "logits/rejected": -2.3184635639190674, "logps/chosen": -130.94566345214844, "logps/rejected": -144.80474853515625, "loss": 0.6406, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7569707036018372, "rewards/margins": 0.17768417298793793, "rewards/rejected": -0.9346548318862915, "step": 8230 }, { "epoch": 1.4197105444521019, "grad_norm": 15.7880859375, "learning_rate": 6.324247388619967e-08, "logits/chosen": -2.430142879486084, "logits/rejected": -2.40490984916687, "logps/chosen": -131.57620239257812, "logps/rejected": -146.419189453125, "loss": 0.6287, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7540255188941956, "rewards/margins": 0.19174614548683167, "rewards/rejected": -0.9457718133926392, "step": 8240 }, { "epoch": 1.421433494141971, "grad_norm": 11.681885719299316, "learning_rate": 6.314578474509403e-08, "logits/chosen": -2.3634536266326904, "logits/rejected": -2.3433897495269775, "logps/chosen": -127.7612533569336, "logps/rejected": -148.54034423828125, "loss": 0.6076, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7011187076568604, "rewards/margins": 0.24105024337768555, "rewards/rejected": -0.9421690106391907, "step": 8250 }, { "epoch": 1.42315644383184, "grad_norm": 12.538498878479004, "learning_rate": 6.30490427657751e-08, "logits/chosen": -2.4380857944488525, "logits/rejected": -2.418407917022705, "logps/chosen": -127.59770202636719, "logps/rejected": -154.81971740722656, "loss": 0.5984, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7118310928344727, "rewards/margins": 0.28354528546333313, "rewards/rejected": -0.9953764081001282, "step": 8260 }, { "epoch": 1.4248793935217092, "grad_norm": 13.778732299804688, "learning_rate": 6.295224833708792e-08, "logits/chosen": -2.421600103378296, "logits/rejected": -2.411262035369873, "logps/chosen": -126.8388900756836, "logps/rejected": -150.25302124023438, "loss": 0.6264, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7344324588775635, "rewards/margins": 0.21441522240638733, "rewards/rejected": -0.9488475918769836, "step": 8270 }, { "epoch": 1.4266023432115782, "grad_norm": 12.443196296691895, "learning_rate": 6.285540184808836e-08, "logits/chosen": -2.3350818157196045, "logits/rejected": -2.324326276779175, "logps/chosen": -130.43807983398438, "logps/rejected": -146.71829223632812, "loss": 0.649, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7806102633476257, "rewards/margins": 0.1578224152326584, "rewards/rejected": -0.9384326934814453, "step": 8280 }, { "epoch": 1.4283252929014472, "grad_norm": 11.999506950378418, "learning_rate": 6.275850368804156e-08, "logits/chosen": -2.312540292739868, "logits/rejected": -2.2675204277038574, "logps/chosen": -130.52099609375, "logps/rejected": -142.7451171875, "loss": 0.632, "rewards/accuracies": 0.625, "rewards/chosen": -0.7058475017547607, "rewards/margins": 0.1796690821647644, "rewards/rejected": -0.8855165243148804, "step": 8290 }, { "epoch": 1.4300482425913164, "grad_norm": 10.20169734954834, "learning_rate": 6.26615542464203e-08, "logits/chosen": -2.4780187606811523, "logits/rejected": -2.4557528495788574, "logps/chosen": -130.71356201171875, "logps/rejected": -147.21878051757812, "loss": 0.6384, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7313417792320251, "rewards/margins": 0.19554628431797028, "rewards/rejected": -0.9268879890441895, "step": 8300 }, { "epoch": 1.4300482425913164, "eval_logits/chosen": -2.456756830215454, "eval_logits/rejected": -2.4491126537323, "eval_logps/chosen": -116.86186218261719, "eval_logps/rejected": -133.69766235351562, "eval_loss": 0.6500291228294373, "eval_rewards/accuracies": 0.6233736276626587, "eval_rewards/chosen": -0.5814995765686035, "eval_rewards/margins": 0.1236758604645729, "eval_rewards/rejected": -0.7051754593849182, "eval_runtime": 384.8326, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 8300 }, { "epoch": 1.4317711922811853, "grad_norm": 14.733060836791992, "learning_rate": 6.256455391290352e-08, "logits/chosen": -2.2680420875549316, "logits/rejected": -2.2394232749938965, "logps/chosen": -121.70494079589844, "logps/rejected": -136.4855499267578, "loss": 0.6266, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6543580293655396, "rewards/margins": 0.19896702468395233, "rewards/rejected": -0.8533250093460083, "step": 8310 }, { "epoch": 1.4334941419710545, "grad_norm": 12.264205932617188, "learning_rate": 6.246750307737468e-08, "logits/chosen": -2.3352932929992676, "logits/rejected": -2.3200485706329346, "logps/chosen": -116.69293212890625, "logps/rejected": -142.01824951171875, "loss": 0.6101, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6417948603630066, "rewards/margins": 0.23784181475639343, "rewards/rejected": -0.8796366453170776, "step": 8320 }, { "epoch": 1.4352170916609235, "grad_norm": 12.914230346679688, "learning_rate": 6.237040212992028e-08, "logits/chosen": -2.3522069454193115, "logits/rejected": -2.3421552181243896, "logps/chosen": -120.03816223144531, "logps/rejected": -143.6808319091797, "loss": 0.6358, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7142330408096313, "rewards/margins": 0.1839415431022644, "rewards/rejected": -0.8981746435165405, "step": 8330 }, { "epoch": 1.4369400413507925, "grad_norm": 12.226571083068848, "learning_rate": 6.227325146082817e-08, "logits/chosen": -2.4019618034362793, "logits/rejected": -2.386125087738037, "logps/chosen": -124.94401550292969, "logps/rejected": -142.0945587158203, "loss": 0.6286, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7050046324729919, "rewards/margins": 0.19932475686073303, "rewards/rejected": -0.9043294191360474, "step": 8340 }, { "epoch": 1.4386629910406616, "grad_norm": 11.546863555908203, "learning_rate": 6.217605146058612e-08, "logits/chosen": -2.247965097427368, "logits/rejected": -2.23158597946167, "logps/chosen": -129.00962829589844, "logps/rejected": -144.1426239013672, "loss": 0.6439, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.758385956287384, "rewards/margins": 0.1570947766304016, "rewards/rejected": -0.9154807329177856, "step": 8350 }, { "epoch": 1.4403859407305306, "grad_norm": 14.135172843933105, "learning_rate": 6.207880251988014e-08, "logits/chosen": -2.2395167350769043, "logits/rejected": -2.2052838802337646, "logps/chosen": -124.658935546875, "logps/rejected": -145.8957061767578, "loss": 0.6109, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7010485529899597, "rewards/margins": 0.23889681696891785, "rewards/rejected": -0.9399453997612, "step": 8360 }, { "epoch": 1.4421088904203998, "grad_norm": 13.458427429199219, "learning_rate": 6.198150502959296e-08, "logits/chosen": -2.31392502784729, "logits/rejected": -2.296114206314087, "logps/chosen": -126.8591079711914, "logps/rejected": -149.21900939941406, "loss": 0.6339, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7580282688140869, "rewards/margins": 0.1980302780866623, "rewards/rejected": -0.9560585021972656, "step": 8370 }, { "epoch": 1.4438318401102688, "grad_norm": 10.475207328796387, "learning_rate": 6.188415938080246e-08, "logits/chosen": -2.4301230907440186, "logits/rejected": -2.4103808403015137, "logps/chosen": -116.72225189208984, "logps/rejected": -143.2314910888672, "loss": 0.6103, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6548103094100952, "rewards/margins": 0.23603832721710205, "rewards/rejected": -0.8908486366271973, "step": 8380 }, { "epoch": 1.4455547898001377, "grad_norm": 10.290321350097656, "learning_rate": 6.178676596478007e-08, "logits/chosen": -2.3942151069641113, "logits/rejected": -2.3628439903259277, "logps/chosen": -123.85685729980469, "logps/rejected": -154.1118621826172, "loss": 0.5805, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.6716062426567078, "rewards/margins": 0.30842575430870056, "rewards/rejected": -0.9800319671630859, "step": 8390 }, { "epoch": 1.447277739490007, "grad_norm": 12.710296630859375, "learning_rate": 6.168932517298927e-08, "logits/chosen": -2.364626407623291, "logits/rejected": -2.343971014022827, "logps/chosen": -116.19096374511719, "logps/rejected": -138.34274291992188, "loss": 0.6173, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6588490605354309, "rewards/margins": 0.21676592528820038, "rewards/rejected": -0.8756150007247925, "step": 8400 }, { "epoch": 1.447277739490007, "eval_logits/chosen": -2.452606201171875, "eval_logits/rejected": -2.4449288845062256, "eval_logps/chosen": -115.48355865478516, "eval_logps/rejected": -132.14564514160156, "eval_loss": 0.650427520275116, "eval_rewards/accuracies": 0.6217471957206726, "eval_rewards/chosen": -0.567716658115387, "eval_rewards/margins": 0.12193868309259415, "eval_rewards/rejected": -0.6896553039550781, "eval_runtime": 384.8793, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 8400 }, { "epoch": 1.449000689179876, "grad_norm": 9.543499946594238, "learning_rate": 6.159183739708386e-08, "logits/chosen": -2.3137786388397217, "logits/rejected": -2.280810594558716, "logps/chosen": -126.6877670288086, "logps/rejected": -150.96441650390625, "loss": 0.5868, "rewards/accuracies": 0.75, "rewards/chosen": -0.6902732849121094, "rewards/margins": 0.3012274503707886, "rewards/rejected": -0.9915008544921875, "step": 8410 }, { "epoch": 1.450723638869745, "grad_norm": 12.1053466796875, "learning_rate": 6.149430302890658e-08, "logits/chosen": -2.208137035369873, "logits/rejected": -2.193657636642456, "logps/chosen": -118.8209228515625, "logps/rejected": -144.1538543701172, "loss": 0.6086, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6716912984848022, "rewards/margins": 0.2572110593318939, "rewards/rejected": -0.9289024472236633, "step": 8420 }, { "epoch": 1.452446588559614, "grad_norm": 14.762650489807129, "learning_rate": 6.139672246048741e-08, "logits/chosen": -2.325267791748047, "logits/rejected": -2.309227466583252, "logps/chosen": -122.99539947509766, "logps/rejected": -147.1749725341797, "loss": 0.6245, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6950401067733765, "rewards/margins": 0.2101527750492096, "rewards/rejected": -0.9051928520202637, "step": 8430 }, { "epoch": 1.454169538249483, "grad_norm": 12.852335929870605, "learning_rate": 6.129909608404203e-08, "logits/chosen": -2.348090410232544, "logits/rejected": -2.3340110778808594, "logps/chosen": -130.6591339111328, "logps/rejected": -144.08766174316406, "loss": 0.6466, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.758665144443512, "rewards/margins": 0.16690947115421295, "rewards/rejected": -0.9255746603012085, "step": 8440 }, { "epoch": 1.4558924879393522, "grad_norm": 12.755784034729004, "learning_rate": 6.120142429197024e-08, "logits/chosen": -2.233659267425537, "logits/rejected": -2.225896120071411, "logps/chosen": -119.9954605102539, "logps/rejected": -150.00758361816406, "loss": 0.6087, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7198360562324524, "rewards/margins": 0.25118693709373474, "rewards/rejected": -0.97102290391922, "step": 8450 }, { "epoch": 1.4576154376292212, "grad_norm": 13.98903751373291, "learning_rate": 6.110370747685437e-08, "logits/chosen": -2.3544061183929443, "logits/rejected": -2.328612804412842, "logps/chosen": -124.1575927734375, "logps/rejected": -148.77871704101562, "loss": 0.6106, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7135094404220581, "rewards/margins": 0.2369728982448578, "rewards/rejected": -0.9504823684692383, "step": 8460 }, { "epoch": 1.4593383873190904, "grad_norm": 13.045321464538574, "learning_rate": 6.100594603145774e-08, "logits/chosen": -2.326721668243408, "logits/rejected": -2.3012874126434326, "logps/chosen": -132.24061584472656, "logps/rejected": -147.90093994140625, "loss": 0.6372, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7617416381835938, "rewards/margins": 0.18581005930900574, "rewards/rejected": -0.9475516080856323, "step": 8470 }, { "epoch": 1.4610613370089593, "grad_norm": 14.153129577636719, "learning_rate": 6.090814034872306e-08, "logits/chosen": -2.3191442489624023, "logits/rejected": -2.2897255420684814, "logps/chosen": -126.59747314453125, "logps/rejected": -146.61880493164062, "loss": 0.6249, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.735602617263794, "rewards/margins": 0.214704230427742, "rewards/rejected": -0.9503068923950195, "step": 8480 }, { "epoch": 1.4627842866988283, "grad_norm": 9.504263877868652, "learning_rate": 6.08102908217708e-08, "logits/chosen": -2.376776933670044, "logits/rejected": -2.367931365966797, "logps/chosen": -124.80113220214844, "logps/rejected": -148.45089721679688, "loss": 0.6205, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7006114721298218, "rewards/margins": 0.21188035607337952, "rewards/rejected": -0.9124916791915894, "step": 8490 }, { "epoch": 1.4645072363886975, "grad_norm": 11.814811706542969, "learning_rate": 6.071239784389773e-08, "logits/chosen": -2.250884532928467, "logits/rejected": -2.235382318496704, "logps/chosen": -122.6833267211914, "logps/rejected": -146.9407958984375, "loss": 0.6041, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6837652921676636, "rewards/margins": 0.25118762254714966, "rewards/rejected": -0.9349529147148132, "step": 8500 }, { "epoch": 1.4645072363886975, "eval_logits/chosen": -2.4370453357696533, "eval_logits/rejected": -2.429211378097534, "eval_logps/chosen": -116.02777862548828, "eval_logps/rejected": -132.8700714111328, "eval_loss": 0.6501213908195496, "eval_rewards/accuracies": 0.6270910501480103, "eval_rewards/chosen": -0.5731588006019592, "eval_rewards/margins": 0.12374065071344376, "eval_rewards/rejected": -0.6968995332717896, "eval_runtime": 384.7144, "eval_samples_per_second": 11.188, "eval_steps_per_second": 1.398, "step": 8500 }, { "epoch": 1.4662301860785665, "grad_norm": 14.207695007324219, "learning_rate": 6.061446180857521e-08, "logits/chosen": -2.306429862976074, "logits/rejected": -2.2757408618927, "logps/chosen": -128.46165466308594, "logps/rejected": -148.2255096435547, "loss": 0.5998, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.706123948097229, "rewards/margins": 0.26086512207984924, "rewards/rejected": -0.9669890403747559, "step": 8510 }, { "epoch": 1.4679531357684357, "grad_norm": 12.93460750579834, "learning_rate": 6.051648310944766e-08, "logits/chosen": -2.2999918460845947, "logits/rejected": -2.270299196243286, "logps/chosen": -123.59654235839844, "logps/rejected": -141.2626190185547, "loss": 0.6218, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7002661824226379, "rewards/margins": 0.22540466487407684, "rewards/rejected": -0.9256709218025208, "step": 8520 }, { "epoch": 1.4696760854583046, "grad_norm": 12.019919395446777, "learning_rate": 6.041846214033103e-08, "logits/chosen": -2.278843641281128, "logits/rejected": -2.2614760398864746, "logps/chosen": -127.5106201171875, "logps/rejected": -141.7850341796875, "loss": 0.6428, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7362362742424011, "rewards/margins": 0.16734153032302856, "rewards/rejected": -0.9035778045654297, "step": 8530 }, { "epoch": 1.4713990351481736, "grad_norm": 11.9368314743042, "learning_rate": 6.032039929521118e-08, "logits/chosen": -2.4037058353424072, "logits/rejected": -2.388857841491699, "logps/chosen": -124.78189849853516, "logps/rejected": -141.38436889648438, "loss": 0.6389, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7008772492408752, "rewards/margins": 0.1686294674873352, "rewards/rejected": -0.8695065379142761, "step": 8540 }, { "epoch": 1.4731219848380428, "grad_norm": 11.869882583618164, "learning_rate": 6.02222949682422e-08, "logits/chosen": -2.3267855644226074, "logits/rejected": -2.3128583431243896, "logps/chosen": -126.34034729003906, "logps/rejected": -154.1063690185547, "loss": 0.605, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7250242233276367, "rewards/margins": 0.2627423405647278, "rewards/rejected": -0.9877665638923645, "step": 8550 }, { "epoch": 1.4748449345279118, "grad_norm": 12.881783485412598, "learning_rate": 6.0124149553745e-08, "logits/chosen": -2.4109718799591064, "logits/rejected": -2.3881163597106934, "logps/chosen": -126.77762603759766, "logps/rejected": -151.0612030029297, "loss": 0.6055, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7257034182548523, "rewards/margins": 0.2615645229816437, "rewards/rejected": -0.9872678518295288, "step": 8560 }, { "epoch": 1.476567884217781, "grad_norm": 14.819133758544922, "learning_rate": 6.002596344620556e-08, "logits/chosen": -2.271210193634033, "logits/rejected": -2.247753858566284, "logps/chosen": -127.88139343261719, "logps/rejected": -148.95095825195312, "loss": 0.6125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7352434396743774, "rewards/margins": 0.23391692340373993, "rewards/rejected": -0.969160258769989, "step": 8570 }, { "epoch": 1.47829083390765, "grad_norm": 12.75163745880127, "learning_rate": 5.992773704027354e-08, "logits/chosen": -2.3565399646759033, "logits/rejected": -2.3336920738220215, "logps/chosen": -120.94183349609375, "logps/rejected": -150.30868530273438, "loss": 0.5858, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.668055534362793, "rewards/margins": 0.28957587480545044, "rewards/rejected": -0.9576314687728882, "step": 8580 }, { "epoch": 1.480013783597519, "grad_norm": 11.414725303649902, "learning_rate": 5.982947073076041e-08, "logits/chosen": -2.362279176712036, "logits/rejected": -2.333160400390625, "logps/chosen": -122.26253509521484, "logps/rejected": -143.75381469726562, "loss": 0.599, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6526799201965332, "rewards/margins": 0.2510369122028351, "rewards/rejected": -0.9037168622016907, "step": 8590 }, { "epoch": 1.481736733287388, "grad_norm": 14.066166877746582, "learning_rate": 5.973116491263818e-08, "logits/chosen": -2.282959461212158, "logits/rejected": -2.2575316429138184, "logps/chosen": -130.51089477539062, "logps/rejected": -142.51197814941406, "loss": 0.6635, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7707261443138123, "rewards/margins": 0.1369166374206543, "rewards/rejected": -0.9076428413391113, "step": 8600 }, { "epoch": 1.481736733287388, "eval_logits/chosen": -2.42197847366333, "eval_logits/rejected": -2.414015293121338, "eval_logps/chosen": -118.88941192626953, "eval_logps/rejected": -136.21633911132812, "eval_loss": 0.6489555239677429, "eval_rewards/accuracies": 0.6252323389053345, "eval_rewards/chosen": -0.6017752289772034, "eval_rewards/margins": 0.1285870522260666, "eval_rewards/rejected": -0.7303622961044312, "eval_runtime": 384.9471, "eval_samples_per_second": 11.181, "eval_steps_per_second": 1.398, "step": 8600 }, { "epoch": 1.483459682977257, "grad_norm": 13.231621742248535, "learning_rate": 5.963281998103759e-08, "logits/chosen": -2.3086705207824707, "logits/rejected": -2.2862565517425537, "logps/chosen": -127.48872375488281, "logps/rejected": -146.39479064941406, "loss": 0.6252, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7334383726119995, "rewards/margins": 0.22494740784168243, "rewards/rejected": -0.9583857655525208, "step": 8610 }, { "epoch": 1.4851826326671262, "grad_norm": 11.913412094116211, "learning_rate": 5.953443633124658e-08, "logits/chosen": -2.2376344203948975, "logits/rejected": -2.223482370376587, "logps/chosen": -130.43106079101562, "logps/rejected": -141.7789306640625, "loss": 0.6526, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7565690279006958, "rewards/margins": 0.1487133800983429, "rewards/rejected": -0.9052823781967163, "step": 8620 }, { "epoch": 1.4869055823569952, "grad_norm": 13.142196655273438, "learning_rate": 5.9436014358708787e-08, "logits/chosen": -2.2200779914855957, "logits/rejected": -2.2072250843048096, "logps/chosen": -117.5044174194336, "logps/rejected": -143.3849639892578, "loss": 0.5982, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6579936742782593, "rewards/margins": 0.2552747428417206, "rewards/rejected": -0.9132683873176575, "step": 8630 }, { "epoch": 1.4886285320468642, "grad_norm": 13.552721977233887, "learning_rate": 5.933755445902177e-08, "logits/chosen": -2.3500521183013916, "logits/rejected": -2.3245906829833984, "logps/chosen": -130.78872680664062, "logps/rejected": -142.3094482421875, "loss": 0.6634, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7504715323448181, "rewards/margins": 0.144822359085083, "rewards/rejected": -0.8952938914299011, "step": 8640 }, { "epoch": 1.4903514817367332, "grad_norm": 13.302846908569336, "learning_rate": 5.9239057027935637e-08, "logits/chosen": -2.2644646167755127, "logits/rejected": -2.2434914112091064, "logps/chosen": -124.41485595703125, "logps/rejected": -145.83895874023438, "loss": 0.6182, "rewards/accuracies": 0.625, "rewards/chosen": -0.6946158409118652, "rewards/margins": 0.22304484248161316, "rewards/rejected": -0.9176605939865112, "step": 8650 }, { "epoch": 1.4920744314266023, "grad_norm": 13.048736572265625, "learning_rate": 5.914052246135127e-08, "logits/chosen": -2.2776613235473633, "logits/rejected": -2.262205123901367, "logps/chosen": -123.85188293457031, "logps/rejected": -147.48789978027344, "loss": 0.6113, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7048993706703186, "rewards/margins": 0.2527524530887604, "rewards/rejected": -0.9576517343521118, "step": 8660 }, { "epoch": 1.4937973811164715, "grad_norm": 10.656261444091797, "learning_rate": 5.904195115531892e-08, "logits/chosen": -2.370086431503296, "logits/rejected": -2.351233959197998, "logps/chosen": -125.11270904541016, "logps/rejected": -149.8583984375, "loss": 0.6147, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7311683893203735, "rewards/margins": 0.23954811692237854, "rewards/rejected": -0.9707163572311401, "step": 8670 }, { "epoch": 1.4955203308063405, "grad_norm": 15.064244270324707, "learning_rate": 5.894334350603637e-08, "logits/chosen": -2.3081626892089844, "logits/rejected": -2.301806926727295, "logps/chosen": -124.0490951538086, "logps/rejected": -141.0283660888672, "loss": 0.6485, "rewards/accuracies": 0.625, "rewards/chosen": -0.7337080836296082, "rewards/margins": 0.14873264729976654, "rewards/rejected": -0.8824406862258911, "step": 8680 }, { "epoch": 1.4972432804962095, "grad_norm": 13.30392074584961, "learning_rate": 5.8844699909847576e-08, "logits/chosen": -2.3321213722229004, "logits/rejected": -2.3070807456970215, "logps/chosen": -130.0882568359375, "logps/rejected": -136.86813354492188, "loss": 0.6656, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7455196380615234, "rewards/margins": 0.1400761902332306, "rewards/rejected": -0.8855957984924316, "step": 8690 }, { "epoch": 1.4989662301860784, "grad_norm": 11.760110855102539, "learning_rate": 5.8746020763240956e-08, "logits/chosen": -2.3360636234283447, "logits/rejected": -2.3041799068450928, "logps/chosen": -131.28819274902344, "logps/rejected": -144.1375732421875, "loss": 0.6377, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7479070425033569, "rewards/margins": 0.16704407334327698, "rewards/rejected": -0.9149511456489563, "step": 8700 }, { "epoch": 1.4989662301860784, "eval_logits/chosen": -2.4247467517852783, "eval_logits/rejected": -2.4167640209198, "eval_logps/chosen": -115.79859161376953, "eval_logps/rejected": -132.69505310058594, "eval_loss": 0.649876594543457, "eval_rewards/accuracies": 0.625464677810669, "eval_rewards/chosen": -0.5708669424057007, "eval_rewards/margins": 0.1242823377251625, "eval_rewards/rejected": -0.6951491832733154, "eval_runtime": 384.8579, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 8700 }, { "epoch": 1.5006891798759476, "grad_norm": 17.00632667541504, "learning_rate": 5.8647306462847814e-08, "logits/chosen": -2.2961337566375732, "logits/rejected": -2.283881664276123, "logps/chosen": -127.45805358886719, "logps/rejected": -142.31607055664062, "loss": 0.6607, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7462243437767029, "rewards/margins": 0.14547453820705414, "rewards/rejected": -0.891698956489563, "step": 8710 }, { "epoch": 1.5024121295658168, "grad_norm": 12.124114990234375, "learning_rate": 5.854855740544078e-08, "logits/chosen": -2.31331205368042, "logits/rejected": -2.2876925468444824, "logps/chosen": -124.9738540649414, "logps/rejected": -143.38754272460938, "loss": 0.6234, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6862626075744629, "rewards/margins": 0.20305463671684265, "rewards/rejected": -0.8893172144889832, "step": 8720 }, { "epoch": 1.5041350792556858, "grad_norm": 12.336860656738281, "learning_rate": 5.844977398793211e-08, "logits/chosen": -2.3217110633850098, "logits/rejected": -2.290769100189209, "logps/chosen": -120.9361572265625, "logps/rejected": -144.50311279296875, "loss": 0.6153, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6866485476493835, "rewards/margins": 0.23989896476268768, "rewards/rejected": -0.9265475273132324, "step": 8730 }, { "epoch": 1.5058580289455548, "grad_norm": 13.71493911743164, "learning_rate": 5.8350956607372284e-08, "logits/chosen": -2.2873072624206543, "logits/rejected": -2.2767817974090576, "logps/chosen": -123.29255676269531, "logps/rejected": -148.6192169189453, "loss": 0.6144, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7188848257064819, "rewards/margins": 0.23730933666229248, "rewards/rejected": -0.9561940431594849, "step": 8740 }, { "epoch": 1.5075809786354237, "grad_norm": 12.820222854614258, "learning_rate": 5.825210566094817e-08, "logits/chosen": -2.3319649696350098, "logits/rejected": -2.308412551879883, "logps/chosen": -121.77445220947266, "logps/rejected": -147.75155639648438, "loss": 0.6013, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6807633638381958, "rewards/margins": 0.2647384703159332, "rewards/rejected": -0.9455019235610962, "step": 8750 }, { "epoch": 1.509303928325293, "grad_norm": 10.883214950561523, "learning_rate": 5.8153221545981634e-08, "logits/chosen": -2.2729172706604004, "logits/rejected": -2.2636818885803223, "logps/chosen": -117.68426513671875, "logps/rejected": -145.3979034423828, "loss": 0.6089, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6618735790252686, "rewards/margins": 0.24038393795490265, "rewards/rejected": -0.9022574424743652, "step": 8760 }, { "epoch": 1.5110268780151621, "grad_norm": 12.661680221557617, "learning_rate": 5.805430465992783e-08, "logits/chosen": -2.313819169998169, "logits/rejected": -2.285834550857544, "logps/chosen": -127.67130279541016, "logps/rejected": -146.73910522460938, "loss": 0.6166, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7259591817855835, "rewards/margins": 0.24383072555065155, "rewards/rejected": -0.9697898030281067, "step": 8770 }, { "epoch": 1.512749827705031, "grad_norm": 12.35444164276123, "learning_rate": 5.795535540037364e-08, "logits/chosen": -2.3322689533233643, "logits/rejected": -2.32749342918396, "logps/chosen": -120.35646057128906, "logps/rejected": -150.2484588623047, "loss": 0.6025, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7126277089118958, "rewards/margins": 0.2553618252277374, "rewards/rejected": -0.967989444732666, "step": 8780 }, { "epoch": 1.5144727773949, "grad_norm": 15.579487800598145, "learning_rate": 5.785637416503607e-08, "logits/chosen": -2.3356261253356934, "logits/rejected": -2.3098630905151367, "logps/chosen": -128.7639617919922, "logps/rejected": -146.7914581298828, "loss": 0.6258, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7384016513824463, "rewards/margins": 0.2092152088880539, "rewards/rejected": -0.9476169347763062, "step": 8790 }, { "epoch": 1.516195727084769, "grad_norm": 15.657267570495605, "learning_rate": 5.7757361351760625e-08, "logits/chosen": -2.2593512535095215, "logits/rejected": -2.228543758392334, "logps/chosen": -124.63716125488281, "logps/rejected": -139.7908172607422, "loss": 0.6376, "rewards/accuracies": 0.625, "rewards/chosen": -0.6963804960250854, "rewards/margins": 0.17658205330371857, "rewards/rejected": -0.872962474822998, "step": 8800 }, { "epoch": 1.516195727084769, "eval_logits/chosen": -2.4155189990997314, "eval_logits/rejected": -2.4074251651763916, "eval_logps/chosen": -117.37520599365234, "eval_logps/rejected": -134.65060424804688, "eval_loss": 0.6488099098205566, "eval_rewards/accuracies": 0.6301115155220032, "eval_rewards/chosen": -0.586633026599884, "eval_rewards/margins": 0.12807168066501617, "eval_rewards/rejected": -0.7147047519683838, "eval_runtime": 384.7311, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 8800 }, { "epoch": 1.5179186767746382, "grad_norm": 14.576560020446777, "learning_rate": 5.765831735851978e-08, "logits/chosen": -2.365224599838257, "logits/rejected": -2.3419666290283203, "logps/chosen": -123.6998291015625, "logps/rejected": -150.27755737304688, "loss": 0.6105, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7273944616317749, "rewards/margins": 0.24932006001472473, "rewards/rejected": -0.9767144322395325, "step": 8810 }, { "epoch": 1.5196416264645074, "grad_norm": 12.565640449523926, "learning_rate": 5.7559242583411284e-08, "logits/chosen": -2.3421339988708496, "logits/rejected": -2.3210861682891846, "logps/chosen": -116.22010803222656, "logps/rejected": -141.04908752441406, "loss": 0.5995, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6579592823982239, "rewards/margins": 0.2547076642513275, "rewards/rejected": -0.9126667976379395, "step": 8820 }, { "epoch": 1.5213645761543764, "grad_norm": 13.293051719665527, "learning_rate": 5.746013742465665e-08, "logits/chosen": -2.1958701610565186, "logits/rejected": -2.1731865406036377, "logps/chosen": -125.70268249511719, "logps/rejected": -143.53817749023438, "loss": 0.6364, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7110669612884521, "rewards/margins": 0.19689084589481354, "rewards/rejected": -0.9079578518867493, "step": 8830 }, { "epoch": 1.5230875258442453, "grad_norm": 14.483805656433105, "learning_rate": 5.7361002280599503e-08, "logits/chosen": -2.2213780879974365, "logits/rejected": -2.207857608795166, "logps/chosen": -118.2529525756836, "logps/rejected": -146.90296936035156, "loss": 0.5992, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6590458154678345, "rewards/margins": 0.2720191776752472, "rewards/rejected": -0.9310649633407593, "step": 8840 }, { "epoch": 1.5248104755341143, "grad_norm": 11.819971084594727, "learning_rate": 5.726183754970397e-08, "logits/chosen": -2.3809990882873535, "logits/rejected": -2.3619656562805176, "logps/chosen": -119.54219818115234, "logps/rejected": -148.3084716796875, "loss": 0.6024, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6729179620742798, "rewards/margins": 0.26723653078079224, "rewards/rejected": -0.9401543736457825, "step": 8850 }, { "epoch": 1.5265334252239835, "grad_norm": 13.207598686218262, "learning_rate": 5.716264363055314e-08, "logits/chosen": -2.290774345397949, "logits/rejected": -2.2679219245910645, "logps/chosen": -126.52516174316406, "logps/rejected": -152.0948944091797, "loss": 0.6023, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7334404587745667, "rewards/margins": 0.2588955760002136, "rewards/rejected": -0.9923361539840698, "step": 8860 }, { "epoch": 1.5282563749138525, "grad_norm": 14.733277320861816, "learning_rate": 5.706342092184739e-08, "logits/chosen": -2.431640386581421, "logits/rejected": -2.3982081413269043, "logps/chosen": -129.1495361328125, "logps/rejected": -153.14715576171875, "loss": 0.6037, "rewards/accuracies": 0.6875, "rewards/chosen": -0.723796010017395, "rewards/margins": 0.2574194669723511, "rewards/rejected": -0.9812153577804565, "step": 8870 }, { "epoch": 1.5299793246037217, "grad_norm": 13.658443450927734, "learning_rate": 5.696416982240282e-08, "logits/chosen": -2.232278347015381, "logits/rejected": -2.207052707672119, "logps/chosen": -129.80697631835938, "logps/rejected": -151.1610565185547, "loss": 0.6217, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7635579109191895, "rewards/margins": 0.23256301879882812, "rewards/rejected": -0.9961209297180176, "step": 8880 }, { "epoch": 1.5317022742935906, "grad_norm": 15.977197647094727, "learning_rate": 5.686489073114965e-08, "logits/chosen": -2.2679660320281982, "logits/rejected": -2.242905855178833, "logps/chosen": -130.6324920654297, "logps/rejected": -151.64913940429688, "loss": 0.6077, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.7613813877105713, "rewards/margins": 0.24742183089256287, "rewards/rejected": -1.008803129196167, "step": 8890 }, { "epoch": 1.5334252239834596, "grad_norm": 11.662043571472168, "learning_rate": 5.676558404713061e-08, "logits/chosen": -2.313070297241211, "logits/rejected": -2.283228635787964, "logps/chosen": -130.65310668945312, "logps/rejected": -151.60250854492188, "loss": 0.6174, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7576700448989868, "rewards/margins": 0.22839370369911194, "rewards/rejected": -0.9860638380050659, "step": 8900 }, { "epoch": 1.5334252239834596, "eval_logits/chosen": -2.3969428539276123, "eval_logits/rejected": -2.3886935710906982, "eval_logps/chosen": -121.26499938964844, "eval_logps/rejected": -139.1249237060547, "eval_loss": 0.647824227809906, "eval_rewards/accuracies": 0.633596658706665, "eval_rewards/chosen": -0.625531017780304, "eval_rewards/margins": 0.13391713798046112, "eval_rewards/rejected": -0.7594481706619263, "eval_runtime": 384.916, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 8900 }, { "epoch": 1.5351481736733288, "grad_norm": 15.66219711303711, "learning_rate": 5.666625016949933e-08, "logits/chosen": -2.2808852195739746, "logits/rejected": -2.265544891357422, "logps/chosen": -136.09054565429688, "logps/rejected": -160.9842071533203, "loss": 0.6066, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8041488528251648, "rewards/margins": 0.2751007676124573, "rewards/rejected": -1.079249620437622, "step": 8910 }, { "epoch": 1.5368711233631978, "grad_norm": 11.078070640563965, "learning_rate": 5.656688949751875e-08, "logits/chosen": -2.374967575073242, "logits/rejected": -2.3424668312072754, "logps/chosen": -134.77926635742188, "logps/rejected": -159.73167419433594, "loss": 0.5952, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7773364782333374, "rewards/margins": 0.2988698482513428, "rewards/rejected": -1.0762063264846802, "step": 8920 }, { "epoch": 1.538594073053067, "grad_norm": 12.058008193969727, "learning_rate": 5.64675024305595e-08, "logits/chosen": -2.315626859664917, "logits/rejected": -2.2847824096679688, "logps/chosen": -132.51901245117188, "logps/rejected": -151.91177368164062, "loss": 0.6169, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7782979607582092, "rewards/margins": 0.23511436581611633, "rewards/rejected": -1.0134122371673584, "step": 8930 }, { "epoch": 1.540317022742936, "grad_norm": 13.10341739654541, "learning_rate": 5.6368089368098315e-08, "logits/chosen": -2.297457456588745, "logits/rejected": -2.278395175933838, "logps/chosen": -129.33663940429688, "logps/rejected": -153.09884643554688, "loss": 0.6152, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7370185852050781, "rewards/margins": 0.23606674373149872, "rewards/rejected": -0.9730854034423828, "step": 8940 }, { "epoch": 1.5420399724328049, "grad_norm": 16.191864013671875, "learning_rate": 5.626865070971638e-08, "logits/chosen": -2.1967902183532715, "logits/rejected": -2.196498394012451, "logps/chosen": -124.38032531738281, "logps/rejected": -149.29835510253906, "loss": 0.6273, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7027859091758728, "rewards/margins": 0.21845777332782745, "rewards/rejected": -0.9212436676025391, "step": 8950 }, { "epoch": 1.5437629221226739, "grad_norm": 15.123845100402832, "learning_rate": 5.616918685509783e-08, "logits/chosen": -2.3022170066833496, "logits/rejected": -2.274287223815918, "logps/chosen": -138.83013916015625, "logps/rejected": -166.82565307617188, "loss": 0.5934, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8442748785018921, "rewards/margins": 0.3067021369934082, "rewards/rejected": -1.1509768962860107, "step": 8960 }, { "epoch": 1.545485871812543, "grad_norm": 14.812797546386719, "learning_rate": 5.606969820402797e-08, "logits/chosen": -2.2684621810913086, "logits/rejected": -2.240384101867676, "logps/chosen": -131.76438903808594, "logps/rejected": -152.13082885742188, "loss": 0.6145, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7848045825958252, "rewards/margins": 0.22551974654197693, "rewards/rejected": -1.010324239730835, "step": 8970 }, { "epoch": 1.5472088215024122, "grad_norm": 17.189090728759766, "learning_rate": 5.597018515639189e-08, "logits/chosen": -2.348278522491455, "logits/rejected": -2.321652889251709, "logps/chosen": -140.35964965820312, "logps/rejected": -151.7267303466797, "loss": 0.6694, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.8587892651557922, "rewards/margins": 0.13147902488708496, "rewards/rejected": -0.990268349647522, "step": 8980 }, { "epoch": 1.5489317711922812, "grad_norm": 12.447453498840332, "learning_rate": 5.587064811217266e-08, "logits/chosen": -2.251849412918091, "logits/rejected": -2.233555793762207, "logps/chosen": -127.77113342285156, "logps/rejected": -151.226806640625, "loss": 0.6057, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7282143831253052, "rewards/margins": 0.24384987354278564, "rewards/rejected": -0.972064197063446, "step": 8990 }, { "epoch": 1.5506547208821502, "grad_norm": 15.851520538330078, "learning_rate": 5.577108747144983e-08, "logits/chosen": -2.3005168437957764, "logits/rejected": -2.2733874320983887, "logps/chosen": -136.76089477539062, "logps/rejected": -156.3783416748047, "loss": 0.6228, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7934644222259521, "rewards/margins": 0.24450144171714783, "rewards/rejected": -1.0379658937454224, "step": 9000 }, { "epoch": 1.5506547208821502, "eval_logits/chosen": -2.389760971069336, "eval_logits/rejected": -2.3814666271209717, "eval_logps/chosen": -121.16392517089844, "eval_logps/rejected": -139.05027770996094, "eval_loss": 0.6477538347244263, "eval_rewards/accuracies": 0.6291821599006653, "eval_rewards/chosen": -0.6245203018188477, "eval_rewards/margins": 0.13418106734752655, "eval_rewards/rejected": -0.7587013840675354, "eval_runtime": 384.9787, "eval_samples_per_second": 11.18, "eval_steps_per_second": 1.397, "step": 9000 }, { "epoch": 1.5523776705720191, "grad_norm": 12.177999496459961, "learning_rate": 5.567150363439779e-08, "logits/chosen": -2.274502992630005, "logits/rejected": -2.2511608600616455, "logps/chosen": -131.14321899414062, "logps/rejected": -147.08657836914062, "loss": 0.6363, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7550393342971802, "rewards/margins": 0.18728122115135193, "rewards/rejected": -0.9423205256462097, "step": 9010 }, { "epoch": 1.5541006202618883, "grad_norm": 15.318517684936523, "learning_rate": 5.557189700128414e-08, "logits/chosen": -2.2132270336151123, "logits/rejected": -2.19401216506958, "logps/chosen": -126.7429428100586, "logps/rejected": -155.0507354736328, "loss": 0.5865, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7281891107559204, "rewards/margins": 0.3131178319454193, "rewards/rejected": -1.0413068532943726, "step": 9020 }, { "epoch": 1.5558235699517575, "grad_norm": 16.416961669921875, "learning_rate": 5.547226797246817e-08, "logits/chosen": -2.2276523113250732, "logits/rejected": -2.229720115661621, "logps/chosen": -124.99691009521484, "logps/rejected": -150.36830139160156, "loss": 0.6195, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7384492754936218, "rewards/margins": 0.22708168625831604, "rewards/rejected": -0.9655309915542603, "step": 9030 }, { "epoch": 1.5575465196416265, "grad_norm": 18.206544876098633, "learning_rate": 5.53726169483991e-08, "logits/chosen": -2.2322733402252197, "logits/rejected": -2.213137626647949, "logps/chosen": -131.87644958496094, "logps/rejected": -150.65623474121094, "loss": 0.6415, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8070753812789917, "rewards/margins": 0.179480642080307, "rewards/rejected": -0.9865560531616211, "step": 9040 }, { "epoch": 1.5592694693314955, "grad_norm": 15.540668487548828, "learning_rate": 5.5272944329614656e-08, "logits/chosen": -2.2933497428894043, "logits/rejected": -2.274948835372925, "logps/chosen": -133.30960083007812, "logps/rejected": -152.50271606445312, "loss": 0.6359, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7955208420753479, "rewards/margins": 0.20508281886577606, "rewards/rejected": -1.0006035566329956, "step": 9050 }, { "epoch": 1.5609924190213644, "grad_norm": 14.85422134399414, "learning_rate": 5.517325051673928e-08, "logits/chosen": -2.2882003784179688, "logits/rejected": -2.26269268989563, "logps/chosen": -132.16358947753906, "logps/rejected": -153.45974731445312, "loss": 0.6208, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7745057344436646, "rewards/margins": 0.22806155681610107, "rewards/rejected": -1.0025672912597656, "step": 9060 }, { "epoch": 1.5627153687112336, "grad_norm": 12.666839599609375, "learning_rate": 5.5073535910482625e-08, "logits/chosen": -2.253187894821167, "logits/rejected": -2.233231544494629, "logps/chosen": -127.13151550292969, "logps/rejected": -158.2010040283203, "loss": 0.5877, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7256596088409424, "rewards/margins": 0.3083544373512268, "rewards/rejected": -1.0340139865875244, "step": 9070 }, { "epoch": 1.5644383184011028, "grad_norm": 12.527570724487305, "learning_rate": 5.4973800911637966e-08, "logits/chosen": -2.262953042984009, "logits/rejected": -2.2547607421875, "logps/chosen": -120.70619201660156, "logps/rejected": -149.33441162109375, "loss": 0.6, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6872548460960388, "rewards/margins": 0.2518579661846161, "rewards/rejected": -0.9391128420829773, "step": 9080 }, { "epoch": 1.5661612680909718, "grad_norm": 12.955934524536133, "learning_rate": 5.487404592108047e-08, "logits/chosen": -2.2345328330993652, "logits/rejected": -2.2013678550720215, "logps/chosen": -130.22364807128906, "logps/rejected": -145.46182250976562, "loss": 0.6256, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7495505213737488, "rewards/margins": 0.20091386139392853, "rewards/rejected": -0.9504643678665161, "step": 9090 }, { "epoch": 1.5678842177808407, "grad_norm": 15.007445335388184, "learning_rate": 5.477427133976573e-08, "logits/chosen": -2.2926173210144043, "logits/rejected": -2.267416477203369, "logps/chosen": -134.2117919921875, "logps/rejected": -149.1011199951172, "loss": 0.6372, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7741454839706421, "rewards/margins": 0.18176405131816864, "rewards/rejected": -0.9559095501899719, "step": 9100 }, { "epoch": 1.5678842177808407, "eval_logits/chosen": -2.3852388858795166, "eval_logits/rejected": -2.3769235610961914, "eval_logps/chosen": -120.74646759033203, "eval_logps/rejected": -138.56756591796875, "eval_loss": 0.6480221748352051, "eval_rewards/accuracies": 0.633596658706665, "eval_rewards/chosen": -0.6203456521034241, "eval_rewards/margins": 0.13352881371974945, "eval_rewards/rejected": -0.7538744807243347, "eval_runtime": 384.4403, "eval_samples_per_second": 11.195, "eval_steps_per_second": 1.399, "step": 9100 }, { "epoch": 1.5696071674707097, "grad_norm": 11.584162712097168, "learning_rate": 5.467447756872802e-08, "logits/chosen": -2.2915608882904053, "logits/rejected": -2.2689292430877686, "logps/chosen": -130.25265502929688, "logps/rejected": -150.4947509765625, "loss": 0.6184, "rewards/accuracies": 0.625, "rewards/chosen": -0.750041663646698, "rewards/margins": 0.22895345091819763, "rewards/rejected": -0.9789952039718628, "step": 9110 }, { "epoch": 1.571330117160579, "grad_norm": 17.13041114807129, "learning_rate": 5.457466500907877e-08, "logits/chosen": -2.3155112266540527, "logits/rejected": -2.2829627990722656, "logps/chosen": -133.50241088867188, "logps/rejected": -151.78570556640625, "loss": 0.6143, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7322681546211243, "rewards/margins": 0.23458734154701233, "rewards/rejected": -0.9668554067611694, "step": 9120 }, { "epoch": 1.573053066850448, "grad_norm": 12.453614234924316, "learning_rate": 5.447483406200496e-08, "logits/chosen": -2.2341463565826416, "logits/rejected": -2.2107033729553223, "logps/chosen": -130.88345336914062, "logps/rejected": -152.70944213867188, "loss": 0.6175, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7586455345153809, "rewards/margins": 0.22743931412696838, "rewards/rejected": -0.9860848188400269, "step": 9130 }, { "epoch": 1.574776016540317, "grad_norm": 12.045112609863281, "learning_rate": 5.437498512876741e-08, "logits/chosen": -2.302090883255005, "logits/rejected": -2.2535691261291504, "logps/chosen": -129.43870544433594, "logps/rejected": -152.5924072265625, "loss": 0.5892, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7215713262557983, "rewards/margins": 0.312619149684906, "rewards/rejected": -1.0341904163360596, "step": 9140 }, { "epoch": 1.576498966230186, "grad_norm": 16.659603118896484, "learning_rate": 5.427511861069932e-08, "logits/chosen": -2.286972999572754, "logits/rejected": -2.2577507495880127, "logps/chosen": -138.92745971679688, "logps/rejected": -164.51431274414062, "loss": 0.5967, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8060145378112793, "rewards/margins": 0.2943601608276367, "rewards/rejected": -1.100374698638916, "step": 9150 }, { "epoch": 1.578221915920055, "grad_norm": 11.802719116210938, "learning_rate": 5.417523490920448e-08, "logits/chosen": -2.294285297393799, "logits/rejected": -2.287755250930786, "logps/chosen": -121.30552673339844, "logps/rejected": -152.68124389648438, "loss": 0.5972, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7215561866760254, "rewards/margins": 0.2744077444076538, "rewards/rejected": -0.9959639310836792, "step": 9160 }, { "epoch": 1.5799448656099242, "grad_norm": 12.288626670837402, "learning_rate": 5.4075334425755824e-08, "logits/chosen": -2.306957960128784, "logits/rejected": -2.2740955352783203, "logps/chosen": -136.5259552001953, "logps/rejected": -158.4174346923828, "loss": 0.6083, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8168443441390991, "rewards/margins": 0.2726067900657654, "rewards/rejected": -1.0894510746002197, "step": 9170 }, { "epoch": 1.5816678152997934, "grad_norm": 12.652445793151855, "learning_rate": 5.397541756189369e-08, "logits/chosen": -2.281018018722534, "logits/rejected": -2.261849880218506, "logps/chosen": -135.01614379882812, "logps/rejected": -153.14230346679688, "loss": 0.6314, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8028523325920105, "rewards/margins": 0.2069278210401535, "rewards/rejected": -1.0097801685333252, "step": 9180 }, { "epoch": 1.5833907649896624, "grad_norm": 12.863187789916992, "learning_rate": 5.387548471922425e-08, "logits/chosen": -2.380601406097412, "logits/rejected": -2.3737282752990723, "logps/chosen": -135.6869659423828, "logps/rejected": -163.7108154296875, "loss": 0.6059, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8152972459793091, "rewards/margins": 0.25253671407699585, "rewards/rejected": -1.0678339004516602, "step": 9190 }, { "epoch": 1.5851137146795313, "grad_norm": 16.70528793334961, "learning_rate": 5.3775536299417957e-08, "logits/chosen": -2.280928611755371, "logits/rejected": -2.2582879066467285, "logps/chosen": -133.15750122070312, "logps/rejected": -160.739013671875, "loss": 0.6, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8030554056167603, "rewards/margins": 0.28822585940361023, "rewards/rejected": -1.0912811756134033, "step": 9200 }, { "epoch": 1.5851137146795313, "eval_logits/chosen": -2.3750526905059814, "eval_logits/rejected": -2.3665480613708496, "eval_logps/chosen": -122.71503448486328, "eval_logps/rejected": -140.8612060546875, "eval_loss": 0.6474426984786987, "eval_rewards/accuracies": 0.6328996419906616, "eval_rewards/chosen": -0.6400314569473267, "eval_rewards/margins": 0.13677936792373657, "eval_rewards/rejected": -0.7768107652664185, "eval_runtime": 384.9182, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 9200 }, { "epoch": 1.5868366643694003, "grad_norm": 17.34453010559082, "learning_rate": 5.3675572704207826e-08, "logits/chosen": -2.1768012046813965, "logits/rejected": -2.152116060256958, "logps/chosen": -137.04013061523438, "logps/rejected": -154.55587768554688, "loss": 0.6285, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8119314908981323, "rewards/margins": 0.21572165191173553, "rewards/rejected": -1.0276530981063843, "step": 9210 }, { "epoch": 1.5885596140592695, "grad_norm": 20.220067977905273, "learning_rate": 5.3575594335387876e-08, "logits/chosen": -2.260709047317505, "logits/rejected": -2.2382912635803223, "logps/chosen": -127.0832290649414, "logps/rejected": -149.2588348388672, "loss": 0.6224, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7706971764564514, "rewards/margins": 0.22640147805213928, "rewards/rejected": -0.9970986247062683, "step": 9220 }, { "epoch": 1.5902825637491387, "grad_norm": 13.270739555358887, "learning_rate": 5.347560159481153e-08, "logits/chosen": -2.1878116130828857, "logits/rejected": -2.170797824859619, "logps/chosen": -123.77239990234375, "logps/rejected": -152.74220275878906, "loss": 0.6037, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7446569204330444, "rewards/margins": 0.25421950221061707, "rewards/rejected": -0.9988763928413391, "step": 9230 }, { "epoch": 1.5920055134390076, "grad_norm": 14.129158020019531, "learning_rate": 5.337559488438994e-08, "logits/chosen": -2.2290263175964355, "logits/rejected": -2.2223076820373535, "logps/chosen": -135.3732147216797, "logps/rejected": -161.42129516601562, "loss": 0.6082, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7922304272651672, "rewards/margins": 0.27604490518569946, "rewards/rejected": -1.0682754516601562, "step": 9240 }, { "epoch": 1.5937284631288766, "grad_norm": 12.709400177001953, "learning_rate": 5.327557460609043e-08, "logits/chosen": -2.226959705352783, "logits/rejected": -2.2024128437042236, "logps/chosen": -128.5209197998047, "logps/rejected": -152.21353149414062, "loss": 0.6187, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7373493909835815, "rewards/margins": 0.2387053668498993, "rewards/rejected": -0.9760546684265137, "step": 9250 }, { "epoch": 1.5954514128187456, "grad_norm": 13.907978057861328, "learning_rate": 5.317554116193488e-08, "logits/chosen": -2.211817741394043, "logits/rejected": -2.1932473182678223, "logps/chosen": -131.46925354003906, "logps/rejected": -155.19528198242188, "loss": 0.6201, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8340648412704468, "rewards/margins": 0.22798161208629608, "rewards/rejected": -1.0620465278625488, "step": 9260 }, { "epoch": 1.5971743625086148, "grad_norm": 11.452183723449707, "learning_rate": 5.307549495399804e-08, "logits/chosen": -2.3337273597717285, "logits/rejected": -2.3031058311462402, "logps/chosen": -137.244384765625, "logps/rejected": -154.68031311035156, "loss": 0.6288, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8235111236572266, "rewards/margins": 0.20514735579490662, "rewards/rejected": -1.0286585092544556, "step": 9270 }, { "epoch": 1.598897312198484, "grad_norm": 15.080280303955078, "learning_rate": 5.2975436384406e-08, "logits/chosen": -2.273318290710449, "logits/rejected": -2.255495309829712, "logps/chosen": -127.97026062011719, "logps/rejected": -153.50143432617188, "loss": 0.6132, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7439999580383301, "rewards/margins": 0.2530801594257355, "rewards/rejected": -0.9970801472663879, "step": 9280 }, { "epoch": 1.600620261888353, "grad_norm": 11.744697570800781, "learning_rate": 5.287536585533453e-08, "logits/chosen": -2.1726717948913574, "logits/rejected": -2.1516425609588623, "logps/chosen": -126.49604797363281, "logps/rejected": -143.70533752441406, "loss": 0.63, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.718410849571228, "rewards/margins": 0.20998051762580872, "rewards/rejected": -0.9283913373947144, "step": 9290 }, { "epoch": 1.602343211578222, "grad_norm": 17.194292068481445, "learning_rate": 5.2775283769007464e-08, "logits/chosen": -2.2376160621643066, "logits/rejected": -2.2286219596862793, "logps/chosen": -137.44625854492188, "logps/rejected": -168.51844787597656, "loss": 0.5989, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8376291394233704, "rewards/margins": 0.2888772785663605, "rewards/rejected": -1.1265064477920532, "step": 9300 }, { "epoch": 1.602343211578222, "eval_logits/chosen": -2.366192579269409, "eval_logits/rejected": -2.3576066493988037, "eval_logps/chosen": -123.44913482666016, "eval_logps/rejected": -141.85430908203125, "eval_loss": 0.6468191742897034, "eval_rewards/accuracies": 0.634061336517334, "eval_rewards/chosen": -0.647372305393219, "eval_rewards/margins": 0.13936947286128998, "eval_rewards/rejected": -0.7867418527603149, "eval_runtime": 385.1922, "eval_samples_per_second": 11.174, "eval_steps_per_second": 1.397, "step": 9300 }, { "epoch": 1.6040661612680909, "grad_norm": 18.25770378112793, "learning_rate": 5.267519052769507e-08, "logits/chosen": -2.2578463554382324, "logits/rejected": -2.2324862480163574, "logps/chosen": -136.2218780517578, "logps/rejected": -155.37332153320312, "loss": 0.6203, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8247623443603516, "rewards/margins": 0.22520852088928223, "rewards/rejected": -1.0499707460403442, "step": 9310 }, { "epoch": 1.60578911095796, "grad_norm": 12.890725135803223, "learning_rate": 5.257508653371252e-08, "logits/chosen": -2.308851718902588, "logits/rejected": -2.287872791290283, "logps/chosen": -136.90524291992188, "logps/rejected": -167.29397583007812, "loss": 0.6064, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8607310056686401, "rewards/margins": 0.2680822014808655, "rewards/rejected": -1.1288131475448608, "step": 9320 }, { "epoch": 1.607512060647829, "grad_norm": 17.10941505432129, "learning_rate": 5.2474972189418096e-08, "logits/chosen": -2.2742292881011963, "logits/rejected": -2.2479586601257324, "logps/chosen": -139.09312438964844, "logps/rejected": -164.92210388183594, "loss": 0.6079, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8251506686210632, "rewards/margins": 0.27346938848495483, "rewards/rejected": -1.0986201763153076, "step": 9330 }, { "epoch": 1.6092350103376982, "grad_norm": 10.893423080444336, "learning_rate": 5.237484789721178e-08, "logits/chosen": -2.196669101715088, "logits/rejected": -2.1830501556396484, "logps/chosen": -136.19924926757812, "logps/rejected": -162.57736206054688, "loss": 0.6062, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8439007997512817, "rewards/margins": 0.2659851908683777, "rewards/rejected": -1.1098860502243042, "step": 9340 }, { "epoch": 1.6109579600275672, "grad_norm": 39.45381546020508, "learning_rate": 5.227471405953352e-08, "logits/chosen": -2.266451597213745, "logits/rejected": -2.2372560501098633, "logps/chosen": -128.46804809570312, "logps/rejected": -150.4097900390625, "loss": 0.6263, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7591797113418579, "rewards/margins": 0.22573132812976837, "rewards/rejected": -0.9849110841751099, "step": 9350 }, { "epoch": 1.6126809097174362, "grad_norm": 12.594921112060547, "learning_rate": 5.217457107886159e-08, "logits/chosen": -2.3132832050323486, "logits/rejected": -2.276333808898926, "logps/chosen": -134.48617553710938, "logps/rejected": -163.8741455078125, "loss": 0.58, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7780380249023438, "rewards/margins": 0.3349377512931824, "rewards/rejected": -1.112975835800171, "step": 9360 }, { "epoch": 1.6144038594073054, "grad_norm": 19.296052932739258, "learning_rate": 5.207441935771104e-08, "logits/chosen": -2.3226866722106934, "logits/rejected": -2.286971092224121, "logps/chosen": -132.7664337158203, "logps/rejected": -156.07736206054688, "loss": 0.6085, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7892715930938721, "rewards/margins": 0.257493793964386, "rewards/rejected": -1.0467654466629028, "step": 9370 }, { "epoch": 1.6161268090971743, "grad_norm": 16.97860336303711, "learning_rate": 5.197425929863204e-08, "logits/chosen": -2.3141379356384277, "logits/rejected": -2.297131061553955, "logps/chosen": -131.0404815673828, "logps/rejected": -155.17239379882812, "loss": 0.6155, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7663172483444214, "rewards/margins": 0.24784526228904724, "rewards/rejected": -1.014162540435791, "step": 9380 }, { "epoch": 1.6178497587870435, "grad_norm": 13.652398109436035, "learning_rate": 5.1874091304208314e-08, "logits/chosen": -2.129861831665039, "logits/rejected": -2.1121160984039307, "logps/chosen": -134.87527465820312, "logps/rejected": -164.6666259765625, "loss": 0.5966, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8177527189254761, "rewards/margins": 0.28078868985176086, "rewards/rejected": -1.098541498184204, "step": 9390 }, { "epoch": 1.6195727084769125, "grad_norm": 11.97217082977295, "learning_rate": 5.17739157770554e-08, "logits/chosen": -2.208547353744507, "logits/rejected": -2.188955545425415, "logps/chosen": -132.96665954589844, "logps/rejected": -157.69369506835938, "loss": 0.614, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7889873385429382, "rewards/margins": 0.24523696303367615, "rewards/rejected": -1.034224271774292, "step": 9400 }, { "epoch": 1.6195727084769125, "eval_logits/chosen": -2.3499743938446045, "eval_logits/rejected": -2.34134840965271, "eval_logps/chosen": -126.9617919921875, "eval_logps/rejected": -145.97003173828125, "eval_loss": 0.6458662152290344, "eval_rewards/accuracies": 0.6368494629859924, "eval_rewards/chosen": -0.6824989914894104, "eval_rewards/margins": 0.1454000622034073, "eval_rewards/rejected": -0.8278990387916565, "eval_runtime": 384.4068, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.4, "step": 9400 }, { "epoch": 1.6212956581667815, "grad_norm": 16.661169052124023, "learning_rate": 5.167373311981922e-08, "logits/chosen": -2.2199814319610596, "logits/rejected": -2.2042629718780518, "logps/chosen": -137.41224670410156, "logps/rejected": -158.0041046142578, "loss": 0.6279, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8353851437568665, "rewards/margins": 0.22336220741271973, "rewards/rejected": -1.0587472915649414, "step": 9410 }, { "epoch": 1.6230186078566504, "grad_norm": 20.128929138183594, "learning_rate": 5.157354373517425e-08, "logits/chosen": -2.244189739227295, "logits/rejected": -2.226304531097412, "logps/chosen": -146.09263610839844, "logps/rejected": -159.396484375, "loss": 0.6482, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9105369448661804, "rewards/margins": 0.16151931881904602, "rewards/rejected": -1.0720561742782593, "step": 9420 }, { "epoch": 1.6247415575465196, "grad_norm": 12.772502899169922, "learning_rate": 5.147334802582208e-08, "logits/chosen": -2.212515115737915, "logits/rejected": -2.1966004371643066, "logps/chosen": -138.55496215820312, "logps/rejected": -162.17005920410156, "loss": 0.6292, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8612286448478699, "rewards/margins": 0.27113550901412964, "rewards/rejected": -1.13236403465271, "step": 9430 }, { "epoch": 1.6264645072363888, "grad_norm": 15.36139965057373, "learning_rate": 5.1373146394489706e-08, "logits/chosen": -2.201496124267578, "logits/rejected": -2.192960262298584, "logps/chosen": -132.10421752929688, "logps/rejected": -156.23837280273438, "loss": 0.6278, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8246947526931763, "rewards/margins": 0.213077113032341, "rewards/rejected": -1.0377719402313232, "step": 9440 }, { "epoch": 1.6281874569262578, "grad_norm": 15.164769172668457, "learning_rate": 5.127293924392787e-08, "logits/chosen": -2.3521695137023926, "logits/rejected": -2.3431594371795654, "logps/chosen": -142.69729614257812, "logps/rejected": -157.05572509765625, "loss": 0.6496, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8930834531784058, "rewards/margins": 0.17109383642673492, "rewards/rejected": -1.064177393913269, "step": 9450 }, { "epoch": 1.6299104066161267, "grad_norm": 13.118999481201172, "learning_rate": 5.117272697690961e-08, "logits/chosen": -2.2133584022521973, "logits/rejected": -2.204341173171997, "logps/chosen": -126.57929992675781, "logps/rejected": -172.01535034179688, "loss": 0.5373, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7432840466499329, "rewards/margins": 0.4366130232810974, "rewards/rejected": -1.1798970699310303, "step": 9460 }, { "epoch": 1.6316333563059957, "grad_norm": 14.453757286071777, "learning_rate": 5.10725099962284e-08, "logits/chosen": -2.0880303382873535, "logits/rejected": -2.0585885047912598, "logps/chosen": -134.8499755859375, "logps/rejected": -149.53753662109375, "loss": 0.6496, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8220484852790833, "rewards/margins": 0.16864094138145447, "rewards/rejected": -0.9906893968582153, "step": 9470 }, { "epoch": 1.633356305995865, "grad_norm": 15.361272811889648, "learning_rate": 5.0972288704696764e-08, "logits/chosen": -2.2112982273101807, "logits/rejected": -2.1769070625305176, "logps/chosen": -138.25120544433594, "logps/rejected": -162.27767944335938, "loss": 0.6102, "rewards/accuracies": 0.65625, "rewards/chosen": -0.842586874961853, "rewards/margins": 0.2536259591579437, "rewards/rejected": -1.0962128639221191, "step": 9480 }, { "epoch": 1.635079255685734, "grad_norm": 17.464826583862305, "learning_rate": 5.0872063505144494e-08, "logits/chosen": -2.1904773712158203, "logits/rejected": -2.1711699962615967, "logps/chosen": -136.94525146484375, "logps/rejected": -159.8241424560547, "loss": 0.6191, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8283279538154602, "rewards/margins": 0.2471480816602707, "rewards/rejected": -1.0754759311676025, "step": 9490 }, { "epoch": 1.636802205375603, "grad_norm": 13.781883239746094, "learning_rate": 5.077183480041711e-08, "logits/chosen": -2.2438812255859375, "logits/rejected": -2.2275278568267822, "logps/chosen": -132.5435791015625, "logps/rejected": -161.4524688720703, "loss": 0.596, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7889715433120728, "rewards/margins": 0.27482491731643677, "rewards/rejected": -1.0637964010238647, "step": 9500 }, { "epoch": 1.636802205375603, "eval_logits/chosen": -2.3420205116271973, "eval_logits/rejected": -2.33329439163208, "eval_logps/chosen": -126.80589294433594, "eval_logps/rejected": -145.8628387451172, "eval_loss": 0.6455623507499695, "eval_rewards/accuracies": 0.6368494629859924, "eval_rewards/chosen": -0.6809399724006653, "eval_rewards/margins": 0.14588727056980133, "eval_rewards/rejected": -0.8268271684646606, "eval_runtime": 384.5373, "eval_samples_per_second": 11.193, "eval_steps_per_second": 1.399, "step": 9500 }, { "epoch": 1.638525155065472, "grad_norm": 13.61595630645752, "learning_rate": 5.067160299337423e-08, "logits/chosen": -2.153742551803589, "logits/rejected": -2.1346375942230225, "logps/chosen": -130.6231689453125, "logps/rejected": -164.5575408935547, "loss": 0.5946, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7974013686180115, "rewards/margins": 0.30875450372695923, "rewards/rejected": -1.1061558723449707, "step": 9510 }, { "epoch": 1.640248104755341, "grad_norm": 11.904542922973633, "learning_rate": 5.0571368486887913e-08, "logits/chosen": -2.352456569671631, "logits/rejected": -2.336483955383301, "logps/chosen": -136.1502227783203, "logps/rejected": -172.6043243408203, "loss": 0.5912, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.83812415599823, "rewards/margins": 0.309414803981781, "rewards/rejected": -1.1475389003753662, "step": 9520 }, { "epoch": 1.6419710544452102, "grad_norm": 19.78743553161621, "learning_rate": 5.047113168384112e-08, "logits/chosen": -2.2551584243774414, "logits/rejected": -2.2285451889038086, "logps/chosen": -136.8931121826172, "logps/rejected": -168.810302734375, "loss": 0.58, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8399711847305298, "rewards/margins": 0.3392803370952606, "rewards/rejected": -1.1792514324188232, "step": 9530 }, { "epoch": 1.6436940041350794, "grad_norm": 14.243350982666016, "learning_rate": 5.037089298712597e-08, "logits/chosen": -2.1853389739990234, "logits/rejected": -2.1623167991638184, "logps/chosen": -138.184326171875, "logps/rejected": -166.1068572998047, "loss": 0.6025, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8406573534011841, "rewards/margins": 0.286348819732666, "rewards/rejected": -1.1270062923431396, "step": 9540 }, { "epoch": 1.6454169538249483, "grad_norm": 14.862560272216797, "learning_rate": 5.027065279964226e-08, "logits/chosen": -2.233996629714966, "logits/rejected": -2.2311766147613525, "logps/chosen": -134.91009521484375, "logps/rejected": -167.38851928710938, "loss": 0.6003, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8168676495552063, "rewards/margins": 0.28906482458114624, "rewards/rejected": -1.1059324741363525, "step": 9550 }, { "epoch": 1.6471399035148173, "grad_norm": 20.028427124023438, "learning_rate": 5.017041152429572e-08, "logits/chosen": -2.2922523021698, "logits/rejected": -2.279959201812744, "logps/chosen": -140.45962524414062, "logps/rejected": -167.86880493164062, "loss": 0.6137, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8683684468269348, "rewards/margins": 0.2797490954399109, "rewards/rejected": -1.1481176614761353, "step": 9560 }, { "epoch": 1.6488628532046863, "grad_norm": 14.364787101745605, "learning_rate": 5.00701695639965e-08, "logits/chosen": -2.190647840499878, "logits/rejected": -2.1755378246307373, "logps/chosen": -138.60610961914062, "logps/rejected": -165.51473999023438, "loss": 0.616, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8835409283638, "rewards/margins": 0.26500529050827026, "rewards/rejected": -1.1485462188720703, "step": 9570 }, { "epoch": 1.6505858028945555, "grad_norm": 17.80682945251465, "learning_rate": 4.99699273216575e-08, "logits/chosen": -2.2682061195373535, "logits/rejected": -2.248408794403076, "logps/chosen": -143.7680206298828, "logps/rejected": -171.72933959960938, "loss": 0.618, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8928016424179077, "rewards/margins": 0.2550438940525055, "rewards/rejected": -1.1478455066680908, "step": 9580 }, { "epoch": 1.6523087525844247, "grad_norm": 13.538311958312988, "learning_rate": 4.986968520019272e-08, "logits/chosen": -2.3411145210266113, "logits/rejected": -2.3234317302703857, "logps/chosen": -143.87705993652344, "logps/rejected": -164.1705780029297, "loss": 0.6401, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8778031468391418, "rewards/margins": 0.20079176127910614, "rewards/rejected": -1.0785949230194092, "step": 9590 }, { "epoch": 1.6540317022742936, "grad_norm": 16.569501876831055, "learning_rate": 4.9769443602515724e-08, "logits/chosen": -2.178515672683716, "logits/rejected": -2.1467490196228027, "logps/chosen": -144.49021911621094, "logps/rejected": -168.6632843017578, "loss": 0.6174, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8942978978157043, "rewards/margins": 0.26871371269226074, "rewards/rejected": -1.1630115509033203, "step": 9600 }, { "epoch": 1.6540317022742936, "eval_logits/chosen": -2.321179151535034, "eval_logits/rejected": -2.3122761249542236, "eval_logps/chosen": -130.85472106933594, "eval_logps/rejected": -150.51263427734375, "eval_loss": 0.6448329091072083, "eval_rewards/accuracies": 0.6363847851753235, "eval_rewards/chosen": -0.7214282751083374, "eval_rewards/margins": 0.1518966257572174, "eval_rewards/rejected": -0.8733248710632324, "eval_runtime": 384.8944, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 9600 }, { "epoch": 1.6557546519641626, "grad_norm": 15.439477920532227, "learning_rate": 4.9669202931537895e-08, "logits/chosen": -2.2018351554870605, "logits/rejected": -2.1835737228393555, "logps/chosen": -134.9109649658203, "logps/rejected": -164.3643035888672, "loss": 0.59, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.786026656627655, "rewards/margins": 0.3080291450023651, "rewards/rejected": -1.0940557718276978, "step": 9610 }, { "epoch": 1.6574776016540316, "grad_norm": 13.52280044555664, "learning_rate": 4.956896359016698e-08, "logits/chosen": -2.3169360160827637, "logits/rejected": -2.3045763969421387, "logps/chosen": -142.59091186523438, "logps/rejected": -162.2111358642578, "loss": 0.6391, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8800448179244995, "rewards/margins": 0.1880597174167633, "rewards/rejected": -1.0681045055389404, "step": 9620 }, { "epoch": 1.6592005513439008, "grad_norm": 21.575002670288086, "learning_rate": 4.946872598130531e-08, "logits/chosen": -2.2289185523986816, "logits/rejected": -2.200310230255127, "logps/chosen": -147.0320587158203, "logps/rejected": -169.76309204101562, "loss": 0.6192, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9157378077507019, "rewards/margins": 0.25152549147605896, "rewards/rejected": -1.1672632694244385, "step": 9630 }, { "epoch": 1.66092350103377, "grad_norm": 15.734440803527832, "learning_rate": 4.9368490507848285e-08, "logits/chosen": -2.2782468795776367, "logits/rejected": -2.25008225440979, "logps/chosen": -143.11343383789062, "logps/rejected": -161.559326171875, "loss": 0.6172, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8711656332015991, "rewards/margins": 0.23914751410484314, "rewards/rejected": -1.1103131771087646, "step": 9640 }, { "epoch": 1.662646450723639, "grad_norm": 16.812705993652344, "learning_rate": 4.926825757268276e-08, "logits/chosen": -2.171779155731201, "logits/rejected": -2.1505627632141113, "logps/chosen": -145.31686401367188, "logps/rejected": -163.62966918945312, "loss": 0.6393, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8983901739120483, "rewards/margins": 0.19798320531845093, "rewards/rejected": -1.0963733196258545, "step": 9650 }, { "epoch": 1.664369400413508, "grad_norm": 12.746070861816406, "learning_rate": 4.916802757868529e-08, "logits/chosen": -2.184086322784424, "logits/rejected": -2.1746981143951416, "logps/chosen": -136.50216674804688, "logps/rejected": -166.5528564453125, "loss": 0.6167, "rewards/accuracies": 0.65625, "rewards/chosen": -0.859176516532898, "rewards/margins": 0.269855797290802, "rewards/rejected": -1.1290323734283447, "step": 9660 }, { "epoch": 1.6660923501033769, "grad_norm": 17.654985427856445, "learning_rate": 4.906780092872069e-08, "logits/chosen": -2.2691900730133057, "logits/rejected": -2.2417221069335938, "logps/chosen": -137.96969604492188, "logps/rejected": -166.61886596679688, "loss": 0.5905, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8185309171676636, "rewards/margins": 0.3142274022102356, "rewards/rejected": -1.132758378982544, "step": 9670 }, { "epoch": 1.667815299793246, "grad_norm": 13.933897972106934, "learning_rate": 4.89675780256403e-08, "logits/chosen": -2.2214903831481934, "logits/rejected": -2.207895040512085, "logps/chosen": -138.79635620117188, "logps/rejected": -153.263427734375, "loss": 0.6545, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8432739973068237, "rewards/margins": 0.16740302741527557, "rewards/rejected": -1.0106770992279053, "step": 9680 }, { "epoch": 1.6695382494831152, "grad_norm": 20.195838928222656, "learning_rate": 4.886735927228044e-08, "logits/chosen": -2.1322407722473145, "logits/rejected": -2.1157989501953125, "logps/chosen": -142.687744140625, "logps/rejected": -161.8946533203125, "loss": 0.6318, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8685962557792664, "rewards/margins": 0.20423361659049988, "rewards/rejected": -1.0728299617767334, "step": 9690 }, { "epoch": 1.6712611991729842, "grad_norm": 18.60493278503418, "learning_rate": 4.876714507146066e-08, "logits/chosen": -2.18872332572937, "logits/rejected": -2.160177707672119, "logps/chosen": -137.0856475830078, "logps/rejected": -159.61163330078125, "loss": 0.6332, "rewards/accuracies": 0.625, "rewards/chosen": -0.8437908887863159, "rewards/margins": 0.22959363460540771, "rewards/rejected": -1.0733845233917236, "step": 9700 }, { "epoch": 1.6712611991729842, "eval_logits/chosen": -2.3232240676879883, "eval_logits/rejected": -2.314258337020874, "eval_logps/chosen": -127.7155990600586, "eval_logps/rejected": -146.987548828125, "eval_loss": 0.645205557346344, "eval_rewards/accuracies": 0.6356877088546753, "eval_rewards/chosen": -0.6900372505187988, "eval_rewards/margins": 0.14803703129291534, "eval_rewards/rejected": -0.8380742073059082, "eval_runtime": 384.3217, "eval_samples_per_second": 11.199, "eval_steps_per_second": 1.4, "step": 9700 }, { "epoch": 1.6729841488628532, "grad_norm": 19.854507446289062, "learning_rate": 4.86669358259823e-08, "logits/chosen": -2.06451153755188, "logits/rejected": -2.0499229431152344, "logps/chosen": -139.743896484375, "logps/rejected": -161.8704833984375, "loss": 0.6294, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.861153244972229, "rewards/margins": 0.2383410632610321, "rewards/rejected": -1.099494218826294, "step": 9710 }, { "epoch": 1.6747070985527222, "grad_norm": 13.128890991210938, "learning_rate": 4.856673193862677e-08, "logits/chosen": -2.228999376296997, "logits/rejected": -2.213212728500366, "logps/chosen": -137.44223022460938, "logps/rejected": -162.5154266357422, "loss": 0.6089, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8408370018005371, "rewards/margins": 0.2708285450935364, "rewards/rejected": -1.1116654872894287, "step": 9720 }, { "epoch": 1.6764300482425913, "grad_norm": 13.676172256469727, "learning_rate": 4.846653381215391e-08, "logits/chosen": -2.2127833366394043, "logits/rejected": -2.1972455978393555, "logps/chosen": -133.79367065429688, "logps/rejected": -164.43890380859375, "loss": 0.5991, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8005701303482056, "rewards/margins": 0.29943519830703735, "rewards/rejected": -1.1000052690505981, "step": 9730 }, { "epoch": 1.6781529979324605, "grad_norm": 15.458601951599121, "learning_rate": 4.836634184930043e-08, "logits/chosen": -2.2311646938323975, "logits/rejected": -2.2112231254577637, "logps/chosen": -139.5474395751953, "logps/rejected": -161.66346740722656, "loss": 0.6226, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8507331013679504, "rewards/margins": 0.2332516461610794, "rewards/rejected": -1.0839848518371582, "step": 9740 }, { "epoch": 1.6798759476223295, "grad_norm": 13.712337493896484, "learning_rate": 4.826615645277823e-08, "logits/chosen": -2.2344627380371094, "logits/rejected": -2.1938767433166504, "logps/chosen": -140.955810546875, "logps/rejected": -159.47604370117188, "loss": 0.6204, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8582791090011597, "rewards/margins": 0.24294321238994598, "rewards/rejected": -1.101222276687622, "step": 9750 }, { "epoch": 1.6815988973121985, "grad_norm": 14.223775863647461, "learning_rate": 4.8165978025272865e-08, "logits/chosen": -2.249579668045044, "logits/rejected": -2.2243008613586426, "logps/chosen": -137.7418975830078, "logps/rejected": -160.04269409179688, "loss": 0.6309, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8455856442451477, "rewards/margins": 0.2420284003019333, "rewards/rejected": -1.0876139402389526, "step": 9760 }, { "epoch": 1.6833218470020674, "grad_norm": 13.250173568725586, "learning_rate": 4.806580696944186e-08, "logits/chosen": -2.1560091972351074, "logits/rejected": -2.135222911834717, "logps/chosen": -136.96336364746094, "logps/rejected": -160.64523315429688, "loss": 0.6216, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.828553318977356, "rewards/margins": 0.23837676644325256, "rewards/rejected": -1.0669300556182861, "step": 9770 }, { "epoch": 1.6850447966919366, "grad_norm": 12.2053804397583, "learning_rate": 4.796564368791311e-08, "logits/chosen": -2.243837833404541, "logits/rejected": -2.200624465942383, "logps/chosen": -143.16171264648438, "logps/rejected": -164.6429443359375, "loss": 0.6002, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.822791576385498, "rewards/margins": 0.30756694078445435, "rewards/rejected": -1.1303584575653076, "step": 9780 }, { "epoch": 1.6867677463818056, "grad_norm": 14.09291934967041, "learning_rate": 4.786548858328325e-08, "logits/chosen": -2.2144553661346436, "logits/rejected": -2.2106902599334717, "logps/chosen": -136.1967010498047, "logps/rejected": -174.01950073242188, "loss": 0.5946, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8278509378433228, "rewards/margins": 0.33650845289230347, "rewards/rejected": -1.1643593311309814, "step": 9790 }, { "epoch": 1.6884906960716748, "grad_norm": 15.92001724243164, "learning_rate": 4.7765342058116057e-08, "logits/chosen": -2.2426822185516357, "logits/rejected": -2.2081751823425293, "logps/chosen": -141.9800262451172, "logps/rejected": -167.13998413085938, "loss": 0.6115, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8517776727676392, "rewards/margins": 0.2826196849346161, "rewards/rejected": -1.134397268295288, "step": 9800 }, { "epoch": 1.6884906960716748, "eval_logits/chosen": -2.3225085735321045, "eval_logits/rejected": -2.3134241104125977, "eval_logps/chosen": -127.55428314208984, "eval_logps/rejected": -146.86053466796875, "eval_loss": 0.6451530456542969, "eval_rewards/accuracies": 0.634061336517334, "eval_rewards/chosen": -0.688423752784729, "eval_rewards/margins": 0.1483803540468216, "eval_rewards/rejected": -0.8368041515350342, "eval_runtime": 384.4592, "eval_samples_per_second": 11.195, "eval_steps_per_second": 1.399, "step": 9800 }, { "epoch": 1.6902136457615438, "grad_norm": 14.963744163513184, "learning_rate": 4.766520451494082e-08, "logits/chosen": -2.178990125656128, "logits/rejected": -2.144641876220703, "logps/chosen": -138.94650268554688, "logps/rejected": -166.52066040039062, "loss": 0.6134, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8352736234664917, "rewards/margins": 0.29102885723114014, "rewards/rejected": -1.1263024806976318, "step": 9810 }, { "epoch": 1.6919365954514127, "grad_norm": 13.498551368713379, "learning_rate": 4.756507635625075e-08, "logits/chosen": -2.219660997390747, "logits/rejected": -2.192598342895508, "logps/chosen": -134.29055786132812, "logps/rejected": -166.69290161132812, "loss": 0.5864, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.801688015460968, "rewards/margins": 0.3255941867828369, "rewards/rejected": -1.1272822618484497, "step": 9820 }, { "epoch": 1.693659545141282, "grad_norm": 12.89820671081543, "learning_rate": 4.7464957984501324e-08, "logits/chosen": -2.252384662628174, "logits/rejected": -2.243018865585327, "logps/chosen": -143.53627014160156, "logps/rejected": -167.6560516357422, "loss": 0.638, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.8554835319519043, "rewards/margins": 0.24761812388896942, "rewards/rejected": -1.1031017303466797, "step": 9830 }, { "epoch": 1.6953824948311509, "grad_norm": 12.623929977416992, "learning_rate": 4.736484980210865e-08, "logits/chosen": -2.194873332977295, "logits/rejected": -2.1643779277801514, "logps/chosen": -142.33627319335938, "logps/rejected": -167.91973876953125, "loss": 0.6173, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8645842671394348, "rewards/margins": 0.2742432951927185, "rewards/rejected": -1.1388275623321533, "step": 9840 }, { "epoch": 1.69710544452102, "grad_norm": 14.167814254760742, "learning_rate": 4.726475221144791e-08, "logits/chosen": -2.2100706100463867, "logits/rejected": -2.198821783065796, "logps/chosen": -131.62059020996094, "logps/rejected": -152.97836303710938, "loss": 0.6135, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7701894044876099, "rewards/margins": 0.24295096099376678, "rewards/rejected": -1.013140320777893, "step": 9850 }, { "epoch": 1.698828394210889, "grad_norm": 15.186372756958008, "learning_rate": 4.7164665614851735e-08, "logits/chosen": -2.2820308208465576, "logits/rejected": -2.2707371711730957, "logps/chosen": -145.71206665039062, "logps/rejected": -163.60635375976562, "loss": 0.6437, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8938573002815247, "rewards/margins": 0.20336231589317322, "rewards/rejected": -1.0972195863723755, "step": 9860 }, { "epoch": 1.700551343900758, "grad_norm": 16.36129379272461, "learning_rate": 4.706459041460853e-08, "logits/chosen": -2.2451891899108887, "logits/rejected": -2.220306158065796, "logps/chosen": -134.3350372314453, "logps/rejected": -156.4230499267578, "loss": 0.6171, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.816038966178894, "rewards/margins": 0.23933705687522888, "rewards/rejected": -1.0553760528564453, "step": 9870 }, { "epoch": 1.7022742935906272, "grad_norm": 12.645424842834473, "learning_rate": 4.69645270129609e-08, "logits/chosen": -2.1606013774871826, "logits/rejected": -2.1532657146453857, "logps/chosen": -132.5287628173828, "logps/rejected": -163.0583953857422, "loss": 0.6099, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8046320676803589, "rewards/margins": 0.27143457531929016, "rewards/rejected": -1.0760666131973267, "step": 9880 }, { "epoch": 1.7039972432804962, "grad_norm": 16.250497817993164, "learning_rate": 4.686447581210404e-08, "logits/chosen": -2.128981828689575, "logits/rejected": -2.118410348892212, "logps/chosen": -134.69178771972656, "logps/rejected": -166.2328643798828, "loss": 0.5753, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7827411890029907, "rewards/margins": 0.34079596400260925, "rewards/rejected": -1.123537302017212, "step": 9890 }, { "epoch": 1.7057201929703654, "grad_norm": 15.817575454711914, "learning_rate": 4.676443721418408e-08, "logits/chosen": -2.213707208633423, "logits/rejected": -2.1815929412841797, "logps/chosen": -127.66575622558594, "logps/rejected": -166.59573364257812, "loss": 0.5539, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7595924139022827, "rewards/margins": 0.4084106385707855, "rewards/rejected": -1.1680030822753906, "step": 9900 }, { "epoch": 1.7057201929703654, "eval_logits/chosen": -2.3197176456451416, "eval_logits/rejected": -2.3105931282043457, "eval_logps/chosen": -128.02891540527344, "eval_logps/rejected": -147.511474609375, "eval_loss": 0.6446408629417419, "eval_rewards/accuracies": 0.6322026252746582, "eval_rewards/chosen": -0.6931701302528381, "eval_rewards/margins": 0.15014345943927765, "eval_rewards/rejected": -0.8433136343955994, "eval_runtime": 384.123, "eval_samples_per_second": 11.205, "eval_steps_per_second": 1.401, "step": 9900 }, { "epoch": 1.7074431426602343, "grad_norm": 11.258186340332031, "learning_rate": 4.666441162129653e-08, "logits/chosen": -2.2739298343658447, "logits/rejected": -2.231238603591919, "logps/chosen": -140.17001342773438, "logps/rejected": -161.0088653564453, "loss": 0.6048, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8260173797607422, "rewards/margins": 0.2775678038597107, "rewards/rejected": -1.1035852432250977, "step": 9910 }, { "epoch": 1.7091660923501033, "grad_norm": 17.197668075561523, "learning_rate": 4.6564399435484616e-08, "logits/chosen": -2.265193462371826, "logits/rejected": -2.237905979156494, "logps/chosen": -136.23501586914062, "logps/rejected": -161.6768035888672, "loss": 0.5978, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8354613184928894, "rewards/margins": 0.2880064845085144, "rewards/rejected": -1.1234678030014038, "step": 9920 }, { "epoch": 1.7108890420399723, "grad_norm": 17.979215621948242, "learning_rate": 4.646440105873764e-08, "logits/chosen": -2.1969082355499268, "logits/rejected": -2.1891164779663086, "logps/chosen": -132.01129150390625, "logps/rejected": -165.70767211914062, "loss": 0.5928, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8010461926460266, "rewards/margins": 0.30742883682250977, "rewards/rejected": -1.1084750890731812, "step": 9930 }, { "epoch": 1.7126119917298415, "grad_norm": 12.098986625671387, "learning_rate": 4.636441689298945e-08, "logits/chosen": -2.2747788429260254, "logits/rejected": -2.264848470687866, "logps/chosen": -139.1338348388672, "logps/rejected": -168.95730590820312, "loss": 0.6226, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8407213091850281, "rewards/margins": 0.24144446849822998, "rewards/rejected": -1.0821655988693237, "step": 9940 }, { "epoch": 1.7143349414197107, "grad_norm": 22.385595321655273, "learning_rate": 4.626444734011674e-08, "logits/chosen": -2.2395224571228027, "logits/rejected": -2.2053744792938232, "logps/chosen": -140.4509735107422, "logps/rejected": -166.3548583984375, "loss": 0.6038, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8648663759231567, "rewards/margins": 0.28538650274276733, "rewards/rejected": -1.1502528190612793, "step": 9950 }, { "epoch": 1.7160578911095796, "grad_norm": 19.255605697631836, "learning_rate": 4.6164492801937516e-08, "logits/chosen": -2.272346019744873, "logits/rejected": -2.2559800148010254, "logps/chosen": -144.24549865722656, "logps/rejected": -162.89450073242188, "loss": 0.6506, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9080840945243835, "rewards/margins": 0.1747702807188034, "rewards/rejected": -1.0828543901443481, "step": 9960 }, { "epoch": 1.7177808407994486, "grad_norm": 17.50830078125, "learning_rate": 4.606455368020934e-08, "logits/chosen": -2.22299861907959, "logits/rejected": -2.204890251159668, "logps/chosen": -134.70266723632812, "logps/rejected": -161.00222778320312, "loss": 0.6156, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8334255218505859, "rewards/margins": 0.25889724493026733, "rewards/rejected": -1.0923227071762085, "step": 9970 }, { "epoch": 1.7195037904893176, "grad_norm": 16.529211044311523, "learning_rate": 4.59646303766279e-08, "logits/chosen": -2.167935609817505, "logits/rejected": -2.15175199508667, "logps/chosen": -136.57620239257812, "logps/rejected": -168.3311767578125, "loss": 0.5962, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8378031849861145, "rewards/margins": 0.3137553036212921, "rewards/rejected": -1.15155827999115, "step": 9980 }, { "epoch": 1.7212267401791868, "grad_norm": 18.117942810058594, "learning_rate": 4.586472329282529e-08, "logits/chosen": -2.2473387718200684, "logits/rejected": -2.207033157348633, "logps/chosen": -140.76731872558594, "logps/rejected": -157.20399475097656, "loss": 0.639, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8703460693359375, "rewards/margins": 0.202531099319458, "rewards/rejected": -1.0728771686553955, "step": 9990 }, { "epoch": 1.722949689869056, "grad_norm": 13.983428955078125, "learning_rate": 4.576483283036835e-08, "logits/chosen": -2.2460057735443115, "logits/rejected": -2.226346492767334, "logps/chosen": -139.29913330078125, "logps/rejected": -169.69358825683594, "loss": 0.5881, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8355058431625366, "rewards/margins": 0.29175347089767456, "rewards/rejected": -1.127259373664856, "step": 10000 }, { "epoch": 1.722949689869056, "eval_logits/chosen": -2.3095526695251465, "eval_logits/rejected": -2.300386905670166, "eval_logps/chosen": -128.69415283203125, "eval_logps/rejected": -148.32015991210938, "eval_loss": 0.6445627808570862, "eval_rewards/accuracies": 0.6356877088546753, "eval_rewards/chosen": -0.6998225450515747, "eval_rewards/margins": 0.15157800912857056, "eval_rewards/rejected": -0.8514004945755005, "eval_runtime": 383.6242, "eval_samples_per_second": 11.219, "eval_steps_per_second": 1.402, "step": 10000 }, { "epoch": 1.724672639558925, "grad_norm": 15.158984184265137, "learning_rate": 4.566495939075722e-08, "logits/chosen": -2.236889362335205, "logits/rejected": -2.2126617431640625, "logps/chosen": -133.5480499267578, "logps/rejected": -168.2829132080078, "loss": 0.5754, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8116868734359741, "rewards/margins": 0.3433683514595032, "rewards/rejected": -1.1550551652908325, "step": 10010 }, { "epoch": 1.7263955892487939, "grad_norm": 18.537172317504883, "learning_rate": 4.5565103375423466e-08, "logits/chosen": -2.172159433364868, "logits/rejected": -2.1415655612945557, "logps/chosen": -142.4164581298828, "logps/rejected": -162.5454864501953, "loss": 0.623, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8874385952949524, "rewards/margins": 0.22822114825248718, "rewards/rejected": -1.1156597137451172, "step": 10020 }, { "epoch": 1.7281185389386629, "grad_norm": 18.817468643188477, "learning_rate": 4.546526518572878e-08, "logits/chosen": -2.2000653743743896, "logits/rejected": -2.170567274093628, "logps/chosen": -141.38662719726562, "logps/rejected": -158.1228485107422, "loss": 0.6249, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8399541974067688, "rewards/margins": 0.2199651300907135, "rewards/rejected": -1.0599192380905151, "step": 10030 }, { "epoch": 1.729841488628532, "grad_norm": 11.971080780029297, "learning_rate": 4.5365445222963096e-08, "logits/chosen": -2.338090419769287, "logits/rejected": -2.3187737464904785, "logps/chosen": -141.56509399414062, "logps/rejected": -167.83465576171875, "loss": 0.6145, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8672409057617188, "rewards/margins": 0.2667465806007385, "rewards/rejected": -1.1339874267578125, "step": 10040 }, { "epoch": 1.7315644383184012, "grad_norm": 18.907115936279297, "learning_rate": 4.5265643888343146e-08, "logits/chosen": -2.187636137008667, "logits/rejected": -2.1840672492980957, "logps/chosen": -144.9759979248047, "logps/rejected": -164.49069213867188, "loss": 0.6462, "rewards/accuracies": 0.625, "rewards/chosen": -0.9100421667098999, "rewards/margins": 0.17250248789787292, "rewards/rejected": -1.0825446844100952, "step": 10050 }, { "epoch": 1.7332873880082702, "grad_norm": 18.22229766845703, "learning_rate": 4.516586158301074e-08, "logits/chosen": -2.1519203186035156, "logits/rejected": -2.142122507095337, "logps/chosen": -131.17185974121094, "logps/rejected": -163.8739013671875, "loss": 0.6057, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8243473172187805, "rewards/margins": 0.29052823781967163, "rewards/rejected": -1.1148755550384521, "step": 10060 }, { "epoch": 1.7350103376981392, "grad_norm": 15.238069534301758, "learning_rate": 4.506609870803122e-08, "logits/chosen": -2.1237378120422363, "logits/rejected": -2.1098122596740723, "logps/chosen": -135.55996704101562, "logps/rejected": -166.37002563476562, "loss": 0.604, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8310503959655762, "rewards/margins": 0.3078513443470001, "rewards/rejected": -1.1389015913009644, "step": 10070 }, { "epoch": 1.7367332873880081, "grad_norm": 19.08892250061035, "learning_rate": 4.4966355664391856e-08, "logits/chosen": -2.2336137294769287, "logits/rejected": -2.216768741607666, "logps/chosen": -144.35028076171875, "logps/rejected": -166.70860290527344, "loss": 0.6259, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8781234622001648, "rewards/margins": 0.23810401558876038, "rewards/rejected": -1.116227388381958, "step": 10080 }, { "epoch": 1.7384562370778773, "grad_norm": 12.441683769226074, "learning_rate": 4.486663285300019e-08, "logits/chosen": -2.2598929405212402, "logits/rejected": -2.2371418476104736, "logps/chosen": -136.45631408691406, "logps/rejected": -169.87832641601562, "loss": 0.5969, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8208305239677429, "rewards/margins": 0.3007936179637909, "rewards/rejected": -1.121624231338501, "step": 10090 }, { "epoch": 1.7401791867677465, "grad_norm": 17.461366653442383, "learning_rate": 4.4766930674682446e-08, "logits/chosen": -2.220125675201416, "logits/rejected": -2.2101218700408936, "logps/chosen": -136.82431030273438, "logps/rejected": -161.98684692382812, "loss": 0.6197, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8183838725090027, "rewards/margins": 0.25642913579940796, "rewards/rejected": -1.0748131275177002, "step": 10100 }, { "epoch": 1.7401791867677465, "eval_logits/chosen": -2.3032798767089844, "eval_logits/rejected": -2.293973922729492, "eval_logps/chosen": -127.3521728515625, "eval_logps/rejected": -146.7976837158203, "eval_loss": 0.6449527144432068, "eval_rewards/accuracies": 0.6342936754226685, "eval_rewards/chosen": -0.6864026784896851, "eval_rewards/margins": 0.14977297186851501, "eval_rewards/rejected": -0.8361757397651672, "eval_runtime": 384.8272, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 10100 }, { "epoch": 1.7419021364576155, "grad_norm": 18.607980728149414, "learning_rate": 4.4667249530181866e-08, "logits/chosen": -2.261636257171631, "logits/rejected": -2.262549638748169, "logps/chosen": -137.97755432128906, "logps/rejected": -170.9286651611328, "loss": 0.607, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8740439414978027, "rewards/margins": 0.2887888550758362, "rewards/rejected": -1.1628327369689941, "step": 10110 }, { "epoch": 1.7436250861474845, "grad_norm": 15.263072967529297, "learning_rate": 4.456758982015724e-08, "logits/chosen": -2.2228784561157227, "logits/rejected": -2.1980624198913574, "logps/chosen": -136.9678955078125, "logps/rejected": -164.87289428710938, "loss": 0.5931, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8324424028396606, "rewards/margins": 0.3102174699306488, "rewards/rejected": -1.1426599025726318, "step": 10120 }, { "epoch": 1.7453480358373534, "grad_norm": 15.646352767944336, "learning_rate": 4.446795194518113e-08, "logits/chosen": -2.2426841259002686, "logits/rejected": -2.219566583633423, "logps/chosen": -131.91358947753906, "logps/rejected": -162.7647247314453, "loss": 0.5963, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7856724858283997, "rewards/margins": 0.3054756820201874, "rewards/rejected": -1.0911481380462646, "step": 10130 }, { "epoch": 1.7470709855272226, "grad_norm": 16.928213119506836, "learning_rate": 4.436833630573837e-08, "logits/chosen": -2.2196404933929443, "logits/rejected": -2.178121328353882, "logps/chosen": -145.84657287597656, "logps/rejected": -166.92855834960938, "loss": 0.6104, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8542075157165527, "rewards/margins": 0.2754090428352356, "rewards/rejected": -1.1296164989471436, "step": 10140 }, { "epoch": 1.7487939352170918, "grad_norm": 14.765442848205566, "learning_rate": 4.4268743302224405e-08, "logits/chosen": -2.1662516593933105, "logits/rejected": -2.143148422241211, "logps/chosen": -136.57730102539062, "logps/rejected": -166.01812744140625, "loss": 0.6018, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8120381236076355, "rewards/margins": 0.2941080927848816, "rewards/rejected": -1.106146216392517, "step": 10150 }, { "epoch": 1.7505168849069608, "grad_norm": 12.129871368408203, "learning_rate": 4.416917333494369e-08, "logits/chosen": -2.191230535507202, "logits/rejected": -2.170567035675049, "logps/chosen": -134.02894592285156, "logps/rejected": -165.78924560546875, "loss": 0.5875, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8195708990097046, "rewards/margins": 0.3162716031074524, "rewards/rejected": -1.1358425617218018, "step": 10160 }, { "epoch": 1.7522398345968297, "grad_norm": 12.817859649658203, "learning_rate": 4.406962680410812e-08, "logits/chosen": -2.187701463699341, "logits/rejected": -2.173661470413208, "logps/chosen": -137.00331115722656, "logps/rejected": -165.77438354492188, "loss": 0.5979, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8209341764450073, "rewards/margins": 0.29276201128959656, "rewards/rejected": -1.1136962175369263, "step": 10170 }, { "epoch": 1.7539627842866987, "grad_norm": 16.7096004486084, "learning_rate": 4.3970104109835374e-08, "logits/chosen": -2.122936964035034, "logits/rejected": -2.0917460918426514, "logps/chosen": -137.17611694335938, "logps/rejected": -170.7346649169922, "loss": 0.5832, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8473415374755859, "rewards/margins": 0.32690685987472534, "rewards/rejected": -1.174248456954956, "step": 10180 }, { "epoch": 1.755685733976568, "grad_norm": 18.885801315307617, "learning_rate": 4.387060565214732e-08, "logits/chosen": -2.152899980545044, "logits/rejected": -2.125624418258667, "logps/chosen": -130.33421325683594, "logps/rejected": -165.26214599609375, "loss": 0.5792, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7937606573104858, "rewards/margins": 0.3447897434234619, "rewards/rejected": -1.1385502815246582, "step": 10190 }, { "epoch": 1.757408683666437, "grad_norm": 14.253737449645996, "learning_rate": 4.3771131830968386e-08, "logits/chosen": -2.221503734588623, "logits/rejected": -2.199347496032715, "logps/chosen": -138.52891540527344, "logps/rejected": -166.64492797851562, "loss": 0.6029, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8610817790031433, "rewards/margins": 0.29610908031463623, "rewards/rejected": -1.1571909189224243, "step": 10200 }, { "epoch": 1.757408683666437, "eval_logits/chosen": -2.281630039215088, "eval_logits/rejected": -2.272066593170166, "eval_logps/chosen": -132.54669189453125, "eval_logps/rejected": -152.94908142089844, "eval_loss": 0.6433464288711548, "eval_rewards/accuracies": 0.633596658706665, "eval_rewards/chosen": -0.7383477687835693, "eval_rewards/margins": 0.15934188663959503, "eval_rewards/rejected": -0.8976895213127136, "eval_runtime": 385.4026, "eval_samples_per_second": 11.168, "eval_steps_per_second": 1.396, "step": 10200 }, { "epoch": 1.759131633356306, "grad_norm": 17.670040130615234, "learning_rate": 4.367168304612399e-08, "logits/chosen": -2.205653667449951, "logits/rejected": -2.1983067989349365, "logps/chosen": -149.83009338378906, "logps/rejected": -187.73196411132812, "loss": 0.5897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9581714868545532, "rewards/margins": 0.3478892743587494, "rewards/rejected": -1.306060791015625, "step": 10210 }, { "epoch": 1.760854583046175, "grad_norm": 17.196762084960938, "learning_rate": 4.3572259697338966e-08, "logits/chosen": -2.147998571395874, "logits/rejected": -2.1375207901000977, "logps/chosen": -135.0093994140625, "logps/rejected": -163.97073364257812, "loss": 0.6108, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8662388920783997, "rewards/margins": 0.26223769783973694, "rewards/rejected": -1.1284767389297485, "step": 10220 }, { "epoch": 1.762577532736044, "grad_norm": 12.86535358428955, "learning_rate": 4.347286218423585e-08, "logits/chosen": -2.1023175716400146, "logits/rejected": -2.0749263763427734, "logps/chosen": -141.42813110351562, "logps/rejected": -162.30569458007812, "loss": 0.6365, "rewards/accuracies": 0.625, "rewards/chosen": -0.8798721432685852, "rewards/margins": 0.2245951145887375, "rewards/rejected": -1.1044671535491943, "step": 10230 }, { "epoch": 1.7643004824259132, "grad_norm": 14.261350631713867, "learning_rate": 4.337349090633335e-08, "logits/chosen": -2.149608612060547, "logits/rejected": -2.123689889907837, "logps/chosen": -141.9866943359375, "logps/rejected": -181.30966186523438, "loss": 0.5783, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9052966833114624, "rewards/margins": 0.3841412663459778, "rewards/rejected": -1.2894378900527954, "step": 10240 }, { "epoch": 1.7660234321157822, "grad_norm": 18.512611389160156, "learning_rate": 4.327414626304473e-08, "logits/chosen": -2.192061185836792, "logits/rejected": -2.1531550884246826, "logps/chosen": -148.09976196289062, "logps/rejected": -169.9834442138672, "loss": 0.6011, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9132421612739563, "rewards/margins": 0.2897980511188507, "rewards/rejected": -1.2030402421951294, "step": 10250 }, { "epoch": 1.7677463818056514, "grad_norm": 15.683692932128906, "learning_rate": 4.317482865367619e-08, "logits/chosen": -2.1619386672973633, "logits/rejected": -2.161308765411377, "logps/chosen": -137.31143188476562, "logps/rejected": -170.02175903320312, "loss": 0.5929, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8404668569564819, "rewards/margins": 0.291026771068573, "rewards/rejected": -1.1314938068389893, "step": 10260 }, { "epoch": 1.7694693314955203, "grad_norm": 14.9175386428833, "learning_rate": 4.3075538477425296e-08, "logits/chosen": -2.1532955169677734, "logits/rejected": -2.1360678672790527, "logps/chosen": -148.23585510253906, "logps/rejected": -180.21438598632812, "loss": 0.5963, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9280508160591125, "rewards/margins": 0.3176276683807373, "rewards/rejected": -1.2456785440444946, "step": 10270 }, { "epoch": 1.7711922811853893, "grad_norm": 17.122722625732422, "learning_rate": 4.2976276133379336e-08, "logits/chosen": -2.134552240371704, "logits/rejected": -2.118440628051758, "logps/chosen": -151.31802368164062, "logps/rejected": -165.75881958007812, "loss": 0.6581, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9698888659477234, "rewards/margins": 0.15732906758785248, "rewards/rejected": -1.1272178888320923, "step": 10280 }, { "epoch": 1.7729152308752585, "grad_norm": 14.721953392028809, "learning_rate": 4.2877042020513696e-08, "logits/chosen": -2.115333080291748, "logits/rejected": -2.1033120155334473, "logps/chosen": -133.12435913085938, "logps/rejected": -174.62586975097656, "loss": 0.5754, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8168805837631226, "rewards/margins": 0.37453973293304443, "rewards/rejected": -1.191420316696167, "step": 10290 }, { "epoch": 1.7746381805651275, "grad_norm": 15.869515419006348, "learning_rate": 4.2777836537690336e-08, "logits/chosen": -2.172628879547119, "logits/rejected": -2.156878709793091, "logps/chosen": -146.99070739746094, "logps/rejected": -166.3126983642578, "loss": 0.6441, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9234288930892944, "rewards/margins": 0.1948300302028656, "rewards/rejected": -1.118259072303772, "step": 10300 }, { "epoch": 1.7746381805651275, "eval_logits/chosen": -2.2760071754455566, "eval_logits/rejected": -2.266352891921997, "eval_logps/chosen": -132.75343322753906, "eval_logps/rejected": -153.1610107421875, "eval_loss": 0.6434760093688965, "eval_rewards/accuracies": 0.6324349641799927, "eval_rewards/chosen": -0.7404152154922485, "eval_rewards/margins": 0.15939350426197052, "eval_rewards/rejected": -0.8998088240623474, "eval_runtime": 384.9966, "eval_samples_per_second": 11.179, "eval_steps_per_second": 1.397, "step": 10300 }, { "epoch": 1.7763611302549966, "grad_norm": 16.43121337890625, "learning_rate": 4.26786600836561e-08, "logits/chosen": -2.1087231636047363, "logits/rejected": -2.086599349975586, "logps/chosen": -150.3048553466797, "logps/rejected": -173.5523681640625, "loss": 0.6075, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.940421462059021, "rewards/margins": 0.28056690096855164, "rewards/rejected": -1.2209885120391846, "step": 10310 }, { "epoch": 1.7780840799448656, "grad_norm": 15.596647262573242, "learning_rate": 4.2579513057041225e-08, "logits/chosen": -2.164898633956909, "logits/rejected": -2.135221481323242, "logps/chosen": -152.44503784179688, "logps/rejected": -171.81338500976562, "loss": 0.6454, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9568337202072144, "rewards/margins": 0.2260902374982834, "rewards/rejected": -1.1829239130020142, "step": 10320 }, { "epoch": 1.7798070296347346, "grad_norm": 16.55746841430664, "learning_rate": 4.248039585635756e-08, "logits/chosen": -2.1852197647094727, "logits/rejected": -2.165299892425537, "logps/chosen": -143.6912384033203, "logps/rejected": -171.85385131835938, "loss": 0.6066, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9154688119888306, "rewards/margins": 0.25790801644325256, "rewards/rejected": -1.1733767986297607, "step": 10330 }, { "epoch": 1.7815299793246038, "grad_norm": 17.1219482421875, "learning_rate": 4.238130887999716e-08, "logits/chosen": -2.2100110054016113, "logits/rejected": -2.1829848289489746, "logps/chosen": -140.52951049804688, "logps/rejected": -173.30752563476562, "loss": 0.5919, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8444200754165649, "rewards/margins": 0.3429722785949707, "rewards/rejected": -1.1873924732208252, "step": 10340 }, { "epoch": 1.7832529290144727, "grad_norm": 14.312373161315918, "learning_rate": 4.228225252623055e-08, "logits/chosen": -2.285461902618408, "logits/rejected": -2.2589213848114014, "logps/chosen": -138.81597900390625, "logps/rejected": -164.53089904785156, "loss": 0.6045, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8473402857780457, "rewards/margins": 0.2874210476875305, "rewards/rejected": -1.1347613334655762, "step": 10350 }, { "epoch": 1.784975878704342, "grad_norm": 16.245309829711914, "learning_rate": 4.218322719320519e-08, "logits/chosen": -2.1347339153289795, "logits/rejected": -2.103478193283081, "logps/chosen": -145.54827880859375, "logps/rejected": -160.33938598632812, "loss": 0.6471, "rewards/accuracies": 0.625, "rewards/chosen": -0.8960143327713013, "rewards/margins": 0.17932698130607605, "rewards/rejected": -1.0753414630889893, "step": 10360 }, { "epoch": 1.786698828394211, "grad_norm": 15.313996315002441, "learning_rate": 4.208423327894387e-08, "logits/chosen": -2.0225048065185547, "logits/rejected": -1.991647481918335, "logps/chosen": -135.8849639892578, "logps/rejected": -165.5072479248047, "loss": 0.5964, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8390754461288452, "rewards/margins": 0.2871951162815094, "rewards/rejected": -1.1262706518173218, "step": 10370 }, { "epoch": 1.7884217780840799, "grad_norm": 22.271390914916992, "learning_rate": 4.1985271181343056e-08, "logits/chosen": -2.163297176361084, "logits/rejected": -2.1498043537139893, "logps/chosen": -145.9105987548828, "logps/rejected": -161.98081970214844, "loss": 0.6676, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.9403168559074402, "rewards/margins": 0.15644405782222748, "rewards/rejected": -1.0967609882354736, "step": 10380 }, { "epoch": 1.7901447277739488, "grad_norm": 15.086609840393066, "learning_rate": 4.188634129817135e-08, "logits/chosen": -2.2072253227233887, "logits/rejected": -2.1797983646392822, "logps/chosen": -139.36709594726562, "logps/rejected": -164.75955200195312, "loss": 0.6151, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8355790376663208, "rewards/margins": 0.27110618352890015, "rewards/rejected": -1.1066851615905762, "step": 10390 }, { "epoch": 1.791867677463818, "grad_norm": 15.889571189880371, "learning_rate": 4.178744402706788e-08, "logits/chosen": -2.1633167266845703, "logits/rejected": -2.1447815895080566, "logps/chosen": -138.95343017578125, "logps/rejected": -178.09666442871094, "loss": 0.5718, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8681317567825317, "rewards/margins": 0.35973799228668213, "rewards/rejected": -1.2278697490692139, "step": 10400 }, { "epoch": 1.791867677463818, "eval_logits/chosen": -2.280719757080078, "eval_logits/rejected": -2.2711973190307617, "eval_logps/chosen": -129.17770385742188, "eval_logps/rejected": -149.0603485107422, "eval_loss": 0.6444300413131714, "eval_rewards/accuracies": 0.634061336517334, "eval_rewards/chosen": -0.704658031463623, "eval_rewards/margins": 0.1541443020105362, "eval_rewards/rejected": -0.8588022589683533, "eval_runtime": 384.8903, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 10400 }, { "epoch": 1.7935906271536872, "grad_norm": 16.58106803894043, "learning_rate": 4.168857976554067e-08, "logits/chosen": -2.1462440490722656, "logits/rejected": -2.113055467605591, "logps/chosen": -147.19729614257812, "logps/rejected": -166.97935485839844, "loss": 0.6275, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9067977666854858, "rewards/margins": 0.23223397135734558, "rewards/rejected": -1.1390316486358643, "step": 10410 }, { "epoch": 1.7953135768435562, "grad_norm": 14.817155838012695, "learning_rate": 4.1589748910965104e-08, "logits/chosen": -2.1683151721954346, "logits/rejected": -2.1393213272094727, "logps/chosen": -139.8350067138672, "logps/rejected": -167.8277130126953, "loss": 0.611, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8571444749832153, "rewards/margins": 0.2798784077167511, "rewards/rejected": -1.137022852897644, "step": 10420 }, { "epoch": 1.7970365265334252, "grad_norm": 14.53480052947998, "learning_rate": 4.1490951860582243e-08, "logits/chosen": -2.239656925201416, "logits/rejected": -2.2195746898651123, "logps/chosen": -137.35696411132812, "logps/rejected": -158.53024291992188, "loss": 0.6286, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8248639106750488, "rewards/margins": 0.22988703846931458, "rewards/rejected": -1.054750919342041, "step": 10430 }, { "epoch": 1.7987594762232941, "grad_norm": 16.161693572998047, "learning_rate": 4.139218901149731e-08, "logits/chosen": -2.2611806392669678, "logits/rejected": -2.2504689693450928, "logps/chosen": -147.9939422607422, "logps/rejected": -165.90013122558594, "loss": 0.6411, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9012395143508911, "rewards/margins": 0.18483200669288635, "rewards/rejected": -1.0860713720321655, "step": 10440 }, { "epoch": 1.8004824259131633, "grad_norm": 14.8311128616333, "learning_rate": 4.129346076067802e-08, "logits/chosen": -2.198347806930542, "logits/rejected": -2.188472270965576, "logps/chosen": -139.87362670898438, "logps/rejected": -175.63955688476562, "loss": 0.5848, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8625160455703735, "rewards/margins": 0.31749850511550903, "rewards/rejected": -1.1800144910812378, "step": 10450 }, { "epoch": 1.8022053756030325, "grad_norm": 13.839213371276855, "learning_rate": 4.119476750495312e-08, "logits/chosen": -2.1854052543640137, "logits/rejected": -2.1569669246673584, "logps/chosen": -139.53134155273438, "logps/rejected": -166.59963989257812, "loss": 0.5973, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8495635986328125, "rewards/margins": 0.2861550450325012, "rewards/rejected": -1.135718584060669, "step": 10460 }, { "epoch": 1.8039283252929015, "grad_norm": 23.473339080810547, "learning_rate": 4.109610964101054e-08, "logits/chosen": -2.05077862739563, "logits/rejected": -2.02666974067688, "logps/chosen": -135.4434814453125, "logps/rejected": -161.55758666992188, "loss": 0.6168, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8353704214096069, "rewards/margins": 0.2724042534828186, "rewards/rejected": -1.1077747344970703, "step": 10470 }, { "epoch": 1.8056512749827704, "grad_norm": 13.003592491149902, "learning_rate": 4.099748756539609e-08, "logits/chosen": -2.178328275680542, "logits/rejected": -2.1420979499816895, "logps/chosen": -141.23043823242188, "logps/rejected": -172.04591369628906, "loss": 0.5866, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8525956273078918, "rewards/margins": 0.34384042024612427, "rewards/rejected": -1.1964360475540161, "step": 10480 }, { "epoch": 1.8073742246726394, "grad_norm": 13.315703392028809, "learning_rate": 4.089890167451169e-08, "logits/chosen": -2.173445463180542, "logits/rejected": -2.145054340362549, "logps/chosen": -133.5574188232422, "logps/rejected": -160.74505615234375, "loss": 0.6027, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7974697351455688, "rewards/margins": 0.284454882144928, "rewards/rejected": -1.0819246768951416, "step": 10490 }, { "epoch": 1.8090971743625086, "grad_norm": 15.488198280334473, "learning_rate": 4.08003523646138e-08, "logits/chosen": -2.1538448333740234, "logits/rejected": -2.1327991485595703, "logps/chosen": -144.0309600830078, "logps/rejected": -181.07220458984375, "loss": 0.5866, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9133148193359375, "rewards/margins": 0.34629201889038086, "rewards/rejected": -1.2596067190170288, "step": 10500 }, { "epoch": 1.8090971743625086, "eval_logits/chosen": -2.2695493698120117, "eval_logits/rejected": -2.2597997188568115, "eval_logps/chosen": -131.37030029296875, "eval_logps/rejected": -151.716064453125, "eval_loss": 0.6436738967895508, "eval_rewards/accuracies": 0.6342936754226685, "eval_rewards/chosen": -0.7265839576721191, "eval_rewards/margins": 0.1587754786014557, "eval_rewards/rejected": -0.8853594064712524, "eval_runtime": 385.1182, "eval_samples_per_second": 11.176, "eval_steps_per_second": 1.397, "step": 10500 }, { "epoch": 1.8108201240523778, "grad_norm": 14.934866905212402, "learning_rate": 4.070184003181189e-08, "logits/chosen": -2.160978317260742, "logits/rejected": -2.1332619190216064, "logps/chosen": -141.9182891845703, "logps/rejected": -170.94322204589844, "loss": 0.5952, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8633019328117371, "rewards/margins": 0.31438082456588745, "rewards/rejected": -1.177682638168335, "step": 10510 }, { "epoch": 1.8125430737422468, "grad_norm": 18.554651260375977, "learning_rate": 4.060336507206673e-08, "logits/chosen": -2.213937520980835, "logits/rejected": -2.208878993988037, "logps/chosen": -139.72268676757812, "logps/rejected": -179.26918029785156, "loss": 0.5859, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8923715353012085, "rewards/margins": 0.3549218773841858, "rewards/rejected": -1.24729323387146, "step": 10520 }, { "epoch": 1.8142660234321157, "grad_norm": 14.874615669250488, "learning_rate": 4.0504927881188946e-08, "logits/chosen": -2.1197073459625244, "logits/rejected": -2.098106861114502, "logps/chosen": -146.2544403076172, "logps/rejected": -168.626220703125, "loss": 0.6267, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9291355013847351, "rewards/margins": 0.23587246239185333, "rewards/rejected": -1.1650078296661377, "step": 10530 }, { "epoch": 1.8159889731219847, "grad_norm": 12.893082618713379, "learning_rate": 4.040652885483733e-08, "logits/chosen": -2.083888530731201, "logits/rejected": -2.0583150386810303, "logps/chosen": -141.14779663085938, "logps/rejected": -166.489990234375, "loss": 0.6082, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8881710767745972, "rewards/margins": 0.2827834486961365, "rewards/rejected": -1.1709544658660889, "step": 10540 }, { "epoch": 1.817711922811854, "grad_norm": 18.706315994262695, "learning_rate": 4.0308168388517284e-08, "logits/chosen": -2.2584919929504395, "logits/rejected": -2.247316360473633, "logps/chosen": -145.86941528320312, "logps/rejected": -179.62362670898438, "loss": 0.598, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9211845397949219, "rewards/margins": 0.31723958253860474, "rewards/rejected": -1.2384244203567505, "step": 10550 }, { "epoch": 1.819434872501723, "grad_norm": 18.000244140625, "learning_rate": 4.020984687757918e-08, "logits/chosen": -2.121932029724121, "logits/rejected": -2.0984504222869873, "logps/chosen": -143.6024932861328, "logps/rejected": -173.8885955810547, "loss": 0.6074, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8797028660774231, "rewards/margins": 0.3129701614379883, "rewards/rejected": -1.1926729679107666, "step": 10560 }, { "epoch": 1.821157822191592, "grad_norm": 18.816566467285156, "learning_rate": 4.0111564717216845e-08, "logits/chosen": -2.195450782775879, "logits/rejected": -2.1770336627960205, "logps/chosen": -146.30859375, "logps/rejected": -178.07371520996094, "loss": 0.5959, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9036176800727844, "rewards/margins": 0.30947795510292053, "rewards/rejected": -1.2130956649780273, "step": 10570 }, { "epoch": 1.822880771881461, "grad_norm": 13.887199401855469, "learning_rate": 4.001332230246597e-08, "logits/chosen": -2.202655076980591, "logits/rejected": -2.1818783283233643, "logps/chosen": -140.010009765625, "logps/rejected": -167.59677124023438, "loss": 0.599, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8664800524711609, "rewards/margins": 0.28134989738464355, "rewards/rejected": -1.1478300094604492, "step": 10580 }, { "epoch": 1.82460372157133, "grad_norm": 12.5405912399292, "learning_rate": 3.9915120028202434e-08, "logits/chosen": -2.148926019668579, "logits/rejected": -2.1124074459075928, "logps/chosen": -147.2456817626953, "logps/rejected": -166.82044982910156, "loss": 0.6167, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9134114980697632, "rewards/margins": 0.2554444670677185, "rewards/rejected": -1.168855905532837, "step": 10590 }, { "epoch": 1.8263266712611992, "grad_norm": 16.951129913330078, "learning_rate": 3.9816958289140836e-08, "logits/chosen": -2.223396062850952, "logits/rejected": -2.2109744548797607, "logps/chosen": -138.66775512695312, "logps/rejected": -165.499267578125, "loss": 0.6278, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8578516840934753, "rewards/margins": 0.25026166439056396, "rewards/rejected": -1.108113408088684, "step": 10600 }, { "epoch": 1.8263266712611992, "eval_logits/chosen": -2.2650763988494873, "eval_logits/rejected": -2.2552788257598877, "eval_logps/chosen": -130.57827758789062, "eval_logps/rejected": -150.80702209472656, "eval_loss": 0.6437148451805115, "eval_rewards/accuracies": 0.6347583532333374, "eval_rewards/chosen": -0.7186638712882996, "eval_rewards/margins": 0.15760517120361328, "eval_rewards/rejected": -0.8762690424919128, "eval_runtime": 385.0195, "eval_samples_per_second": 11.179, "eval_steps_per_second": 1.397, "step": 10600 }, { "epoch": 1.8280496209510684, "grad_norm": 15.38785457611084, "learning_rate": 3.971883747983278e-08, "logits/chosen": -2.134243965148926, "logits/rejected": -2.1216976642608643, "logps/chosen": -140.45745849609375, "logps/rejected": -167.74581909179688, "loss": 0.6117, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.893933892250061, "rewards/margins": 0.2648622393608093, "rewards/rejected": -1.1587960720062256, "step": 10610 }, { "epoch": 1.8297725706409373, "grad_norm": 20.82608413696289, "learning_rate": 3.9620757994665383e-08, "logits/chosen": -2.0833215713500977, "logits/rejected": -2.0583508014678955, "logps/chosen": -144.21826171875, "logps/rejected": -172.72299194335938, "loss": 0.6007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.904144287109375, "rewards/margins": 0.29979896545410156, "rewards/rejected": -1.203943133354187, "step": 10620 }, { "epoch": 1.8314955203308063, "grad_norm": 11.789497375488281, "learning_rate": 3.952272022785971e-08, "logits/chosen": -2.169780969619751, "logits/rejected": -2.144395351409912, "logps/chosen": -139.1234588623047, "logps/rejected": -171.3330535888672, "loss": 0.6008, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8486492037773132, "rewards/margins": 0.32299065589904785, "rewards/rejected": -1.1716396808624268, "step": 10630 }, { "epoch": 1.8332184700206753, "grad_norm": 14.893974304199219, "learning_rate": 3.9424724573469094e-08, "logits/chosen": -2.1571614742279053, "logits/rejected": -2.1320600509643555, "logps/chosen": -138.50320434570312, "logps/rejected": -169.2348175048828, "loss": 0.5889, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8343726396560669, "rewards/margins": 0.332188218832016, "rewards/rejected": -1.1665608882904053, "step": 10640 }, { "epoch": 1.8349414197105445, "grad_norm": 15.725971221923828, "learning_rate": 3.9326771425377586e-08, "logits/chosen": -2.219003677368164, "logits/rejected": -2.1948435306549072, "logps/chosen": -141.66043090820312, "logps/rejected": -179.11489868164062, "loss": 0.5833, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8803890943527222, "rewards/margins": 0.3725835680961609, "rewards/rejected": -1.2529727220535278, "step": 10650 }, { "epoch": 1.8366643694004137, "grad_norm": 15.388813018798828, "learning_rate": 3.9228861177298434e-08, "logits/chosen": -2.117363929748535, "logits/rejected": -2.101840019226074, "logps/chosen": -149.44125366210938, "logps/rejected": -170.25894165039062, "loss": 0.6372, "rewards/accuracies": 0.65625, "rewards/chosen": -0.947431206703186, "rewards/margins": 0.20335738360881805, "rewards/rejected": -1.1507885456085205, "step": 10660 }, { "epoch": 1.8383873190902826, "grad_norm": 13.729190826416016, "learning_rate": 3.913099422277242e-08, "logits/chosen": -2.131894111633301, "logits/rejected": -2.1088287830352783, "logps/chosen": -145.60157775878906, "logps/rejected": -175.6693878173828, "loss": 0.6023, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9346339106559753, "rewards/margins": 0.3064206540584564, "rewards/rejected": -1.2410545349121094, "step": 10670 }, { "epoch": 1.8401102687801516, "grad_norm": 14.68144702911377, "learning_rate": 3.903317095516634e-08, "logits/chosen": -2.1833302974700928, "logits/rejected": -2.143803119659424, "logps/chosen": -142.4230194091797, "logps/rejected": -163.93824768066406, "loss": 0.6084, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8613392114639282, "rewards/margins": 0.27023443579673767, "rewards/rejected": -1.1315736770629883, "step": 10680 }, { "epoch": 1.8418332184700206, "grad_norm": 14.89194107055664, "learning_rate": 3.893539176767138e-08, "logits/chosen": -2.1484665870666504, "logits/rejected": -2.135282039642334, "logps/chosen": -142.46084594726562, "logps/rejected": -178.20193481445312, "loss": 0.5903, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8871084451675415, "rewards/margins": 0.3311447501182556, "rewards/rejected": -1.2182531356811523, "step": 10690 }, { "epoch": 1.8435561681598898, "grad_norm": 20.9420223236084, "learning_rate": 3.8837657053301533e-08, "logits/chosen": -2.2023608684539795, "logits/rejected": -2.163548707962036, "logps/chosen": -143.1197967529297, "logps/rejected": -163.82421875, "loss": 0.6083, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8695224523544312, "rewards/margins": 0.26532793045043945, "rewards/rejected": -1.134850263595581, "step": 10700 }, { "epoch": 1.8435561681598898, "eval_logits/chosen": -2.253403902053833, "eval_logits/rejected": -2.243455410003662, "eval_logps/chosen": -132.6900177001953, "eval_logps/rejected": -153.36471557617188, "eval_loss": 0.6428442597389221, "eval_rewards/accuracies": 0.6305761933326721, "eval_rewards/chosen": -0.7397811412811279, "eval_rewards/margins": 0.16206470131874084, "eval_rewards/rejected": -0.9018458127975464, "eval_runtime": 385.4075, "eval_samples_per_second": 11.167, "eval_steps_per_second": 1.396, "step": 10700 }, { "epoch": 1.8452791178497587, "grad_norm": 15.535030364990234, "learning_rate": 3.873996720489205e-08, "logits/chosen": -2.117147922515869, "logits/rejected": -2.0882503986358643, "logps/chosen": -139.2937774658203, "logps/rejected": -161.57162475585938, "loss": 0.6118, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8584913015365601, "rewards/margins": 0.2628484070301056, "rewards/rejected": -1.1213396787643433, "step": 10710 }, { "epoch": 1.847002067539628, "grad_norm": 26.524709701538086, "learning_rate": 3.864232261509787e-08, "logits/chosen": -2.140989303588867, "logits/rejected": -2.1107444763183594, "logps/chosen": -146.13963317871094, "logps/rejected": -169.7803955078125, "loss": 0.6354, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9355264902114868, "rewards/margins": 0.23946543037891388, "rewards/rejected": -1.1749918460845947, "step": 10720 }, { "epoch": 1.848725017229497, "grad_norm": 14.984110832214355, "learning_rate": 3.8544723676392e-08, "logits/chosen": -2.2335128784179688, "logits/rejected": -2.199219226837158, "logps/chosen": -140.30679321289062, "logps/rejected": -163.94207763671875, "loss": 0.5985, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8634767532348633, "rewards/margins": 0.27716079354286194, "rewards/rejected": -1.1406376361846924, "step": 10730 }, { "epoch": 1.8504479669193659, "grad_norm": 19.155048370361328, "learning_rate": 3.844717078106394e-08, "logits/chosen": -2.0987420082092285, "logits/rejected": -2.0827109813690186, "logps/chosen": -146.81689453125, "logps/rejected": -172.4161834716797, "loss": 0.618, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9212657809257507, "rewards/margins": 0.2513931393623352, "rewards/rejected": -1.1726588010787964, "step": 10740 }, { "epoch": 1.852170916609235, "grad_norm": 13.430131912231445, "learning_rate": 3.8349664321218135e-08, "logits/chosen": -2.100236415863037, "logits/rejected": -2.0692098140716553, "logps/chosen": -138.74395751953125, "logps/rejected": -175.67669677734375, "loss": 0.5804, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8668569326400757, "rewards/margins": 0.36326250433921814, "rewards/rejected": -1.2301194667816162, "step": 10750 }, { "epoch": 1.853893866299104, "grad_norm": 15.672896385192871, "learning_rate": 3.82522046887724e-08, "logits/chosen": -2.1090798377990723, "logits/rejected": -2.0927541255950928, "logps/chosen": -141.32620239257812, "logps/rejected": -165.01124572753906, "loss": 0.6254, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8764539957046509, "rewards/margins": 0.24579842388629913, "rewards/rejected": -1.1222522258758545, "step": 10760 }, { "epoch": 1.8556168159889732, "grad_norm": 15.11644458770752, "learning_rate": 3.815479227545633e-08, "logits/chosen": -2.0915427207946777, "logits/rejected": -2.077320098876953, "logps/chosen": -145.64175415039062, "logps/rejected": -170.94107055664062, "loss": 0.6092, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8857005834579468, "rewards/margins": 0.2778187692165375, "rewards/rejected": -1.1635195016860962, "step": 10770 }, { "epoch": 1.8573397656788422, "grad_norm": 14.661605834960938, "learning_rate": 3.8057427472809736e-08, "logits/chosen": -2.2210752964019775, "logits/rejected": -2.2000391483306885, "logps/chosen": -143.1808319091797, "logps/rejected": -173.306884765625, "loss": 0.615, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.900111973285675, "rewards/margins": 0.282103955745697, "rewards/rejected": -1.182215929031372, "step": 10780 }, { "epoch": 1.8590627153687111, "grad_norm": 20.410158157348633, "learning_rate": 3.796011067218101e-08, "logits/chosen": -2.2542407512664795, "logits/rejected": -2.22265362739563, "logps/chosen": -134.27561950683594, "logps/rejected": -168.1732940673828, "loss": 0.5906, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8193452954292297, "rewards/margins": 0.3283326327800751, "rewards/rejected": -1.1476778984069824, "step": 10790 }, { "epoch": 1.8607856650585803, "grad_norm": 17.53208351135254, "learning_rate": 3.786284226472565e-08, "logits/chosen": -2.238175630569458, "logits/rejected": -2.21584153175354, "logps/chosen": -139.04769897460938, "logps/rejected": -171.2996368408203, "loss": 0.5999, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8708834648132324, "rewards/margins": 0.3029094934463501, "rewards/rejected": -1.173792839050293, "step": 10800 }, { "epoch": 1.8607856650585803, "eval_logits/chosen": -2.2513372898101807, "eval_logits/rejected": -2.2412359714508057, "eval_logps/chosen": -133.3793487548828, "eval_logps/rejected": -154.22219848632812, "eval_loss": 0.6425167322158813, "eval_rewards/accuracies": 0.6324349641799927, "eval_rewards/chosen": -0.7466743588447571, "eval_rewards/margins": 0.16374626755714417, "eval_rewards/rejected": -0.9104207158088684, "eval_runtime": 384.8792, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 10800 }, { "epoch": 1.8625086147484493, "grad_norm": 14.047548294067383, "learning_rate": 3.776562264140464e-08, "logits/chosen": -2.175610065460205, "logits/rejected": -2.134432792663574, "logps/chosen": -148.88336181640625, "logps/rejected": -170.747802734375, "loss": 0.6047, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9150352478027344, "rewards/margins": 0.291339248418808, "rewards/rejected": -1.2063745260238647, "step": 10810 }, { "epoch": 1.8642315644383185, "grad_norm": 14.999090194702148, "learning_rate": 3.766845219298291e-08, "logits/chosen": -2.1174261569976807, "logits/rejected": -2.0932211875915527, "logps/chosen": -133.87582397460938, "logps/rejected": -163.33006286621094, "loss": 0.5975, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8078087568283081, "rewards/margins": 0.3114509880542755, "rewards/rejected": -1.1192595958709717, "step": 10820 }, { "epoch": 1.8659545141281875, "grad_norm": 17.85299301147461, "learning_rate": 3.757133131002764e-08, "logits/chosen": -2.1383566856384277, "logits/rejected": -2.1101975440979004, "logps/chosen": -141.75344848632812, "logps/rejected": -170.37002563476562, "loss": 0.61, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8993728756904602, "rewards/margins": 0.2712690234184265, "rewards/rejected": -1.1706418991088867, "step": 10830 }, { "epoch": 1.8676774638180564, "grad_norm": 15.568462371826172, "learning_rate": 3.747426038290689e-08, "logits/chosen": -2.1676125526428223, "logits/rejected": -2.1427371501922607, "logps/chosen": -139.57675170898438, "logps/rejected": -160.5889892578125, "loss": 0.6327, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8658591508865356, "rewards/margins": 0.23096327483654022, "rewards/rejected": -1.0968225002288818, "step": 10840 }, { "epoch": 1.8694004135079254, "grad_norm": 15.441035270690918, "learning_rate": 3.737723980178786e-08, "logits/chosen": -2.0913443565368652, "logits/rejected": -2.072479724884033, "logps/chosen": -133.4951629638672, "logps/rejected": -166.19149780273438, "loss": 0.6032, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8333438634872437, "rewards/margins": 0.28401896357536316, "rewards/rejected": -1.1173628568649292, "step": 10850 }, { "epoch": 1.8711233631977946, "grad_norm": 18.331331253051758, "learning_rate": 3.7280269956635414e-08, "logits/chosen": -2.214963436126709, "logits/rejected": -2.1811411380767822, "logps/chosen": -142.8532257080078, "logps/rejected": -168.93576049804688, "loss": 0.6092, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8759439587593079, "rewards/margins": 0.289278119802475, "rewards/rejected": -1.16522216796875, "step": 10860 }, { "epoch": 1.8728463128876638, "grad_norm": 11.792555809020996, "learning_rate": 3.718335123721054e-08, "logits/chosen": -2.0742616653442383, "logits/rejected": -2.0635597705841064, "logps/chosen": -130.9443359375, "logps/rejected": -172.99917602539062, "loss": 0.5815, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8045860528945923, "rewards/margins": 0.3647902309894562, "rewards/rejected": -1.1693761348724365, "step": 10870 }, { "epoch": 1.8745692625775328, "grad_norm": 15.975428581237793, "learning_rate": 3.708648403306859e-08, "logits/chosen": -2.140714168548584, "logits/rejected": -2.1191792488098145, "logps/chosen": -140.34561157226562, "logps/rejected": -169.61854553222656, "loss": 0.5928, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8613649606704712, "rewards/margins": 0.3078747093677521, "rewards/rejected": -1.1692396402359009, "step": 10880 }, { "epoch": 1.8762922122674017, "grad_norm": 13.549330711364746, "learning_rate": 3.698966873355802e-08, "logits/chosen": -2.1803011894226074, "logits/rejected": -2.1594996452331543, "logps/chosen": -140.63050842285156, "logps/rejected": -171.95518493652344, "loss": 0.6015, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8747009038925171, "rewards/margins": 0.29660362005233765, "rewards/rejected": -1.17130446434021, "step": 10890 }, { "epoch": 1.8780151619572707, "grad_norm": 16.400907516479492, "learning_rate": 3.6892905727818544e-08, "logits/chosen": -2.2359566688537598, "logits/rejected": -2.2015864849090576, "logps/chosen": -140.0823211669922, "logps/rejected": -163.5909423828125, "loss": 0.6016, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8508996963500977, "rewards/margins": 0.2789711058139801, "rewards/rejected": -1.1298710107803345, "step": 10900 }, { "epoch": 1.8780151619572707, "eval_logits/chosen": -2.2419686317443848, "eval_logits/rejected": -2.231727123260498, "eval_logps/chosen": -134.16761779785156, "eval_logps/rejected": -155.1724853515625, "eval_loss": 0.6423208117485046, "eval_rewards/accuracies": 0.6342936754226685, "eval_rewards/chosen": -0.7545571327209473, "eval_rewards/margins": 0.16536663472652435, "eval_rewards/rejected": -0.9199238419532776, "eval_runtime": 385.208, "eval_samples_per_second": 11.173, "eval_steps_per_second": 1.397, "step": 10900 }, { "epoch": 1.8797381116471399, "grad_norm": 16.359081268310547, "learning_rate": 3.679619540477975e-08, "logits/chosen": -2.1347248554229736, "logits/rejected": -2.1046249866485596, "logps/chosen": -140.82342529296875, "logps/rejected": -164.4052734375, "loss": 0.6179, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8515699505805969, "rewards/margins": 0.25068145990371704, "rewards/rejected": -1.1022512912750244, "step": 10910 }, { "epoch": 1.881461061337009, "grad_norm": 17.925527572631836, "learning_rate": 3.669953815315943e-08, "logits/chosen": -2.1079506874084473, "logits/rejected": -2.082575559616089, "logps/chosen": -147.4144744873047, "logps/rejected": -174.2635040283203, "loss": 0.5955, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9157683253288269, "rewards/margins": 0.297660768032074, "rewards/rejected": -1.2134290933609009, "step": 10920 }, { "epoch": 1.883184011026878, "grad_norm": 19.56711769104004, "learning_rate": 3.6602934361462065e-08, "logits/chosen": -2.0662264823913574, "logits/rejected": -2.0382227897644043, "logps/chosen": -146.7861785888672, "logps/rejected": -167.21250915527344, "loss": 0.628, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9257417917251587, "rewards/margins": 0.2315080463886261, "rewards/rejected": -1.1572496891021729, "step": 10930 }, { "epoch": 1.884906960716747, "grad_norm": 20.01211929321289, "learning_rate": 3.6506384417977314e-08, "logits/chosen": -2.042912006378174, "logits/rejected": -2.025900363922119, "logps/chosen": -151.1432647705078, "logps/rejected": -172.36669921875, "loss": 0.6256, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9468406438827515, "rewards/margins": 0.22510845959186554, "rewards/rejected": -1.171949028968811, "step": 10940 }, { "epoch": 1.886629910406616, "grad_norm": 22.662534713745117, "learning_rate": 3.6409888710778344e-08, "logits/chosen": -2.102442979812622, "logits/rejected": -2.083610773086548, "logps/chosen": -150.12399291992188, "logps/rejected": -174.45953369140625, "loss": 0.6211, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9551755785942078, "rewards/margins": 0.24247443675994873, "rewards/rejected": -1.1976499557495117, "step": 10950 }, { "epoch": 1.8883528600964852, "grad_norm": 19.54989242553711, "learning_rate": 3.631344762772034e-08, "logits/chosen": -2.156007766723633, "logits/rejected": -2.130568027496338, "logps/chosen": -146.7346649169922, "logps/rejected": -178.57745361328125, "loss": 0.6094, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9292421340942383, "rewards/margins": 0.3307949900627136, "rewards/rejected": -1.2600371837615967, "step": 10960 }, { "epoch": 1.8900758097863544, "grad_norm": 18.44040870666504, "learning_rate": 3.621706155643891e-08, "logits/chosen": -2.164633274078369, "logits/rejected": -2.131767749786377, "logps/chosen": -146.42147827148438, "logps/rejected": -175.13168334960938, "loss": 0.6094, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8955854177474976, "rewards/margins": 0.31462064385414124, "rewards/rejected": -1.2102060317993164, "step": 10970 }, { "epoch": 1.8917987594762233, "grad_norm": 21.46990394592285, "learning_rate": 3.612073088434858e-08, "logits/chosen": -2.162540912628174, "logits/rejected": -2.1441233158111572, "logps/chosen": -150.9958038330078, "logps/rejected": -184.1426544189453, "loss": 0.617, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9642265439033508, "rewards/margins": 0.31541356444358826, "rewards/rejected": -1.2796400785446167, "step": 10980 }, { "epoch": 1.8935217091660923, "grad_norm": 17.002307891845703, "learning_rate": 3.6024455998641206e-08, "logits/chosen": -2.082855701446533, "logits/rejected": -2.065614700317383, "logps/chosen": -139.9417266845703, "logps/rejected": -170.90518188476562, "loss": 0.6009, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8745418787002563, "rewards/margins": 0.30955320596694946, "rewards/rejected": -1.1840951442718506, "step": 10990 }, { "epoch": 1.8952446588559613, "grad_norm": 15.544243812561035, "learning_rate": 3.592823728628439e-08, "logits/chosen": -2.268205165863037, "logits/rejected": -2.255527973175049, "logps/chosen": -145.85581970214844, "logps/rejected": -180.926513671875, "loss": 0.6056, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9296669960021973, "rewards/margins": 0.30975398421287537, "rewards/rejected": -1.239421010017395, "step": 11000 }, { "epoch": 1.8952446588559613, "eval_logits/chosen": -2.2438161373138428, "eval_logits/rejected": -2.2336204051971436, "eval_logps/chosen": -133.00901794433594, "eval_logps/rejected": -153.91583251953125, "eval_loss": 0.6424133777618408, "eval_rewards/accuracies": 0.6303438544273376, "eval_rewards/chosen": -0.7429712414741516, "eval_rewards/margins": 0.1643858253955841, "eval_rewards/rejected": -0.9073571562767029, "eval_runtime": 384.881, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 11000 }, { "epoch": 1.8969676085458305, "grad_norm": 13.385296821594238, "learning_rate": 3.5832075134019955e-08, "logits/chosen": -2.1892457008361816, "logits/rejected": -2.156001329421997, "logps/chosen": -139.17245483398438, "logps/rejected": -173.08497619628906, "loss": 0.582, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8596612215042114, "rewards/margins": 0.3564857840538025, "rewards/rejected": -1.2161470651626587, "step": 11010 }, { "epoch": 1.8986905582356997, "grad_norm": 15.220624923706055, "learning_rate": 3.573596992836239e-08, "logits/chosen": -2.2058544158935547, "logits/rejected": -2.1847777366638184, "logps/chosen": -144.6665496826172, "logps/rejected": -169.0140838623047, "loss": 0.6104, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8966256380081177, "rewards/margins": 0.263113409280777, "rewards/rejected": -1.1597388982772827, "step": 11020 }, { "epoch": 1.9004135079255686, "grad_norm": 16.45634651184082, "learning_rate": 3.5639922055597306e-08, "logits/chosen": -2.174046754837036, "logits/rejected": -2.1644673347473145, "logps/chosen": -148.73912048339844, "logps/rejected": -175.10986328125, "loss": 0.6189, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9395090937614441, "rewards/margins": 0.2666943371295929, "rewards/rejected": -1.2062033414840698, "step": 11030 }, { "epoch": 1.9021364576154376, "grad_norm": 13.699787139892578, "learning_rate": 3.5543931901779855e-08, "logits/chosen": -2.203010082244873, "logits/rejected": -2.183018207550049, "logps/chosen": -151.67031860351562, "logps/rejected": -178.61093139648438, "loss": 0.6112, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9520352482795715, "rewards/margins": 0.27866148948669434, "rewards/rejected": -1.230696678161621, "step": 11040 }, { "epoch": 1.9038594073053066, "grad_norm": 16.1466007232666, "learning_rate": 3.544799985273321e-08, "logits/chosen": -2.1286892890930176, "logits/rejected": -2.1035001277923584, "logps/chosen": -133.95420837402344, "logps/rejected": -173.42544555664062, "loss": 0.5702, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8199006915092468, "rewards/margins": 0.36169666051864624, "rewards/rejected": -1.181597352027893, "step": 11050 }, { "epoch": 1.9055823569951758, "grad_norm": 16.861194610595703, "learning_rate": 3.535212629404697e-08, "logits/chosen": -2.1597084999084473, "logits/rejected": -2.1142749786376953, "logps/chosen": -144.04820251464844, "logps/rejected": -177.36911010742188, "loss": 0.5775, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8977333903312683, "rewards/margins": 0.358584463596344, "rewards/rejected": -1.2563177347183228, "step": 11060 }, { "epoch": 1.907305306685045, "grad_norm": 14.4827299118042, "learning_rate": 3.525631161107564e-08, "logits/chosen": -2.1735339164733887, "logits/rejected": -2.125136613845825, "logps/chosen": -137.1099395751953, "logps/rejected": -175.73214721679688, "loss": 0.5527, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8393446207046509, "rewards/margins": 0.4223240911960602, "rewards/rejected": -1.2616688013076782, "step": 11070 }, { "epoch": 1.909028256374914, "grad_norm": 18.573715209960938, "learning_rate": 3.516055618893712e-08, "logits/chosen": -2.1537680625915527, "logits/rejected": -2.1253244876861572, "logps/chosen": -150.51162719726562, "logps/rejected": -182.40243530273438, "loss": 0.5821, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9500817060470581, "rewards/margins": 0.3490012288093567, "rewards/rejected": -1.2990829944610596, "step": 11080 }, { "epoch": 1.9107512060647829, "grad_norm": 16.755136489868164, "learning_rate": 3.50648604125111e-08, "logits/chosen": -2.1314380168914795, "logits/rejected": -2.1078097820281982, "logps/chosen": -154.7153778076172, "logps/rejected": -176.64910888671875, "loss": 0.6102, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9689188003540039, "rewards/margins": 0.27030545473098755, "rewards/rejected": -1.2392241954803467, "step": 11090 }, { "epoch": 1.9124741557546519, "grad_norm": 16.401973724365234, "learning_rate": 3.496922466643748e-08, "logits/chosen": -2.033411741256714, "logits/rejected": -2.0170555114746094, "logps/chosen": -137.7972412109375, "logps/rejected": -168.44577026367188, "loss": 0.6068, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8691918253898621, "rewards/margins": 0.27503761649131775, "rewards/rejected": -1.1442296504974365, "step": 11100 }, { "epoch": 1.9124741557546519, "eval_logits/chosen": -2.227536916732788, "eval_logits/rejected": -2.216994047164917, "eval_logps/chosen": -136.35061645507812, "eval_logps/rejected": -157.85231018066406, "eval_loss": 0.6415258646011353, "eval_rewards/accuracies": 0.63150554895401, "eval_rewards/chosen": -0.7763871550559998, "eval_rewards/margins": 0.17033463716506958, "eval_rewards/rejected": -0.9467218518257141, "eval_runtime": 385.151, "eval_samples_per_second": 11.175, "eval_steps_per_second": 1.397, "step": 11100 }, { "epoch": 1.914197105444521, "grad_norm": 14.058769226074219, "learning_rate": 3.487364933511494e-08, "logits/chosen": -2.1420087814331055, "logits/rejected": -2.11448073387146, "logps/chosen": -155.005126953125, "logps/rejected": -193.01858520507812, "loss": 0.5854, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0110347270965576, "rewards/margins": 0.38394156098365784, "rewards/rejected": -1.3949763774871826, "step": 11110 }, { "epoch": 1.9159200551343902, "grad_norm": 16.759851455688477, "learning_rate": 3.4778134802699274e-08, "logits/chosen": -2.240304470062256, "logits/rejected": -2.2136197090148926, "logps/chosen": -155.89695739746094, "logps/rejected": -177.0760955810547, "loss": 0.6157, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9638206362724304, "rewards/margins": 0.2701886296272278, "rewards/rejected": -1.2340091466903687, "step": 11120 }, { "epoch": 1.9176430048242592, "grad_norm": 18.060779571533203, "learning_rate": 3.4682681453101966e-08, "logits/chosen": -2.0726406574249268, "logits/rejected": -2.0467450618743896, "logps/chosen": -156.54104614257812, "logps/rejected": -178.5406494140625, "loss": 0.6153, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0066497325897217, "rewards/margins": 0.24690060317516327, "rewards/rejected": -1.2535502910614014, "step": 11130 }, { "epoch": 1.9193659545141282, "grad_norm": 15.363555908203125, "learning_rate": 3.458728966998853e-08, "logits/chosen": -2.0575735569000244, "logits/rejected": -2.0361971855163574, "logps/chosen": -143.7605438232422, "logps/rejected": -169.78445434570312, "loss": 0.6049, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8994717597961426, "rewards/margins": 0.28259962797164917, "rewards/rejected": -1.1820714473724365, "step": 11140 }, { "epoch": 1.9210889042039971, "grad_norm": 18.13688087463379, "learning_rate": 3.4491959836777025e-08, "logits/chosen": -2.119196653366089, "logits/rejected": -2.104414701461792, "logps/chosen": -155.87875366210938, "logps/rejected": -178.13723754882812, "loss": 0.6269, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9778518676757812, "rewards/margins": 0.2384147346019745, "rewards/rejected": -1.2162667512893677, "step": 11150 }, { "epoch": 1.9228118538938663, "grad_norm": 17.07344627380371, "learning_rate": 3.439669233663651e-08, "logits/chosen": -2.1884121894836426, "logits/rejected": -2.168919563293457, "logps/chosen": -143.20266723632812, "logps/rejected": -168.40750122070312, "loss": 0.615, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9108399152755737, "rewards/margins": 0.2630824148654938, "rewards/rejected": -1.1739223003387451, "step": 11160 }, { "epoch": 1.9245348035837355, "grad_norm": 17.23695182800293, "learning_rate": 3.430148755248552e-08, "logits/chosen": -2.0476460456848145, "logits/rejected": -2.020421028137207, "logps/chosen": -147.83729553222656, "logps/rejected": -162.50868225097656, "loss": 0.6281, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9067513346672058, "rewards/margins": 0.22811265289783478, "rewards/rejected": -1.1348638534545898, "step": 11170 }, { "epoch": 1.9262577532736045, "grad_norm": 14.98134994506836, "learning_rate": 3.4206345866990535e-08, "logits/chosen": -2.170823335647583, "logits/rejected": -2.146702289581299, "logps/chosen": -153.30392456054688, "logps/rejected": -179.1726531982422, "loss": 0.6105, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9933806657791138, "rewards/margins": 0.2901056408882141, "rewards/rejected": -1.2834863662719727, "step": 11180 }, { "epoch": 1.9279807029634735, "grad_norm": 14.643978118896484, "learning_rate": 3.41112676625643e-08, "logits/chosen": -2.1666550636291504, "logits/rejected": -2.133633852005005, "logps/chosen": -143.29917907714844, "logps/rejected": -176.61819458007812, "loss": 0.5721, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8841732144355774, "rewards/margins": 0.3656248450279236, "rewards/rejected": -1.249798059463501, "step": 11190 }, { "epoch": 1.9297036526533424, "grad_norm": 18.589616775512695, "learning_rate": 3.401625332136455e-08, "logits/chosen": -2.164454936981201, "logits/rejected": -2.141761064529419, "logps/chosen": -142.15316772460938, "logps/rejected": -173.24183654785156, "loss": 0.5907, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8884008526802063, "rewards/margins": 0.30655401945114136, "rewards/rejected": -1.194954752922058, "step": 11200 }, { "epoch": 1.9297036526533424, "eval_logits/chosen": -2.2258594036102295, "eval_logits/rejected": -2.215369701385498, "eval_logps/chosen": -135.14556884765625, "eval_logps/rejected": -156.5322723388672, "eval_loss": 0.6416376233100891, "eval_rewards/accuracies": 0.6324349641799927, "eval_rewards/chosen": -0.7643365859985352, "eval_rewards/margins": 0.16918493807315826, "eval_rewards/rejected": -0.933521568775177, "eval_runtime": 385.3102, "eval_samples_per_second": 11.17, "eval_steps_per_second": 1.396, "step": 11200 }, { "epoch": 1.9314266023432116, "grad_norm": 13.645044326782227, "learning_rate": 3.3921303225292226e-08, "logits/chosen": -2.0335795879364014, "logits/rejected": -2.011505126953125, "logps/chosen": -138.0536651611328, "logps/rejected": -178.84747314453125, "loss": 0.5795, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8764287829399109, "rewards/margins": 0.3725907802581787, "rewards/rejected": -1.2490196228027344, "step": 11210 }, { "epoch": 1.9331495520330806, "grad_norm": 19.461091995239258, "learning_rate": 3.382641775599008e-08, "logits/chosen": -2.1198267936706543, "logits/rejected": -2.101374864578247, "logps/chosen": -143.53126525878906, "logps/rejected": -176.012939453125, "loss": 0.6188, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9148159027099609, "rewards/margins": 0.2884688079357147, "rewards/rejected": -1.203284740447998, "step": 11220 }, { "epoch": 1.9348725017229498, "grad_norm": 15.754158973693848, "learning_rate": 3.373159729484113e-08, "logits/chosen": -2.0857930183410645, "logits/rejected": -2.0724740028381348, "logps/chosen": -158.6096649169922, "logps/rejected": -175.16921997070312, "loss": 0.6504, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0284069776535034, "rewards/margins": 0.20803245902061462, "rewards/rejected": -1.2364394664764404, "step": 11230 }, { "epoch": 1.9365954514128187, "grad_norm": 13.510889053344727, "learning_rate": 3.363684222296704e-08, "logits/chosen": -2.097597122192383, "logits/rejected": -2.078786849975586, "logps/chosen": -145.527587890625, "logps/rejected": -172.16566467285156, "loss": 0.6113, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9087750315666199, "rewards/margins": 0.2784481942653656, "rewards/rejected": -1.187223196029663, "step": 11240 }, { "epoch": 1.9383184011026877, "grad_norm": 14.932703018188477, "learning_rate": 3.3542152921226686e-08, "logits/chosen": -2.1237001419067383, "logits/rejected": -2.0975117683410645, "logps/chosen": -142.00363159179688, "logps/rejected": -175.05258178710938, "loss": 0.5852, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8771541714668274, "rewards/margins": 0.33710870146751404, "rewards/rejected": -1.2142629623413086, "step": 11250 }, { "epoch": 1.940041350792557, "grad_norm": 14.702116966247559, "learning_rate": 3.3447529770214565e-08, "logits/chosen": -2.0696117877960205, "logits/rejected": -2.0362331867218018, "logps/chosen": -146.04872131347656, "logps/rejected": -165.2068328857422, "loss": 0.6224, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8821519613265991, "rewards/margins": 0.2543164789676666, "rewards/rejected": -1.1364682912826538, "step": 11260 }, { "epoch": 1.9417643004824259, "grad_norm": 15.495363235473633, "learning_rate": 3.335297315025935e-08, "logits/chosen": -2.1114563941955566, "logits/rejected": -2.079831600189209, "logps/chosen": -142.7719268798828, "logps/rejected": -172.2384033203125, "loss": 0.5796, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8720538020133972, "rewards/margins": 0.3399291932582855, "rewards/rejected": -1.2119829654693604, "step": 11270 }, { "epoch": 1.943487250172295, "grad_norm": 14.526769638061523, "learning_rate": 3.325848344142219e-08, "logits/chosen": -2.1580450534820557, "logits/rejected": -2.1280298233032227, "logps/chosen": -146.6006317138672, "logps/rejected": -165.89434814453125, "loss": 0.626, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9028850793838501, "rewards/margins": 0.23194792866706848, "rewards/rejected": -1.1348328590393066, "step": 11280 }, { "epoch": 1.945210199862164, "grad_norm": 16.603395462036133, "learning_rate": 3.3164061023495385e-08, "logits/chosen": -2.146371364593506, "logits/rejected": -2.1256675720214844, "logps/chosen": -144.02137756347656, "logps/rejected": -173.77401733398438, "loss": 0.5826, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8406258821487427, "rewards/margins": 0.33569806814193726, "rewards/rejected": -1.1763238906860352, "step": 11290 }, { "epoch": 1.946933149552033, "grad_norm": 16.55389404296875, "learning_rate": 3.306970627600073e-08, "logits/chosen": -2.096113920211792, "logits/rejected": -2.0719850063323975, "logps/chosen": -154.57101440429688, "logps/rejected": -167.23641967773438, "loss": 0.6504, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.956468939781189, "rewards/margins": 0.1787852793931961, "rewards/rejected": -1.135254144668579, "step": 11300 }, { "epoch": 1.946933149552033, "eval_logits/chosen": -2.22763991355896, "eval_logits/rejected": -2.217198610305786, "eval_logps/chosen": -133.49484252929688, "eval_logps/rejected": -154.6342010498047, "eval_loss": 0.641975462436676, "eval_rewards/accuracies": 0.6289498209953308, "eval_rewards/chosen": -0.7478294372558594, "eval_rewards/margins": 0.16671136021614075, "eval_rewards/rejected": -0.9145408272743225, "eval_runtime": 385.113, "eval_samples_per_second": 11.176, "eval_steps_per_second": 1.397, "step": 11300 }, { "epoch": 1.948656099241902, "grad_norm": 16.611129760742188, "learning_rate": 3.297541957818801e-08, "logits/chosen": -2.2009270191192627, "logits/rejected": -2.1890411376953125, "logps/chosen": -136.6061553955078, "logps/rejected": -174.31503295898438, "loss": 0.5741, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8614300489425659, "rewards/margins": 0.3664931654930115, "rewards/rejected": -1.2279231548309326, "step": 11310 }, { "epoch": 1.9503790489317712, "grad_norm": 12.560344696044922, "learning_rate": 3.2881201309033555e-08, "logits/chosen": -2.132798671722412, "logits/rejected": -2.1023151874542236, "logps/chosen": -139.7478485107422, "logps/rejected": -173.36195373535156, "loss": 0.5824, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8482630848884583, "rewards/margins": 0.34626397490501404, "rewards/rejected": -1.1945271492004395, "step": 11320 }, { "epoch": 1.9521019986216404, "grad_norm": 22.273035049438477, "learning_rate": 3.278705184723856e-08, "logits/chosen": -2.1342053413391113, "logits/rejected": -2.0999293327331543, "logps/chosen": -148.65792846679688, "logps/rejected": -175.73095703125, "loss": 0.603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9609923362731934, "rewards/margins": 0.3149716854095459, "rewards/rejected": -1.2759640216827393, "step": 11330 }, { "epoch": 1.9538249483115093, "grad_norm": 16.607248306274414, "learning_rate": 3.2692971571227705e-08, "logits/chosen": -2.0323688983917236, "logits/rejected": -2.0056400299072266, "logps/chosen": -150.04843139648438, "logps/rejected": -183.73699951171875, "loss": 0.5977, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9466899037361145, "rewards/margins": 0.3430887460708618, "rewards/rejected": -1.2897785902023315, "step": 11340 }, { "epoch": 1.9555478980013783, "grad_norm": 16.950824737548828, "learning_rate": 3.25989608591476e-08, "logits/chosen": -2.1680474281311035, "logits/rejected": -2.1496119499206543, "logps/chosen": -154.79278564453125, "logps/rejected": -180.25418090820312, "loss": 0.6202, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9788613319396973, "rewards/margins": 0.26319506764411926, "rewards/rejected": -1.2420563697814941, "step": 11350 }, { "epoch": 1.9572708476912473, "grad_norm": 12.987069129943848, "learning_rate": 3.250502008886524e-08, "logits/chosen": -2.130478858947754, "logits/rejected": -2.100525379180908, "logps/chosen": -145.18128967285156, "logps/rejected": -171.57431030273438, "loss": 0.6003, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9292659759521484, "rewards/margins": 0.30089348554611206, "rewards/rejected": -1.2301595211029053, "step": 11360 }, { "epoch": 1.9589937973811165, "grad_norm": 14.670994758605957, "learning_rate": 3.241114963796646e-08, "logits/chosen": -2.097120761871338, "logits/rejected": -2.077298402786255, "logps/chosen": -150.8975830078125, "logps/rejected": -174.4925079345703, "loss": 0.6164, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9462224841117859, "rewards/margins": 0.26242509484291077, "rewards/rejected": -1.2086474895477295, "step": 11370 }, { "epoch": 1.9607167470709856, "grad_norm": 21.320772171020508, "learning_rate": 3.231734988375447e-08, "logits/chosen": -2.0661702156066895, "logits/rejected": -2.0394110679626465, "logps/chosen": -144.27322387695312, "logps/rejected": -174.4996795654297, "loss": 0.6045, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9243995547294617, "rewards/margins": 0.3199179470539093, "rewards/rejected": -1.244317650794983, "step": 11380 }, { "epoch": 1.9624396967608546, "grad_norm": 15.216944694519043, "learning_rate": 3.222362120324837e-08, "logits/chosen": -2.1798717975616455, "logits/rejected": -2.1522154808044434, "logps/chosen": -137.3231658935547, "logps/rejected": -178.4552001953125, "loss": 0.5774, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8499847650527954, "rewards/margins": 0.38140755891799927, "rewards/rejected": -1.23139226436615, "step": 11390 }, { "epoch": 1.9641626464507236, "grad_norm": 17.31537628173828, "learning_rate": 3.2129963973181526e-08, "logits/chosen": -2.141483783721924, "logits/rejected": -2.109914779663086, "logps/chosen": -142.1708526611328, "logps/rejected": -171.01956176757812, "loss": 0.6037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.862584114074707, "rewards/margins": 0.3145027458667755, "rewards/rejected": -1.1770869493484497, "step": 11400 }, { "epoch": 1.9641626464507236, "eval_logits/chosen": -2.2199316024780273, "eval_logits/rejected": -2.209334135055542, "eval_logps/chosen": -134.98611450195312, "eval_logps/rejected": -156.4750213623047, "eval_loss": 0.6413188576698303, "eval_rewards/accuracies": 0.6296468377113342, "eval_rewards/chosen": -0.7627421617507935, "eval_rewards/margins": 0.17020682990550995, "eval_rewards/rejected": -0.9329490661621094, "eval_runtime": 385.0159, "eval_samples_per_second": 11.179, "eval_steps_per_second": 1.397, "step": 11400 }, { "epoch": 1.9658855961405926, "grad_norm": 17.18006134033203, "learning_rate": 3.2036378570000146e-08, "logits/chosen": -2.1379072666168213, "logits/rejected": -2.10851788520813, "logps/chosen": -142.50030517578125, "logps/rejected": -168.09339904785156, "loss": 0.6118, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8681458234786987, "rewards/margins": 0.2620962858200073, "rewards/rejected": -1.130242109298706, "step": 11410 }, { "epoch": 1.9676085458304617, "grad_norm": 15.00400161743164, "learning_rate": 3.1942865369861704e-08, "logits/chosen": -2.1207568645477295, "logits/rejected": -2.0927529335021973, "logps/chosen": -143.63040161132812, "logps/rejected": -164.89695739746094, "loss": 0.6192, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8994544148445129, "rewards/margins": 0.24692866206169128, "rewards/rejected": -1.1463830471038818, "step": 11420 }, { "epoch": 1.969331495520331, "grad_norm": 14.680079460144043, "learning_rate": 3.18494247486335e-08, "logits/chosen": -2.05918025970459, "logits/rejected": -2.0340049266815186, "logps/chosen": -147.0135498046875, "logps/rejected": -172.14688110351562, "loss": 0.6106, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9204323887825012, "rewards/margins": 0.2744022309780121, "rewards/rejected": -1.194834589958191, "step": 11430 }, { "epoch": 1.9710544452102, "grad_norm": 17.141448974609375, "learning_rate": 3.1756057081891104e-08, "logits/chosen": -2.1291680335998535, "logits/rejected": -2.114319324493408, "logps/chosen": -146.44517517089844, "logps/rejected": -166.45254516601562, "loss": 0.6346, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9389501810073853, "rewards/margins": 0.19942371547222137, "rewards/rejected": -1.138373851776123, "step": 11440 }, { "epoch": 1.9727773949000689, "grad_norm": 16.71525764465332, "learning_rate": 3.166276274491684e-08, "logits/chosen": -2.0695135593414307, "logits/rejected": -2.0401968955993652, "logps/chosen": -143.0440216064453, "logps/rejected": -178.1204071044922, "loss": 0.5747, "rewards/accuracies": 0.75, "rewards/chosen": -0.9033585786819458, "rewards/margins": 0.3608400523662567, "rewards/rejected": -1.2641985416412354, "step": 11450 }, { "epoch": 1.9745003445899378, "grad_norm": 16.200048446655273, "learning_rate": 3.156954211269828e-08, "logits/chosen": -2.099626064300537, "logits/rejected": -2.0785412788391113, "logps/chosen": -141.47412109375, "logps/rejected": -174.02854919433594, "loss": 0.5873, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8593629002571106, "rewards/margins": 0.3375614285469055, "rewards/rejected": -1.1969242095947266, "step": 11460 }, { "epoch": 1.976223294279807, "grad_norm": 14.326093673706055, "learning_rate": 3.147639555992677e-08, "logits/chosen": -2.136005401611328, "logits/rejected": -2.10081148147583, "logps/chosen": -152.38046264648438, "logps/rejected": -183.26654052734375, "loss": 0.5767, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9553796052932739, "rewards/margins": 0.3675006926059723, "rewards/rejected": -1.3228802680969238, "step": 11470 }, { "epoch": 1.9779462439696762, "grad_norm": 15.438941955566406, "learning_rate": 3.138332346099587e-08, "logits/chosen": -2.1710891723632812, "logits/rejected": -2.1329033374786377, "logps/chosen": -140.94488525390625, "logps/rejected": -166.95150756835938, "loss": 0.602, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8485042452812195, "rewards/margins": 0.31671398878097534, "rewards/rejected": -1.1652181148529053, "step": 11480 }, { "epoch": 1.9796691936595452, "grad_norm": 18.704729080200195, "learning_rate": 3.129032618999994e-08, "logits/chosen": -2.0882651805877686, "logits/rejected": -2.0632243156433105, "logps/chosen": -155.3004608154297, "logps/rejected": -172.23086547851562, "loss": 0.6475, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0060917139053345, "rewards/margins": 0.20503780245780945, "rewards/rejected": -1.2111294269561768, "step": 11490 }, { "epoch": 1.9813921433494142, "grad_norm": 16.656694412231445, "learning_rate": 3.119740412073252e-08, "logits/chosen": -2.140742778778076, "logits/rejected": -2.1206910610198975, "logps/chosen": -138.21206665039062, "logps/rejected": -158.35801696777344, "loss": 0.6435, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8792376518249512, "rewards/margins": 0.19645270705223083, "rewards/rejected": -1.075690507888794, "step": 11500 }, { "epoch": 1.9813921433494142, "eval_logits/chosen": -2.218370199203491, "eval_logits/rejected": -2.2078094482421875, "eval_logps/chosen": -134.86007690429688, "eval_logps/rejected": -156.327392578125, "eval_loss": 0.6415006518363953, "eval_rewards/accuracies": 0.6301115155220032, "eval_rewards/chosen": -0.761481761932373, "eval_rewards/margins": 0.16999098658561707, "eval_rewards/rejected": -0.9314727783203125, "eval_runtime": 385.129, "eval_samples_per_second": 11.175, "eval_steps_per_second": 1.397, "step": 11500 }, { "epoch": 1.9831150930392831, "grad_norm": 22.941991806030273, "learning_rate": 3.1104557626684884e-08, "logits/chosen": -2.0600409507751465, "logits/rejected": -2.046126365661621, "logps/chosen": -145.99452209472656, "logps/rejected": -175.51980590820312, "loss": 0.6178, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9051405191421509, "rewards/margins": 0.317369282245636, "rewards/rejected": -1.2225099802017212, "step": 11510 }, { "epoch": 1.9848380427291523, "grad_norm": 17.974853515625, "learning_rate": 3.101178708104456e-08, "logits/chosen": -2.1407508850097656, "logits/rejected": -2.088731527328491, "logps/chosen": -147.83425903320312, "logps/rejected": -168.90122985839844, "loss": 0.5985, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8803563117980957, "rewards/margins": 0.308893620967865, "rewards/rejected": -1.189250111579895, "step": 11520 }, { "epoch": 1.9865609924190215, "grad_norm": 12.599196434020996, "learning_rate": 3.091909285669383e-08, "logits/chosen": -2.1182777881622314, "logits/rejected": -2.084608554840088, "logps/chosen": -142.55538940429688, "logps/rejected": -163.006103515625, "loss": 0.6158, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8909363746643066, "rewards/margins": 0.2490203082561493, "rewards/rejected": -1.1399565935134888, "step": 11530 }, { "epoch": 1.9882839421088905, "grad_norm": 16.714204788208008, "learning_rate": 3.082647532620817e-08, "logits/chosen": -2.1595141887664795, "logits/rejected": -2.142829656600952, "logps/chosen": -136.802490234375, "logps/rejected": -180.17520141601562, "loss": 0.5631, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8605507016181946, "rewards/margins": 0.388934463262558, "rewards/rejected": -1.2494851350784302, "step": 11540 }, { "epoch": 1.9900068917987594, "grad_norm": 13.683196067810059, "learning_rate": 3.0733934861854794e-08, "logits/chosen": -2.0978140830993652, "logits/rejected": -2.07684063911438, "logps/chosen": -140.7404327392578, "logps/rejected": -170.95933532714844, "loss": 0.6114, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8941097259521484, "rewards/margins": 0.28719091415405273, "rewards/rejected": -1.1813005208969116, "step": 11550 }, { "epoch": 1.9917298414886284, "grad_norm": 14.191267013549805, "learning_rate": 3.0641471835591184e-08, "logits/chosen": -2.121584177017212, "logits/rejected": -2.091630220413208, "logps/chosen": -149.193359375, "logps/rejected": -171.97373962402344, "loss": 0.6218, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9421037435531616, "rewards/margins": 0.2598298192024231, "rewards/rejected": -1.2019336223602295, "step": 11560 }, { "epoch": 1.9934527911784976, "grad_norm": 17.705841064453125, "learning_rate": 3.054908661906353e-08, "logits/chosen": -2.123671770095825, "logits/rejected": -2.1004276275634766, "logps/chosen": -150.16494750976562, "logps/rejected": -181.62252807617188, "loss": 0.6143, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9605393409729004, "rewards/margins": 0.2986890971660614, "rewards/rejected": -1.2592284679412842, "step": 11570 }, { "epoch": 1.9951757408683668, "grad_norm": 17.302921295166016, "learning_rate": 3.045677958360532e-08, "logits/chosen": -2.17277193069458, "logits/rejected": -2.1514034271240234, "logps/chosen": -147.14431762695312, "logps/rejected": -174.19351196289062, "loss": 0.6134, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.908860981464386, "rewards/margins": 0.27632951736450195, "rewards/rejected": -1.1851904392242432, "step": 11580 }, { "epoch": 1.9968986905582358, "grad_norm": 22.508161544799805, "learning_rate": 3.0364551100235795e-08, "logits/chosen": -2.060063600540161, "logits/rejected": -2.025623083114624, "logps/chosen": -148.91146850585938, "logps/rejected": -160.8529510498047, "loss": 0.6339, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8941003680229187, "rewards/margins": 0.20420987904071808, "rewards/rejected": -1.0983102321624756, "step": 11590 }, { "epoch": 1.9986216402481047, "grad_norm": 13.724039077758789, "learning_rate": 3.027240153965839e-08, "logits/chosen": -2.1192171573638916, "logits/rejected": -2.101248264312744, "logps/chosen": -132.7873077392578, "logps/rejected": -163.21640014648438, "loss": 0.6037, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8233798742294312, "rewards/margins": 0.2709590792655945, "rewards/rejected": -1.0943387746810913, "step": 11600 }, { "epoch": 1.9986216402481047, "eval_logits/chosen": -2.2224414348602295, "eval_logits/rejected": -2.211949348449707, "eval_logps/chosen": -132.96453857421875, "eval_logps/rejected": -154.14678955078125, "eval_loss": 0.641775906085968, "eval_rewards/accuracies": 0.6294144988059998, "eval_rewards/chosen": -0.7425263524055481, "eval_rewards/margins": 0.1671404093503952, "eval_rewards/rejected": -0.9096667170524597, "eval_runtime": 385.3092, "eval_samples_per_second": 11.17, "eval_steps_per_second": 1.396, "step": 11600 }, { "epoch": 2.0003445899379737, "grad_norm": 13.551985740661621, "learning_rate": 3.0180331272259404e-08, "logits/chosen": -2.1045384407043457, "logits/rejected": -2.0812675952911377, "logps/chosen": -137.6370849609375, "logps/rejected": -172.7627716064453, "loss": 0.5953, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8656929731369019, "rewards/margins": 0.34895288944244385, "rewards/rejected": -1.2146458625793457, "step": 11610 }, { "epoch": 2.0020675396278427, "grad_norm": 17.046794891357422, "learning_rate": 3.0088340668106376e-08, "logits/chosen": -2.105328321456909, "logits/rejected": -2.0844876766204834, "logps/chosen": -146.08731079101562, "logps/rejected": -172.68511962890625, "loss": 0.6144, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.892816424369812, "rewards/margins": 0.2675938010215759, "rewards/rejected": -1.1604102849960327, "step": 11620 }, { "epoch": 2.003790489317712, "grad_norm": 16.253496170043945, "learning_rate": 2.999643009694671e-08, "logits/chosen": -2.138611078262329, "logits/rejected": -2.10589861869812, "logps/chosen": -148.37692260742188, "logps/rejected": -173.70901489257812, "loss": 0.5954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9030311703681946, "rewards/margins": 0.3055158853530884, "rewards/rejected": -1.2085471153259277, "step": 11630 }, { "epoch": 2.005513439007581, "grad_norm": 15.551525115966797, "learning_rate": 2.990459992820601e-08, "logits/chosen": -2.163944959640503, "logits/rejected": -2.1408309936523438, "logps/chosen": -139.31124877929688, "logps/rejected": -166.83218383789062, "loss": 0.6001, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8687824010848999, "rewards/margins": 0.2981419563293457, "rewards/rejected": -1.166924238204956, "step": 11640 }, { "epoch": 2.00723638869745, "grad_norm": 14.260374069213867, "learning_rate": 2.981285053098682e-08, "logits/chosen": -2.068153142929077, "logits/rejected": -2.0348410606384277, "logps/chosen": -134.29324340820312, "logps/rejected": -164.4078369140625, "loss": 0.5837, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7966753244400024, "rewards/margins": 0.3361361026763916, "rewards/rejected": -1.132811427116394, "step": 11650 }, { "epoch": 2.008959338387319, "grad_norm": 15.495975494384766, "learning_rate": 2.972118227406698e-08, "logits/chosen": -2.195077896118164, "logits/rejected": -2.1469154357910156, "logps/chosen": -145.08949279785156, "logps/rejected": -173.0511016845703, "loss": 0.594, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9013023376464844, "rewards/margins": 0.3263542354106903, "rewards/rejected": -1.227656602859497, "step": 11660 }, { "epoch": 2.010682288077188, "grad_norm": 15.107081413269043, "learning_rate": 2.9629595525898188e-08, "logits/chosen": -2.106961488723755, "logits/rejected": -2.0719175338745117, "logps/chosen": -144.5116424560547, "logps/rejected": -184.60157775878906, "loss": 0.556, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8905149698257446, "rewards/margins": 0.4163680076599121, "rewards/rejected": -1.3068828582763672, "step": 11670 }, { "epoch": 2.0124052377670574, "grad_norm": 14.7352933883667, "learning_rate": 2.9538090654604596e-08, "logits/chosen": -2.022814989089966, "logits/rejected": -1.9983813762664795, "logps/chosen": -146.06353759765625, "logps/rejected": -184.65882873535156, "loss": 0.5761, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8968318700790405, "rewards/margins": 0.37926506996154785, "rewards/rejected": -1.276097059249878, "step": 11680 }, { "epoch": 2.0141281874569263, "grad_norm": 16.93663787841797, "learning_rate": 2.9446668027981127e-08, "logits/chosen": -2.1414148807525635, "logits/rejected": -2.10644268989563, "logps/chosen": -154.0445556640625, "logps/rejected": -177.6328125, "loss": 0.6096, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9402874112129211, "rewards/margins": 0.293734610080719, "rewards/rejected": -1.2340219020843506, "step": 11690 }, { "epoch": 2.0158511371467953, "grad_norm": 13.309664726257324, "learning_rate": 2.9355328013492255e-08, "logits/chosen": -2.20237398147583, "logits/rejected": -2.169412136077881, "logps/chosen": -141.04112243652344, "logps/rejected": -170.56077575683594, "loss": 0.6036, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8420683145523071, "rewards/margins": 0.2933594584465027, "rewards/rejected": -1.135427713394165, "step": 11700 }, { "epoch": 2.0158511371467953, "eval_logits/chosen": -2.217425584793091, "eval_logits/rejected": -2.206829786300659, "eval_logps/chosen": -133.14979553222656, "eval_logps/rejected": -154.4552764892578, "eval_loss": 0.641371488571167, "eval_rewards/accuracies": 0.6289498209953308, "eval_rewards/chosen": -0.744378924369812, "eval_rewards/margins": 0.16837261617183685, "eval_rewards/rejected": -0.9127516150474548, "eval_runtime": 384.9503, "eval_samples_per_second": 11.181, "eval_steps_per_second": 1.398, "step": 11700 }, { "epoch": 2.0175740868366643, "grad_norm": 20.12196159362793, "learning_rate": 2.926407097827034e-08, "logits/chosen": -2.13209867477417, "logits/rejected": -2.108107805252075, "logps/chosen": -145.1027069091797, "logps/rejected": -169.5911102294922, "loss": 0.6117, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8941985964775085, "rewards/margins": 0.27050337195396423, "rewards/rejected": -1.1647019386291504, "step": 11710 }, { "epoch": 2.0192970365265333, "grad_norm": 16.48675537109375, "learning_rate": 2.917289728911424e-08, "logits/chosen": -2.1714067459106445, "logits/rejected": -2.1503050327301025, "logps/chosen": -147.0916748046875, "logps/rejected": -175.4585723876953, "loss": 0.621, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9353798031806946, "rewards/margins": 0.29182952642440796, "rewards/rejected": -1.2272093296051025, "step": 11720 }, { "epoch": 2.0210199862164027, "grad_norm": 15.128205299377441, "learning_rate": 2.90818073124878e-08, "logits/chosen": -2.1099441051483154, "logits/rejected": -2.0892720222473145, "logps/chosen": -144.11288452148438, "logps/rejected": -182.5670166015625, "loss": 0.577, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8893001675605774, "rewards/margins": 0.36189571022987366, "rewards/rejected": -1.2511956691741943, "step": 11730 }, { "epoch": 2.0227429359062716, "grad_norm": 16.662914276123047, "learning_rate": 2.899080141451836e-08, "logits/chosen": -2.1663658618927, "logits/rejected": -2.1456141471862793, "logps/chosen": -140.3199005126953, "logps/rejected": -172.33314514160156, "loss": 0.5942, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8737931251525879, "rewards/margins": 0.31466561555862427, "rewards/rejected": -1.1884586811065674, "step": 11740 }, { "epoch": 2.0244658855961406, "grad_norm": 12.746676445007324, "learning_rate": 2.8899879960995376e-08, "logits/chosen": -2.179152011871338, "logits/rejected": -2.1720426082611084, "logps/chosen": -131.9429168701172, "logps/rejected": -176.4367218017578, "loss": 0.5689, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8367003202438354, "rewards/margins": 0.37631934881210327, "rewards/rejected": -1.213019609451294, "step": 11750 }, { "epoch": 2.0261888352860096, "grad_norm": 14.332768440246582, "learning_rate": 2.8809043317368876e-08, "logits/chosen": -2.1324727535247803, "logits/rejected": -2.1096091270446777, "logps/chosen": -141.2311553955078, "logps/rejected": -174.8484344482422, "loss": 0.5843, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8643898963928223, "rewards/margins": 0.34912070631980896, "rewards/rejected": -1.2135106325149536, "step": 11760 }, { "epoch": 2.0279117849758785, "grad_norm": 14.983498573303223, "learning_rate": 2.871829184874795e-08, "logits/chosen": -2.0761077404022217, "logits/rejected": -2.038146734237671, "logps/chosen": -144.0665283203125, "logps/rejected": -173.80783081054688, "loss": 0.5874, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8728230595588684, "rewards/margins": 0.34624117612838745, "rewards/rejected": -1.2190642356872559, "step": 11770 }, { "epoch": 2.029634734665748, "grad_norm": 19.010204315185547, "learning_rate": 2.8627625919899363e-08, "logits/chosen": -2.0318408012390137, "logits/rejected": -2.018629550933838, "logps/chosen": -141.2653350830078, "logps/rejected": -167.12208557128906, "loss": 0.6255, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9088839292526245, "rewards/margins": 0.24204608798027039, "rewards/rejected": -1.1509300470352173, "step": 11780 }, { "epoch": 2.031357684355617, "grad_norm": 15.42205810546875, "learning_rate": 2.8537045895246103e-08, "logits/chosen": -2.103947162628174, "logits/rejected": -2.096266269683838, "logps/chosen": -131.54701232910156, "logps/rejected": -182.22183227539062, "loss": 0.5338, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8216150999069214, "rewards/margins": 0.4606616497039795, "rewards/rejected": -1.2822768688201904, "step": 11790 }, { "epoch": 2.033080634045486, "grad_norm": 14.78586483001709, "learning_rate": 2.8446552138865797e-08, "logits/chosen": -2.1057920455932617, "logits/rejected": -2.0848259925842285, "logps/chosen": -150.41708374023438, "logps/rejected": -172.26278686523438, "loss": 0.6111, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9330343008041382, "rewards/margins": 0.2561315894126892, "rewards/rejected": -1.1891659498214722, "step": 11800 }, { "epoch": 2.033080634045486, "eval_logits/chosen": -2.2025508880615234, "eval_logits/rejected": -2.1917428970336914, "eval_logps/chosen": -135.81239318847656, "eval_logps/rejected": -157.57235717773438, "eval_loss": 0.6407946944236755, "eval_rewards/accuracies": 0.6284851431846619, "eval_rewards/chosen": -0.7710050940513611, "eval_rewards/margins": 0.172917440533638, "eval_rewards/rejected": -0.9439225196838379, "eval_runtime": 385.222, "eval_samples_per_second": 11.173, "eval_steps_per_second": 1.397, "step": 11800 }, { "epoch": 2.034803583735355, "grad_norm": 17.793256759643555, "learning_rate": 2.8356145014489408e-08, "logits/chosen": -2.0853958129882812, "logits/rejected": -2.0657572746276855, "logps/chosen": -152.5973358154297, "logps/rejected": -180.66957092285156, "loss": 0.6063, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9659139513969421, "rewards/margins": 0.31285327672958374, "rewards/rejected": -1.2787672281265259, "step": 11810 }, { "epoch": 2.036526533425224, "grad_norm": 13.064412117004395, "learning_rate": 2.8265824885499605e-08, "logits/chosen": -2.099360942840576, "logits/rejected": -2.0810630321502686, "logps/chosen": -142.74703979492188, "logps/rejected": -175.0826873779297, "loss": 0.6044, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9288276433944702, "rewards/margins": 0.30582544207572937, "rewards/rejected": -1.2346532344818115, "step": 11820 }, { "epoch": 2.0382494831150932, "grad_norm": 19.97876739501953, "learning_rate": 2.817559211492948e-08, "logits/chosen": -2.069870948791504, "logits/rejected": -2.054990291595459, "logps/chosen": -138.44482421875, "logps/rejected": -176.4115753173828, "loss": 0.5864, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8740692138671875, "rewards/margins": 0.34462273120880127, "rewards/rejected": -1.2186918258666992, "step": 11830 }, { "epoch": 2.039972432804962, "grad_norm": 15.733636856079102, "learning_rate": 2.80854470654609e-08, "logits/chosen": -2.0899147987365723, "logits/rejected": -2.064992666244507, "logps/chosen": -140.33624267578125, "logps/rejected": -176.60333251953125, "loss": 0.5735, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8811368942260742, "rewards/margins": 0.3659118115901947, "rewards/rejected": -1.2470486164093018, "step": 11840 }, { "epoch": 2.041695382494831, "grad_norm": 19.335407257080078, "learning_rate": 2.7995390099423217e-08, "logits/chosen": -2.023524522781372, "logits/rejected": -1.9834750890731812, "logps/chosen": -151.14559936523438, "logps/rejected": -171.8472900390625, "loss": 0.6205, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9637249112129211, "rewards/margins": 0.2654620110988617, "rewards/rejected": -1.22918701171875, "step": 11850 }, { "epoch": 2.0434183321847, "grad_norm": 17.52025604248047, "learning_rate": 2.7905421578791754e-08, "logits/chosen": -2.120047092437744, "logits/rejected": -2.1043002605438232, "logps/chosen": -160.3626251220703, "logps/rejected": -183.46060180664062, "loss": 0.6507, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0549418926239014, "rewards/margins": 0.20423094928264618, "rewards/rejected": -1.259173035621643, "step": 11860 }, { "epoch": 2.045141281874569, "grad_norm": 16.2296142578125, "learning_rate": 2.7815541865186215e-08, "logits/chosen": -2.0508406162261963, "logits/rejected": -2.034073829650879, "logps/chosen": -139.08230590820312, "logps/rejected": -178.80789184570312, "loss": 0.5923, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9079233407974243, "rewards/margins": 0.3439319431781769, "rewards/rejected": -1.2518551349639893, "step": 11870 }, { "epoch": 2.0468642315644385, "grad_norm": 20.792476654052734, "learning_rate": 2.7725751319869485e-08, "logits/chosen": -2.0726428031921387, "logits/rejected": -2.041367769241333, "logps/chosen": -145.16331481933594, "logps/rejected": -182.15489196777344, "loss": 0.5644, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8807977437973022, "rewards/margins": 0.3881632387638092, "rewards/rejected": -1.268960952758789, "step": 11880 }, { "epoch": 2.0485871812543075, "grad_norm": 17.177602767944336, "learning_rate": 2.7636050303746004e-08, "logits/chosen": -2.1124892234802246, "logits/rejected": -2.086585521697998, "logps/chosen": -152.1103973388672, "logps/rejected": -184.46475219726562, "loss": 0.5915, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9606548547744751, "rewards/margins": 0.35888296365737915, "rewards/rejected": -1.319537878036499, "step": 11890 }, { "epoch": 2.0503101309441765, "grad_norm": 18.331647872924805, "learning_rate": 2.7546439177360336e-08, "logits/chosen": -1.9853017330169678, "logits/rejected": -1.954981803894043, "logps/chosen": -146.99481201171875, "logps/rejected": -183.2722930908203, "loss": 0.5739, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9373520016670227, "rewards/margins": 0.376522421836853, "rewards/rejected": -1.3138744831085205, "step": 11900 }, { "epoch": 2.0503101309441765, "eval_logits/chosen": -2.1862330436706543, "eval_logits/rejected": -2.1751818656921387, "eval_logps/chosen": -139.33628845214844, "eval_logps/rejected": -161.68716430664062, "eval_loss": 0.6400790810585022, "eval_rewards/accuracies": 0.6282528042793274, "eval_rewards/chosen": -0.8062440156936646, "eval_rewards/margins": 0.1788264811038971, "eval_rewards/rejected": -0.9850704669952393, "eval_runtime": 384.8478, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 11900 }, { "epoch": 2.0520330806340454, "grad_norm": 14.862703323364258, "learning_rate": 2.7456918300895748e-08, "logits/chosen": -2.1430649757385254, "logits/rejected": -2.143555164337158, "logps/chosen": -151.27426147460938, "logps/rejected": -189.43338012695312, "loss": 0.6067, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9923728704452515, "rewards/margins": 0.32215404510498047, "rewards/rejected": -1.314526915550232, "step": 11910 }, { "epoch": 2.0537560303239144, "grad_norm": 15.798375129699707, "learning_rate": 2.736748803417277e-08, "logits/chosen": -2.0879335403442383, "logits/rejected": -2.066047191619873, "logps/chosen": -157.04498291015625, "logps/rejected": -186.4480438232422, "loss": 0.5991, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9875925779342651, "rewards/margins": 0.3081526756286621, "rewards/rejected": -1.2957453727722168, "step": 11920 }, { "epoch": 2.055478980013784, "grad_norm": 15.735671043395996, "learning_rate": 2.7278148736647748e-08, "logits/chosen": -2.082273483276367, "logits/rejected": -2.0664048194885254, "logps/chosen": -143.75213623046875, "logps/rejected": -186.97071838378906, "loss": 0.5544, "rewards/accuracies": 0.75, "rewards/chosen": -0.9096201062202454, "rewards/margins": 0.40183061361312866, "rewards/rejected": -1.311450719833374, "step": 11930 }, { "epoch": 2.057201929703653, "grad_norm": 15.802986145019531, "learning_rate": 2.7188900767411338e-08, "logits/chosen": -2.038670778274536, "logits/rejected": -2.0089197158813477, "logps/chosen": -144.79005432128906, "logps/rejected": -178.7763214111328, "loss": 0.5802, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9065011739730835, "rewards/margins": 0.3493736982345581, "rewards/rejected": -1.2558748722076416, "step": 11940 }, { "epoch": 2.0589248793935218, "grad_norm": 18.83980941772461, "learning_rate": 2.709974448518718e-08, "logits/chosen": -2.148047924041748, "logits/rejected": -2.1200716495513916, "logps/chosen": -157.41915893554688, "logps/rejected": -181.556640625, "loss": 0.6328, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.997038722038269, "rewards/margins": 0.25461655855178833, "rewards/rejected": -1.2516553401947021, "step": 11950 }, { "epoch": 2.0606478290833907, "grad_norm": 19.09943962097168, "learning_rate": 2.7010680248330307e-08, "logits/chosen": -2.0255045890808105, "logits/rejected": -2.0036354064941406, "logps/chosen": -149.8284149169922, "logps/rejected": -190.45596313476562, "loss": 0.5768, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9751051664352417, "rewards/margins": 0.401517778635025, "rewards/rejected": -1.3766229152679443, "step": 11960 }, { "epoch": 2.0623707787732597, "grad_norm": 15.893461227416992, "learning_rate": 2.6921708414825857e-08, "logits/chosen": -2.0874311923980713, "logits/rejected": -2.064255952835083, "logps/chosen": -152.45919799804688, "logps/rejected": -184.37521362304688, "loss": 0.5964, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9631436467170715, "rewards/margins": 0.34516850113868713, "rewards/rejected": -1.308312177658081, "step": 11970 }, { "epoch": 2.0640937284631287, "grad_norm": 15.452431678771973, "learning_rate": 2.6832829342287488e-08, "logits/chosen": -2.0895285606384277, "logits/rejected": -2.060600757598877, "logps/chosen": -148.33029174804688, "logps/rejected": -188.08963012695312, "loss": 0.5554, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9352174997329712, "rewards/margins": 0.4021036624908447, "rewards/rejected": -1.3373210430145264, "step": 11980 }, { "epoch": 2.065816678152998, "grad_norm": 16.7091007232666, "learning_rate": 2.674404338795611e-08, "logits/chosen": -2.1494126319885254, "logits/rejected": -2.1023356914520264, "logps/chosen": -151.52452087402344, "logps/rejected": -178.547119140625, "loss": 0.5896, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9388395547866821, "rewards/margins": 0.3286966383457184, "rewards/rejected": -1.2675360441207886, "step": 11990 }, { "epoch": 2.067539627842867, "grad_norm": 18.454103469848633, "learning_rate": 2.665535090869827e-08, "logits/chosen": -2.0579426288604736, "logits/rejected": -2.03229022026062, "logps/chosen": -145.76255798339844, "logps/rejected": -186.35507202148438, "loss": 0.5807, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9508598446846008, "rewards/margins": 0.3661665916442871, "rewards/rejected": -1.3170263767242432, "step": 12000 }, { "epoch": 2.067539627842867, "eval_logits/chosen": -2.177586555480957, "eval_logits/rejected": -2.166348695755005, "eval_logps/chosen": -139.99208068847656, "eval_logps/rejected": -162.47183227539062, "eval_loss": 0.6399745941162109, "eval_rewards/accuracies": 0.6326673030853271, "eval_rewards/chosen": -0.8128020763397217, "eval_rewards/margins": 0.18011502921581268, "eval_rewards/rejected": -0.992917001247406, "eval_runtime": 385.1708, "eval_samples_per_second": 11.174, "eval_steps_per_second": 1.397, "step": 12000 }, { "epoch": 2.069262577532736, "grad_norm": 17.7545166015625, "learning_rate": 2.656675226100481e-08, "logits/chosen": -2.084832191467285, "logits/rejected": -2.0561680793762207, "logps/chosen": -157.13027954101562, "logps/rejected": -192.3538818359375, "loss": 0.5957, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0220038890838623, "rewards/margins": 0.3444460928440094, "rewards/rejected": -1.3664497137069702, "step": 12010 }, { "epoch": 2.070985527222605, "grad_norm": 18.744640350341797, "learning_rate": 2.6478247800989474e-08, "logits/chosen": -2.0868375301361084, "logits/rejected": -2.0675981044769287, "logps/chosen": -145.5522918701172, "logps/rejected": -181.19834899902344, "loss": 0.5896, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9282581210136414, "rewards/margins": 0.3560255765914917, "rewards/rejected": -1.2842837572097778, "step": 12020 }, { "epoch": 2.072708476912474, "grad_norm": 20.09699249267578, "learning_rate": 2.63898378843874e-08, "logits/chosen": -2.170151948928833, "logits/rejected": -2.151777744293213, "logps/chosen": -141.83226013183594, "logps/rejected": -170.92758178710938, "loss": 0.6227, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9280394315719604, "rewards/margins": 0.26881080865859985, "rewards/rejected": -1.196850061416626, "step": 12030 }, { "epoch": 2.0744314266023434, "grad_norm": 13.343510627746582, "learning_rate": 2.6301522866553714e-08, "logits/chosen": -2.0674965381622314, "logits/rejected": -2.039283514022827, "logps/chosen": -151.79859924316406, "logps/rejected": -180.143798828125, "loss": 0.5961, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9612434506416321, "rewards/margins": 0.32946452498435974, "rewards/rejected": -1.2907079458236694, "step": 12040 }, { "epoch": 2.0761543762922123, "grad_norm": 16.34334373474121, "learning_rate": 2.621330310246208e-08, "logits/chosen": -2.097844362258911, "logits/rejected": -2.0535268783569336, "logps/chosen": -148.69320678710938, "logps/rejected": -188.4891815185547, "loss": 0.5494, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9443635940551758, "rewards/margins": 0.42830634117126465, "rewards/rejected": -1.3726699352264404, "step": 12050 }, { "epoch": 2.0778773259820813, "grad_norm": 17.49907875061035, "learning_rate": 2.6125178946703352e-08, "logits/chosen": -2.0757229328155518, "logits/rejected": -2.053226947784424, "logps/chosen": -150.58302307128906, "logps/rejected": -176.9593505859375, "loss": 0.6189, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9473937153816223, "rewards/margins": 0.28100866079330444, "rewards/rejected": -1.2284022569656372, "step": 12060 }, { "epoch": 2.0796002756719503, "grad_norm": 18.372468948364258, "learning_rate": 2.6037150753484082e-08, "logits/chosen": -2.069251775741577, "logits/rejected": -2.0282840728759766, "logps/chosen": -147.1036376953125, "logps/rejected": -183.5277557373047, "loss": 0.5573, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9245826005935669, "rewards/margins": 0.413900226354599, "rewards/rejected": -1.3384827375411987, "step": 12070 }, { "epoch": 2.0813232253618192, "grad_norm": 19.427927017211914, "learning_rate": 2.594921887662509e-08, "logits/chosen": -1.9900357723236084, "logits/rejected": -1.9685547351837158, "logps/chosen": -147.01913452148438, "logps/rejected": -182.08755493164062, "loss": 0.5748, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9142827987670898, "rewards/margins": 0.35903555154800415, "rewards/rejected": -1.2733182907104492, "step": 12080 }, { "epoch": 2.0830461750516887, "grad_norm": 18.593591690063477, "learning_rate": 2.5861383669560045e-08, "logits/chosen": -2.064598321914673, "logits/rejected": -2.036254405975342, "logps/chosen": -153.2742156982422, "logps/rejected": -180.3940887451172, "loss": 0.6113, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9668952822685242, "rewards/margins": 0.3069581091403961, "rewards/rejected": -1.2738534212112427, "step": 12090 }, { "epoch": 2.0847691247415576, "grad_norm": 19.603769302368164, "learning_rate": 2.5773645485334122e-08, "logits/chosen": -2.071579933166504, "logits/rejected": -2.049017906188965, "logps/chosen": -147.5741424560547, "logps/rejected": -174.26467895507812, "loss": 0.5904, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8953774571418762, "rewards/margins": 0.32461515069007874, "rewards/rejected": -1.2199926376342773, "step": 12100 }, { "epoch": 2.0847691247415576, "eval_logits/chosen": -2.1738572120666504, "eval_logits/rejected": -2.1626193523406982, "eval_logps/chosen": -140.53909301757812, "eval_logps/rejected": -163.14471435546875, "eval_loss": 0.6396486759185791, "eval_rewards/accuracies": 0.6317379474639893, "eval_rewards/chosen": -0.8182719349861145, "eval_rewards/margins": 0.18137399852275848, "eval_rewards/rejected": -0.9996460676193237, "eval_runtime": 384.8097, "eval_samples_per_second": 11.185, "eval_steps_per_second": 1.398, "step": 12100 }, { "epoch": 2.0864920744314266, "grad_norm": 16.624221801757812, "learning_rate": 2.568600467660245e-08, "logits/chosen": -2.102628707885742, "logits/rejected": -2.077244997024536, "logps/chosen": -160.05300903320312, "logps/rejected": -184.34898376464844, "loss": 0.6246, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0306799411773682, "rewards/margins": 0.2755560576915741, "rewards/rejected": -1.3062360286712646, "step": 12110 }, { "epoch": 2.0882150241212956, "grad_norm": 19.430625915527344, "learning_rate": 2.5598461595628827e-08, "logits/chosen": -2.0444915294647217, "logits/rejected": -2.0168583393096924, "logps/chosen": -158.15213012695312, "logps/rejected": -182.9921112060547, "loss": 0.6293, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0218346118927002, "rewards/margins": 0.28778260946273804, "rewards/rejected": -1.309617042541504, "step": 12120 }, { "epoch": 2.0899379738111645, "grad_norm": 13.189775466918945, "learning_rate": 2.5511016594284236e-08, "logits/chosen": -2.0396082401275635, "logits/rejected": -2.008270502090454, "logps/chosen": -148.4679718017578, "logps/rejected": -186.8063507080078, "loss": 0.5577, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9405156373977661, "rewards/margins": 0.39370864629745483, "rewards/rejected": -1.3342244625091553, "step": 12130 }, { "epoch": 2.091660923501034, "grad_norm": 19.595312118530273, "learning_rate": 2.5423670024045397e-08, "logits/chosen": -2.0856051445007324, "logits/rejected": -2.0678884983062744, "logps/chosen": -148.15773010253906, "logps/rejected": -179.49420166015625, "loss": 0.6144, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9618323445320129, "rewards/margins": 0.28697267174720764, "rewards/rejected": -1.2488048076629639, "step": 12140 }, { "epoch": 2.093383873190903, "grad_norm": 22.856531143188477, "learning_rate": 2.5336422235993403e-08, "logits/chosen": -2.0565311908721924, "logits/rejected": -2.037199020385742, "logps/chosen": -144.37904357910156, "logps/rejected": -176.53453063964844, "loss": 0.5998, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9095523953437805, "rewards/margins": 0.3278244137763977, "rewards/rejected": -1.2373769283294678, "step": 12150 }, { "epoch": 2.095106822880772, "grad_norm": 20.86622428894043, "learning_rate": 2.5249273580812346e-08, "logits/chosen": -2.003087282180786, "logits/rejected": -1.9731581211090088, "logps/chosen": -148.57054138183594, "logps/rejected": -176.899658203125, "loss": 0.6018, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9260438680648804, "rewards/margins": 0.3059634268283844, "rewards/rejected": -1.2320072650909424, "step": 12160 }, { "epoch": 2.096829772570641, "grad_norm": 17.15337371826172, "learning_rate": 2.5162224408787874e-08, "logits/chosen": -2.0348877906799316, "logits/rejected": -2.0129213333129883, "logps/chosen": -151.60482788085938, "logps/rejected": -190.17022705078125, "loss": 0.5912, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9710845947265625, "rewards/margins": 0.3718888759613037, "rewards/rejected": -1.3429733514785767, "step": 12170 }, { "epoch": 2.09855272226051, "grad_norm": 14.853632926940918, "learning_rate": 2.5075275069805646e-08, "logits/chosen": -1.9775642156600952, "logits/rejected": -1.9640986919403076, "logps/chosen": -147.07089233398438, "logps/rejected": -201.99301147460938, "loss": 0.5418, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9846722483634949, "rewards/margins": 0.45503005385398865, "rewards/rejected": -1.4397022724151611, "step": 12180 }, { "epoch": 2.1002756719503792, "grad_norm": 18.290401458740234, "learning_rate": 2.4988425913350192e-08, "logits/chosen": -2.0202815532684326, "logits/rejected": -1.9880399703979492, "logps/chosen": -157.09011840820312, "logps/rejected": -180.63970947265625, "loss": 0.6178, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9997302293777466, "rewards/margins": 0.27001819014549255, "rewards/rejected": -1.2697484493255615, "step": 12190 }, { "epoch": 2.101998621640248, "grad_norm": 15.475078582763672, "learning_rate": 2.4901677288503326e-08, "logits/chosen": -2.114593267440796, "logits/rejected": -2.0884311199188232, "logps/chosen": -154.83831787109375, "logps/rejected": -189.3814239501953, "loss": 0.5722, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9824504852294922, "rewards/margins": 0.3691680133342743, "rewards/rejected": -1.3516185283660889, "step": 12200 }, { "epoch": 2.101998621640248, "eval_logits/chosen": -2.1704039573669434, "eval_logits/rejected": -2.1590938568115234, "eval_logps/chosen": -141.16712951660156, "eval_logps/rejected": -163.847900390625, "eval_loss": 0.6397427916526794, "eval_rewards/accuracies": 0.6326673030853271, "eval_rewards/chosen": -0.8245523571968079, "eval_rewards/margins": 0.18212565779685974, "eval_rewards/rejected": -1.0066779851913452, "eval_runtime": 385.3021, "eval_samples_per_second": 11.17, "eval_steps_per_second": 1.396, "step": 12200 }, { "epoch": 2.103721571330117, "grad_norm": 16.472347259521484, "learning_rate": 2.4815029543942735e-08, "logits/chosen": -2.083890438079834, "logits/rejected": -2.0623395442962646, "logps/chosen": -155.27090454101562, "logps/rejected": -183.13192749023438, "loss": 0.6007, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0156900882720947, "rewards/margins": 0.298033207654953, "rewards/rejected": -1.3137232065200806, "step": 12210 }, { "epoch": 2.105444521019986, "grad_norm": 19.980323791503906, "learning_rate": 2.4728483027940715e-08, "logits/chosen": -1.9904050827026367, "logits/rejected": -1.9682128429412842, "logps/chosen": -154.36993408203125, "logps/rejected": -185.6859893798828, "loss": 0.5973, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0086687803268433, "rewards/margins": 0.3076714277267456, "rewards/rejected": -1.3163402080535889, "step": 12220 }, { "epoch": 2.107167470709855, "grad_norm": 18.818117141723633, "learning_rate": 2.4642038088362595e-08, "logits/chosen": -2.0151965618133545, "logits/rejected": -1.9999818801879883, "logps/chosen": -151.59066772460938, "logps/rejected": -184.35519409179688, "loss": 0.5971, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9967201948165894, "rewards/margins": 0.33280742168426514, "rewards/rejected": -1.3295276165008545, "step": 12230 }, { "epoch": 2.1088904203997245, "grad_norm": 20.60524559020996, "learning_rate": 2.4555695072665494e-08, "logits/chosen": -1.976142168045044, "logits/rejected": -1.9484245777130127, "logps/chosen": -152.32667541503906, "logps/rejected": -182.46633911132812, "loss": 0.6048, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9965447187423706, "rewards/margins": 0.30195122957229614, "rewards/rejected": -1.298495888710022, "step": 12240 }, { "epoch": 2.1106133700895935, "grad_norm": 20.098731994628906, "learning_rate": 2.446945432789681e-08, "logits/chosen": -2.0497965812683105, "logits/rejected": -2.0212483406066895, "logps/chosen": -150.97396850585938, "logps/rejected": -177.1309051513672, "loss": 0.629, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9853955507278442, "rewards/margins": 0.26979175209999084, "rewards/rejected": -1.2551872730255127, "step": 12250 }, { "epoch": 2.1123363197794625, "grad_norm": 17.107200622558594, "learning_rate": 2.4383316200692928e-08, "logits/chosen": -2.026054859161377, "logits/rejected": -2.0070912837982178, "logps/chosen": -146.7823486328125, "logps/rejected": -183.44515991210938, "loss": 0.5913, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9421728849411011, "rewards/margins": 0.3318934142589569, "rewards/rejected": -1.27406644821167, "step": 12260 }, { "epoch": 2.1140592694693314, "grad_norm": 19.780839920043945, "learning_rate": 2.4297281037277694e-08, "logits/chosen": -2.1094722747802734, "logits/rejected": -2.077580451965332, "logps/chosen": -156.1154022216797, "logps/rejected": -188.98558044433594, "loss": 0.6038, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0134097337722778, "rewards/margins": 0.33851879835128784, "rewards/rejected": -1.3519285917282104, "step": 12270 }, { "epoch": 2.1157822191592004, "grad_norm": 15.796158790588379, "learning_rate": 2.4211349183461195e-08, "logits/chosen": -2.0296740531921387, "logits/rejected": -2.009584665298462, "logps/chosen": -157.42874145507812, "logps/rejected": -184.9797821044922, "loss": 0.6087, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0087523460388184, "rewards/margins": 0.3062726855278015, "rewards/rejected": -1.3150250911712646, "step": 12280 }, { "epoch": 2.11750516884907, "grad_norm": 17.406145095825195, "learning_rate": 2.4125520984638177e-08, "logits/chosen": -1.9842334985733032, "logits/rejected": -1.9565746784210205, "logps/chosen": -151.7571563720703, "logps/rejected": -180.76300048828125, "loss": 0.6046, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9797444343566895, "rewards/margins": 0.2966992259025574, "rewards/rejected": -1.2764437198638916, "step": 12290 }, { "epoch": 2.1192281185389388, "grad_norm": 20.89878273010254, "learning_rate": 2.4039796785786827e-08, "logits/chosen": -2.055380344390869, "logits/rejected": -2.027924060821533, "logps/chosen": -153.083740234375, "logps/rejected": -184.66558837890625, "loss": 0.5874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9840517044067383, "rewards/margins": 0.3398807942867279, "rewards/rejected": -1.323932409286499, "step": 12300 }, { "epoch": 2.1192281185389388, "eval_logits/chosen": -2.1689703464508057, "eval_logits/rejected": -2.157639265060425, "eval_logps/chosen": -140.91815185546875, "eval_logps/rejected": -163.52870178222656, "eval_loss": 0.6397192478179932, "eval_rewards/accuracies": 0.6342936754226685, "eval_rewards/chosen": -0.822062611579895, "eval_rewards/margins": 0.18142303824424744, "eval_rewards/rejected": -1.0034856796264648, "eval_runtime": 385.0038, "eval_samples_per_second": 11.179, "eval_steps_per_second": 1.397, "step": 12300 }, { "epoch": 2.1209510682288077, "grad_norm": 14.518001556396484, "learning_rate": 2.3954176931467323e-08, "logits/chosen": -2.0466392040252686, "logits/rejected": -2.0172855854034424, "logps/chosen": -148.81753540039062, "logps/rejected": -183.0417022705078, "loss": 0.5869, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.939220130443573, "rewards/margins": 0.36449334025382996, "rewards/rejected": -1.3037135601043701, "step": 12310 }, { "epoch": 2.1226740179186767, "grad_norm": 17.70513153076172, "learning_rate": 2.3868661765820346e-08, "logits/chosen": -2.0486254692077637, "logits/rejected": -2.025284767150879, "logps/chosen": -141.38436889648438, "logps/rejected": -185.02659606933594, "loss": 0.5591, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8708200454711914, "rewards/margins": 0.43535977602005005, "rewards/rejected": -1.3061797618865967, "step": 12320 }, { "epoch": 2.1243969676085457, "grad_norm": 17.476844787597656, "learning_rate": 2.3783251632565875e-08, "logits/chosen": -2.0581068992614746, "logits/rejected": -2.043915271759033, "logps/chosen": -153.53897094726562, "logps/rejected": -179.50608825683594, "loss": 0.613, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9728647470474243, "rewards/margins": 0.27181097865104675, "rewards/rejected": -1.2446755170822144, "step": 12330 }, { "epoch": 2.126119917298415, "grad_norm": 13.769906044006348, "learning_rate": 2.3697946875001725e-08, "logits/chosen": -2.0776586532592773, "logits/rejected": -2.049694538116455, "logps/chosen": -153.2010040283203, "logps/rejected": -192.2355499267578, "loss": 0.5699, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9959951639175415, "rewards/margins": 0.3895551562309265, "rewards/rejected": -1.3855502605438232, "step": 12340 }, { "epoch": 2.127842866988284, "grad_norm": 17.040367126464844, "learning_rate": 2.3612747836002116e-08, "logits/chosen": -1.975243330001831, "logits/rejected": -1.933740258216858, "logps/chosen": -160.90567016601562, "logps/rejected": -188.67076110839844, "loss": 0.6008, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.042332410812378, "rewards/margins": 0.3365021049976349, "rewards/rejected": -1.3788344860076904, "step": 12350 }, { "epoch": 2.129565816678153, "grad_norm": 20.185527801513672, "learning_rate": 2.352765485801635e-08, "logits/chosen": -2.0649573802948, "logits/rejected": -2.041565418243408, "logps/chosen": -142.52760314941406, "logps/rejected": -187.16665649414062, "loss": 0.5488, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8960248827934265, "rewards/margins": 0.4587882459163666, "rewards/rejected": -1.3548133373260498, "step": 12360 }, { "epoch": 2.131288766368022, "grad_norm": 14.83284854888916, "learning_rate": 2.3442668283067453e-08, "logits/chosen": -2.0248055458068848, "logits/rejected": -1.9915523529052734, "logps/chosen": -148.557861328125, "logps/rejected": -185.40977478027344, "loss": 0.5752, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9597280621528625, "rewards/margins": 0.39121565222740173, "rewards/rejected": -1.3509438037872314, "step": 12370 }, { "epoch": 2.133011716057891, "grad_norm": 16.076335906982422, "learning_rate": 2.335778845275079e-08, "logits/chosen": -2.0478732585906982, "logits/rejected": -2.042694330215454, "logps/chosen": -144.49351501464844, "logps/rejected": -191.4502716064453, "loss": 0.5583, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9409604072570801, "rewards/margins": 0.4060515761375427, "rewards/rejected": -1.3470120429992676, "step": 12380 }, { "epoch": 2.13473466574776, "grad_norm": 18.74994659423828, "learning_rate": 2.32730157082326e-08, "logits/chosen": -2.1494128704071045, "logits/rejected": -2.130913257598877, "logps/chosen": -149.18856811523438, "logps/rejected": -194.9440460205078, "loss": 0.5635, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.959284782409668, "rewards/margins": 0.4454408586025238, "rewards/rejected": -1.4047255516052246, "step": 12390 }, { "epoch": 2.1364576154376294, "grad_norm": 16.29080581665039, "learning_rate": 2.3188350390248796e-08, "logits/chosen": -2.0609512329101562, "logits/rejected": -2.050800085067749, "logps/chosen": -157.55703735351562, "logps/rejected": -199.2428741455078, "loss": 0.5575, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0240941047668457, "rewards/margins": 0.4005987048149109, "rewards/rejected": -1.424692988395691, "step": 12400 }, { "epoch": 2.1364576154376294, "eval_logits/chosen": -2.154268503189087, "eval_logits/rejected": -2.1426403522491455, "eval_logps/chosen": -145.11883544921875, "eval_logps/rejected": -168.34727478027344, "eval_loss": 0.6391282081604004, "eval_rewards/accuracies": 0.634061336517334, "eval_rewards/chosen": -0.8640692234039307, "eval_rewards/margins": 0.18760232627391815, "eval_rewards/rejected": -1.0516716241836548, "eval_runtime": 384.8993, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 12400 }, { "epoch": 2.1381805651274983, "grad_norm": 17.62317657470703, "learning_rate": 2.310379283910343e-08, "logits/chosen": -2.0070154666900635, "logits/rejected": -1.9664987325668335, "logps/chosen": -147.045166015625, "logps/rejected": -179.89614868164062, "loss": 0.5781, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9314393997192383, "rewards/margins": 0.3480191230773926, "rewards/rejected": -1.2794584035873413, "step": 12410 }, { "epoch": 2.1399035148173673, "grad_norm": 23.432823181152344, "learning_rate": 2.30193433946674e-08, "logits/chosen": -1.90802800655365, "logits/rejected": -1.8824126720428467, "logps/chosen": -158.4735870361328, "logps/rejected": -187.57484436035156, "loss": 0.6168, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0305227041244507, "rewards/margins": 0.3146332800388336, "rewards/rejected": -1.345155954360962, "step": 12420 }, { "epoch": 2.1416264645072363, "grad_norm": 22.88194465637207, "learning_rate": 2.2935002396377128e-08, "logits/chosen": -2.0144271850585938, "logits/rejected": -1.992938756942749, "logps/chosen": -157.91915893554688, "logps/rejected": -192.0739288330078, "loss": 0.5987, "rewards/accuracies": 0.6875, "rewards/chosen": -1.029906988143921, "rewards/margins": 0.34480687975883484, "rewards/rejected": -1.3747137784957886, "step": 12430 }, { "epoch": 2.1433494141971057, "grad_norm": 19.667869567871094, "learning_rate": 2.2850770183233125e-08, "logits/chosen": -1.9757375717163086, "logits/rejected": -1.9592254161834717, "logps/chosen": -151.20062255859375, "logps/rejected": -186.13052368164062, "loss": 0.5971, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9788811802864075, "rewards/margins": 0.3557373881340027, "rewards/rejected": -1.3346186876296997, "step": 12440 }, { "epoch": 2.1450723638869746, "grad_norm": 18.900495529174805, "learning_rate": 2.276664709379863e-08, "logits/chosen": -2.0327000617980957, "logits/rejected": -2.0135703086853027, "logps/chosen": -149.45333862304688, "logps/rejected": -183.01283264160156, "loss": 0.6035, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9886199235916138, "rewards/margins": 0.3315490484237671, "rewards/rejected": -1.3201689720153809, "step": 12450 }, { "epoch": 2.1467953135768436, "grad_norm": 16.707895278930664, "learning_rate": 2.2682633466198263e-08, "logits/chosen": -2.0818283557891846, "logits/rejected": -2.0511233806610107, "logps/chosen": -156.9738006591797, "logps/rejected": -193.23536682128906, "loss": 0.5789, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0243823528289795, "rewards/margins": 0.37969323992729187, "rewards/rejected": -1.4040757417678833, "step": 12460 }, { "epoch": 2.1485182632667126, "grad_norm": 20.90814971923828, "learning_rate": 2.259872963811672e-08, "logits/chosen": -2.103562355041504, "logits/rejected": -2.0822694301605225, "logps/chosen": -163.7508544921875, "logps/rejected": -209.4212646484375, "loss": 0.561, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0847772359848022, "rewards/margins": 0.45163393020629883, "rewards/rejected": -1.536411166191101, "step": 12470 }, { "epoch": 2.1502412129565815, "grad_norm": 19.61737060546875, "learning_rate": 2.2514935946797347e-08, "logits/chosen": -2.1186139583587646, "logits/rejected": -2.0999228954315186, "logps/chosen": -159.9215087890625, "logps/rejected": -192.8905029296875, "loss": 0.5975, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0548546314239502, "rewards/margins": 0.32799118757247925, "rewards/rejected": -1.3828459978103638, "step": 12480 }, { "epoch": 2.1519641626464505, "grad_norm": 18.626937866210938, "learning_rate": 2.2431252729040796e-08, "logits/chosen": -2.020265817642212, "logits/rejected": -1.9977693557739258, "logps/chosen": -154.8829803466797, "logps/rejected": -194.85934448242188, "loss": 0.5823, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0040762424468994, "rewards/margins": 0.40796977281570435, "rewards/rejected": -1.4120463132858276, "step": 12490 }, { "epoch": 2.15368711233632, "grad_norm": 18.569961547851562, "learning_rate": 2.2347680321203655e-08, "logits/chosen": -2.0453221797943115, "logits/rejected": -2.0203518867492676, "logps/chosen": -162.56271362304688, "logps/rejected": -200.42919921875, "loss": 0.59, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0918738842010498, "rewards/margins": 0.3785688281059265, "rewards/rejected": -1.4704426527023315, "step": 12500 }, { "epoch": 2.15368711233632, "eval_logits/chosen": -2.1480531692504883, "eval_logits/rejected": -2.1363770961761475, "eval_logps/chosen": -145.79530334472656, "eval_logps/rejected": -169.0438690185547, "eval_loss": 0.6392155885696411, "eval_rewards/accuracies": 0.634061336517334, "eval_rewards/chosen": -0.870834231376648, "eval_rewards/margins": 0.18780316412448883, "eval_rewards/rejected": -1.058637261390686, "eval_runtime": 385.343, "eval_samples_per_second": 11.169, "eval_steps_per_second": 1.396, "step": 12500 }, { "epoch": 2.155410062026189, "grad_norm": 15.100275993347168, "learning_rate": 2.2264219059197174e-08, "logits/chosen": -2.063833475112915, "logits/rejected": -2.038719654083252, "logps/chosen": -153.4787139892578, "logps/rejected": -181.99459838867188, "loss": 0.6026, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0043154954910278, "rewards/margins": 0.2926967144012451, "rewards/rejected": -1.2970123291015625, "step": 12510 }, { "epoch": 2.157133011716058, "grad_norm": 16.546247482299805, "learning_rate": 2.218086927848587e-08, "logits/chosen": -2.000105142593384, "logits/rejected": -1.978142499923706, "logps/chosen": -159.2552032470703, "logps/rejected": -192.9197235107422, "loss": 0.6043, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.057848334312439, "rewards/margins": 0.3225509524345398, "rewards/rejected": -1.380399465560913, "step": 12520 }, { "epoch": 2.158855961405927, "grad_norm": 18.483129501342773, "learning_rate": 2.2097631314086112e-08, "logits/chosen": -2.027190685272217, "logits/rejected": -2.0132484436035156, "logps/chosen": -158.9965362548828, "logps/rejected": -198.91172790527344, "loss": 0.5742, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0460169315338135, "rewards/margins": 0.3841618001461029, "rewards/rejected": -1.4301787614822388, "step": 12530 }, { "epoch": 2.160578911095796, "grad_norm": 16.097341537475586, "learning_rate": 2.201450550056486e-08, "logits/chosen": -2.0286478996276855, "logits/rejected": -2.0013487339019775, "logps/chosen": -154.73428344726562, "logps/rejected": -187.1873321533203, "loss": 0.6123, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0098367929458618, "rewards/margins": 0.32597047090530396, "rewards/rejected": -1.335807204246521, "step": 12540 }, { "epoch": 2.162301860785665, "grad_norm": 20.04486846923828, "learning_rate": 2.193149217203833e-08, "logits/chosen": -2.1235053539276123, "logits/rejected": -2.105464458465576, "logps/chosen": -150.79299926757812, "logps/rejected": -180.28079223632812, "loss": 0.6134, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.997290313243866, "rewards/margins": 0.28636062145233154, "rewards/rejected": -1.2836507558822632, "step": 12550 }, { "epoch": 2.164024810475534, "grad_norm": 16.999258041381836, "learning_rate": 2.1848591662170546e-08, "logits/chosen": -2.0906529426574707, "logits/rejected": -2.053553819656372, "logps/chosen": -161.35374450683594, "logps/rejected": -182.81202697753906, "loss": 0.6228, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0412954092025757, "rewards/margins": 0.28862762451171875, "rewards/rejected": -1.3299229145050049, "step": 12560 }, { "epoch": 2.165747760165403, "grad_norm": 18.201045989990234, "learning_rate": 2.1765804304172137e-08, "logits/chosen": -2.0378661155700684, "logits/rejected": -2.004517078399658, "logps/chosen": -147.25869750976562, "logps/rejected": -178.6654815673828, "loss": 0.5884, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9546165466308594, "rewards/margins": 0.32692950963974, "rewards/rejected": -1.2815459966659546, "step": 12570 }, { "epoch": 2.167470709855272, "grad_norm": 22.28134536743164, "learning_rate": 2.1683130430798907e-08, "logits/chosen": -2.046701431274414, "logits/rejected": -2.006303071975708, "logps/chosen": -162.3466796875, "logps/rejected": -190.3459014892578, "loss": 0.5876, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0169700384140015, "rewards/margins": 0.36277857422828674, "rewards/rejected": -1.3797485828399658, "step": 12580 }, { "epoch": 2.169193659545141, "grad_norm": 15.241520881652832, "learning_rate": 2.16005703743505e-08, "logits/chosen": -2.03153920173645, "logits/rejected": -1.9981634616851807, "logps/chosen": -152.40487670898438, "logps/rejected": -188.49636840820312, "loss": 0.5702, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9693043828010559, "rewards/margins": 0.38716763257980347, "rewards/rejected": -1.3564720153808594, "step": 12590 }, { "epoch": 2.1709166092350105, "grad_norm": 16.741743087768555, "learning_rate": 2.151812446666908e-08, "logits/chosen": -2.000462293624878, "logits/rejected": -1.9706224203109741, "logps/chosen": -158.33937072753906, "logps/rejected": -185.6919403076172, "loss": 0.6028, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0003563165664673, "rewards/margins": 0.32089129090309143, "rewards/rejected": -1.3212475776672363, "step": 12600 }, { "epoch": 2.1709166092350105, "eval_logits/chosen": -2.1519031524658203, "eval_logits/rejected": -2.1403234004974365, "eval_logps/chosen": -143.77944946289062, "eval_logps/rejected": -166.8094482421875, "eval_loss": 0.639387845993042, "eval_rewards/accuracies": 0.633596658706665, "eval_rewards/chosen": -0.850675642490387, "eval_rewards/margins": 0.185617595911026, "eval_rewards/rejected": -1.0362931489944458, "eval_runtime": 384.8477, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 12600 }, { "epoch": 2.1726395589248795, "grad_norm": 12.606042861938477, "learning_rate": 2.1435793039138035e-08, "logits/chosen": -2.114326000213623, "logits/rejected": -2.1089417934417725, "logps/chosen": -149.63531494140625, "logps/rejected": -183.6655731201172, "loss": 0.5782, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9557312726974487, "rewards/margins": 0.33367159962654114, "rewards/rejected": -1.289402961730957, "step": 12610 }, { "epoch": 2.1743625086147484, "grad_norm": 22.117944717407227, "learning_rate": 2.135357642268062e-08, "logits/chosen": -2.1563568115234375, "logits/rejected": -2.1263372898101807, "logps/chosen": -162.57171630859375, "logps/rejected": -194.23548889160156, "loss": 0.6041, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0720674991607666, "rewards/margins": 0.33572691679000854, "rewards/rejected": -1.40779447555542, "step": 12620 }, { "epoch": 2.1760854583046174, "grad_norm": 14.276305198669434, "learning_rate": 2.1271474947758533e-08, "logits/chosen": -2.098055362701416, "logits/rejected": -2.0904479026794434, "logps/chosen": -147.35125732421875, "logps/rejected": -181.5499267578125, "loss": 0.6058, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9696400761604309, "rewards/margins": 0.3259606957435608, "rewards/rejected": -1.2956006526947021, "step": 12630 }, { "epoch": 2.1778084079944864, "grad_norm": 18.19456672668457, "learning_rate": 2.1189488944370753e-08, "logits/chosen": -2.066310405731201, "logits/rejected": -2.04471492767334, "logps/chosen": -149.0919647216797, "logps/rejected": -188.32907104492188, "loss": 0.5684, "rewards/accuracies": 0.75, "rewards/chosen": -0.9499889612197876, "rewards/margins": 0.39808589220046997, "rewards/rejected": -1.3480746746063232, "step": 12640 }, { "epoch": 2.179531357684356, "grad_norm": 18.8416805267334, "learning_rate": 2.110761874205214e-08, "logits/chosen": -2.045043468475342, "logits/rejected": -2.021754264831543, "logps/chosen": -145.10240173339844, "logps/rejected": -171.17100524902344, "loss": 0.6119, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9205414056777954, "rewards/margins": 0.2754775583744049, "rewards/rejected": -1.196018934249878, "step": 12650 }, { "epoch": 2.1812543073742248, "grad_norm": 19.355369567871094, "learning_rate": 2.1025864669872028e-08, "logits/chosen": -1.9548429250717163, "logits/rejected": -1.934769868850708, "logps/chosen": -153.12893676757812, "logps/rejected": -185.69248962402344, "loss": 0.6138, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.010509729385376, "rewards/margins": 0.31319838762283325, "rewards/rejected": -1.323707938194275, "step": 12660 }, { "epoch": 2.1829772570640937, "grad_norm": 20.406373977661133, "learning_rate": 2.0944227056433062e-08, "logits/chosen": -2.2265782356262207, "logits/rejected": -2.189685821533203, "logps/chosen": -148.15615844726562, "logps/rejected": -184.5717010498047, "loss": 0.5781, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9500039219856262, "rewards/margins": 0.3827661871910095, "rewards/rejected": -1.3327701091766357, "step": 12670 }, { "epoch": 2.1847002067539627, "grad_norm": 20.81136703491211, "learning_rate": 2.0862706229869716e-08, "logits/chosen": -1.9960145950317383, "logits/rejected": -1.9710595607757568, "logps/chosen": -152.7157745361328, "logps/rejected": -186.7949676513672, "loss": 0.5969, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0114679336547852, "rewards/margins": 0.33636921644210815, "rewards/rejected": -1.347837209701538, "step": 12680 }, { "epoch": 2.1864231564438317, "grad_norm": 14.440084457397461, "learning_rate": 2.0781302517847115e-08, "logits/chosen": -2.0044655799865723, "logits/rejected": -1.984331488609314, "logps/chosen": -153.7787628173828, "logps/rejected": -183.58425903320312, "loss": 0.6213, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0188877582550049, "rewards/margins": 0.28991761803627014, "rewards/rejected": -1.3088053464889526, "step": 12690 }, { "epoch": 2.188146106133701, "grad_norm": 15.82426929473877, "learning_rate": 2.0700016247559592e-08, "logits/chosen": -2.013751268386841, "logits/rejected": -1.987062692642212, "logps/chosen": -153.21856689453125, "logps/rejected": -188.2808074951172, "loss": 0.5745, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9646704792976379, "rewards/margins": 0.37010273337364197, "rewards/rejected": -1.334773302078247, "step": 12700 }, { "epoch": 2.188146106133701, "eval_logits/chosen": -2.1510603427886963, "eval_logits/rejected": -2.139479398727417, "eval_logps/chosen": -143.47250366210938, "eval_logps/rejected": -166.4607696533203, "eval_loss": 0.6394017934799194, "eval_rewards/accuracies": 0.6331319808959961, "eval_rewards/chosen": -0.8476059436798096, "eval_rewards/margins": 0.18520036339759827, "eval_rewards/rejected": -1.0328062772750854, "eval_runtime": 384.6731, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 12700 }, { "epoch": 2.18986905582357, "grad_norm": 19.46891212463379, "learning_rate": 2.0618847745729506e-08, "logits/chosen": -2.093761920928955, "logits/rejected": -2.082408905029297, "logps/chosen": -155.69656372070312, "logps/rejected": -194.2377166748047, "loss": 0.5891, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0204856395721436, "rewards/margins": 0.3441944420337677, "rewards/rejected": -1.3646801710128784, "step": 12710 }, { "epoch": 2.191592005513439, "grad_norm": 14.17563247680664, "learning_rate": 2.05377973386058e-08, "logits/chosen": -2.100975513458252, "logits/rejected": -2.057400703430176, "logps/chosen": -150.45741271972656, "logps/rejected": -184.09703063964844, "loss": 0.5695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9387782216072083, "rewards/margins": 0.3885608911514282, "rewards/rejected": -1.3273389339447021, "step": 12720 }, { "epoch": 2.193314955203308, "grad_norm": 17.42460060119629, "learning_rate": 2.0456865351962742e-08, "logits/chosen": -1.9818073511123657, "logits/rejected": -1.9522578716278076, "logps/chosen": -148.2295684814453, "logps/rejected": -182.35655212402344, "loss": 0.5767, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9773553609848022, "rewards/margins": 0.3419249653816223, "rewards/rejected": -1.3192803859710693, "step": 12730 }, { "epoch": 2.195037904893177, "grad_norm": 14.746926307678223, "learning_rate": 2.037605211109866e-08, "logits/chosen": -2.0128746032714844, "logits/rejected": -1.9878730773925781, "logps/chosen": -162.36074829101562, "logps/rejected": -195.56887817382812, "loss": 0.6098, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0981024503707886, "rewards/margins": 0.33498185873031616, "rewards/rejected": -1.43308424949646, "step": 12740 }, { "epoch": 2.1967608545830464, "grad_norm": 17.827266693115234, "learning_rate": 2.0295357940834605e-08, "logits/chosen": -1.9999490976333618, "logits/rejected": -1.9698247909545898, "logps/chosen": -147.7600860595703, "logps/rejected": -184.58151245117188, "loss": 0.5751, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9416998624801636, "rewards/margins": 0.3579331636428833, "rewards/rejected": -1.2996331453323364, "step": 12750 }, { "epoch": 2.1984838042729153, "grad_norm": 15.5140380859375, "learning_rate": 2.0214783165512984e-08, "logits/chosen": -2.0031423568725586, "logits/rejected": -1.984412431716919, "logps/chosen": -154.734375, "logps/rejected": -188.3172149658203, "loss": 0.6214, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.026636004447937, "rewards/margins": 0.32841387391090393, "rewards/rejected": -1.3550498485565186, "step": 12760 }, { "epoch": 2.2002067539627843, "grad_norm": 18.524152755737305, "learning_rate": 2.0134328108996308e-08, "logits/chosen": -2.081307888031006, "logits/rejected": -2.045189619064331, "logps/chosen": -157.15953063964844, "logps/rejected": -180.63278198242188, "loss": 0.6119, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9919252395629883, "rewards/margins": 0.29099273681640625, "rewards/rejected": -1.2829179763793945, "step": 12770 }, { "epoch": 2.2019297036526533, "grad_norm": 14.907122611999512, "learning_rate": 2.0053993094665937e-08, "logits/chosen": -2.066694736480713, "logits/rejected": -2.043332815170288, "logps/chosen": -154.47755432128906, "logps/rejected": -186.40017700195312, "loss": 0.6074, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0254974365234375, "rewards/margins": 0.29950347542762756, "rewards/rejected": -1.3250008821487427, "step": 12780 }, { "epoch": 2.2036526533425222, "grad_norm": 18.479488372802734, "learning_rate": 1.9973778445420732e-08, "logits/chosen": -2.0080161094665527, "logits/rejected": -1.985799789428711, "logps/chosen": -162.12893676757812, "logps/rejected": -196.8030548095703, "loss": 0.5861, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0774407386779785, "rewards/margins": 0.355185329914093, "rewards/rejected": -1.4326260089874268, "step": 12790 }, { "epoch": 2.205375603032391, "grad_norm": 21.599821090698242, "learning_rate": 1.9893684483675706e-08, "logits/chosen": -2.0434632301330566, "logits/rejected": -2.0225682258605957, "logps/chosen": -155.9043426513672, "logps/rejected": -186.32249450683594, "loss": 0.6037, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.998375415802002, "rewards/margins": 0.3091413378715515, "rewards/rejected": -1.3075168132781982, "step": 12800 }, { "epoch": 2.205375603032391, "eval_logits/chosen": -2.1457481384277344, "eval_logits/rejected": -2.1340065002441406, "eval_logps/chosen": -143.61268615722656, "eval_logps/rejected": -166.6464080810547, "eval_loss": 0.6395144462585449, "eval_rewards/accuracies": 0.6317379474639893, "eval_rewards/chosen": -0.8490078449249268, "eval_rewards/margins": 0.18565502762794495, "eval_rewards/rejected": -1.0346628427505493, "eval_runtime": 385.1864, "eval_samples_per_second": 11.174, "eval_steps_per_second": 1.397, "step": 12800 }, { "epoch": 2.2070985527222606, "grad_norm": 16.009641647338867, "learning_rate": 1.98137115313608e-08, "logits/chosen": -2.092655658721924, "logits/rejected": -2.079068422317505, "logps/chosen": -152.90447998046875, "logps/rejected": -185.06155395507812, "loss": 0.6249, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.023121953010559, "rewards/margins": 0.27441108226776123, "rewards/rejected": -1.2975329160690308, "step": 12810 }, { "epoch": 2.2088215024121296, "grad_norm": 18.711111068725586, "learning_rate": 1.9733859909919593e-08, "logits/chosen": -1.9710042476654053, "logits/rejected": -1.9577134847640991, "logps/chosen": -150.876953125, "logps/rejected": -183.58116149902344, "loss": 0.5987, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9782198071479797, "rewards/margins": 0.312214732170105, "rewards/rejected": -1.2904345989227295, "step": 12820 }, { "epoch": 2.2105444521019986, "grad_norm": 15.833746910095215, "learning_rate": 1.9654129940307994e-08, "logits/chosen": -1.9803451299667358, "logits/rejected": -1.957166314125061, "logps/chosen": -156.68870544433594, "logps/rejected": -184.51222229003906, "loss": 0.6218, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0626084804534912, "rewards/margins": 0.26002973318099976, "rewards/rejected": -1.3226382732391357, "step": 12830 }, { "epoch": 2.2122674017918675, "grad_norm": 16.070863723754883, "learning_rate": 1.9574521942992884e-08, "logits/chosen": -2.074103832244873, "logits/rejected": -2.055760145187378, "logps/chosen": -163.092041015625, "logps/rejected": -200.02920532226562, "loss": 0.5811, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0788198709487915, "rewards/margins": 0.3750024437904358, "rewards/rejected": -1.453822374343872, "step": 12840 }, { "epoch": 2.213990351481737, "grad_norm": 15.97121524810791, "learning_rate": 1.9495036237950956e-08, "logits/chosen": -2.08003568649292, "logits/rejected": -2.048692464828491, "logps/chosen": -152.99508666992188, "logps/rejected": -188.7675323486328, "loss": 0.5707, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9989916086196899, "rewards/margins": 0.3779844343662262, "rewards/rejected": -1.3769760131835938, "step": 12850 }, { "epoch": 2.215713301171606, "grad_norm": 16.751462936401367, "learning_rate": 1.9415673144667326e-08, "logits/chosen": -1.9713541269302368, "logits/rejected": -1.9472278356552124, "logps/chosen": -162.13833618164062, "logps/rejected": -194.90127563476562, "loss": 0.5786, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0591973066329956, "rewards/margins": 0.3721071481704712, "rewards/rejected": -1.4313045740127563, "step": 12860 }, { "epoch": 2.217436250861475, "grad_norm": 22.097108840942383, "learning_rate": 1.9336432982134266e-08, "logits/chosen": -2.0541343688964844, "logits/rejected": -2.0348217487335205, "logps/chosen": -151.13645935058594, "logps/rejected": -180.7276611328125, "loss": 0.6123, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9875195622444153, "rewards/margins": 0.3130178153514862, "rewards/rejected": -1.300537347793579, "step": 12870 }, { "epoch": 2.219159200551344, "grad_norm": 16.315982818603516, "learning_rate": 1.925731606884998e-08, "logits/chosen": -2.0315511226654053, "logits/rejected": -2.0122642517089844, "logps/chosen": -151.38046264648438, "logps/rejected": -187.273193359375, "loss": 0.5852, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9819539189338684, "rewards/margins": 0.3349192142486572, "rewards/rejected": -1.3168730735778809, "step": 12880 }, { "epoch": 2.220882150241213, "grad_norm": 15.932037353515625, "learning_rate": 1.9178322722817288e-08, "logits/chosen": -2.0865612030029297, "logits/rejected": -2.063091516494751, "logps/chosen": -155.70559692382812, "logps/rejected": -181.44937133789062, "loss": 0.6103, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9882497787475586, "rewards/margins": 0.2861793637275696, "rewards/rejected": -1.2744290828704834, "step": 12890 }, { "epoch": 2.222605099931082, "grad_norm": 16.965890884399414, "learning_rate": 1.9099453261542297e-08, "logits/chosen": -2.020188808441162, "logits/rejected": -1.9903411865234375, "logps/chosen": -161.2719268798828, "logps/rejected": -201.2347412109375, "loss": 0.5773, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.073697566986084, "rewards/margins": 0.3945622444152832, "rewards/rejected": -1.4682600498199463, "step": 12900 }, { "epoch": 2.222605099931082, "eval_logits/chosen": -2.1446316242218018, "eval_logits/rejected": -2.1329281330108643, "eval_logps/chosen": -143.33169555664062, "eval_logps/rejected": -166.38255310058594, "eval_loss": 0.6392897963523865, "eval_rewards/accuracies": 0.63150554895401, "eval_rewards/chosen": -0.8461978435516357, "eval_rewards/margins": 0.18582624197006226, "eval_rewards/rejected": -1.0320241451263428, "eval_runtime": 385.2243, "eval_samples_per_second": 11.173, "eval_steps_per_second": 1.397, "step": 12900 }, { "epoch": 2.224328049620951, "grad_norm": 14.013476371765137, "learning_rate": 1.9020708002033182e-08, "logits/chosen": -2.115291118621826, "logits/rejected": -2.1002984046936035, "logps/chosen": -143.402587890625, "logps/rejected": -182.09503173828125, "loss": 0.5875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9274517297744751, "rewards/margins": 0.3397791385650635, "rewards/rejected": -1.2672309875488281, "step": 12910 }, { "epoch": 2.22605099931082, "grad_norm": 13.010714530944824, "learning_rate": 1.8942087260798933e-08, "logits/chosen": -2.0287556648254395, "logits/rejected": -2.007045030593872, "logps/chosen": -147.4208221435547, "logps/rejected": -188.57379150390625, "loss": 0.5641, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9478145837783813, "rewards/margins": 0.4056933522224426, "rewards/rejected": -1.3535078763961792, "step": 12920 }, { "epoch": 2.227773949000689, "grad_norm": 15.323067665100098, "learning_rate": 1.886359135384805e-08, "logits/chosen": -2.0572996139526367, "logits/rejected": -2.038787603378296, "logps/chosen": -145.76373291015625, "logps/rejected": -175.77511596679688, "loss": 0.6275, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9626489877700806, "rewards/margins": 0.26793935894966125, "rewards/rejected": -1.2305881977081299, "step": 12930 }, { "epoch": 2.229496898690558, "grad_norm": 18.080646514892578, "learning_rate": 1.8785220596687244e-08, "logits/chosen": -1.9975080490112305, "logits/rejected": -1.9690163135528564, "logps/chosen": -153.9081573486328, "logps/rejected": -182.8682098388672, "loss": 0.6037, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0187857151031494, "rewards/margins": 0.3161763846874237, "rewards/rejected": -1.3349621295928955, "step": 12940 }, { "epoch": 2.231219848380427, "grad_norm": 23.47930908203125, "learning_rate": 1.870697530432019e-08, "logits/chosen": -1.9963634014129639, "logits/rejected": -1.9752271175384521, "logps/chosen": -154.6025390625, "logps/rejected": -197.92550659179688, "loss": 0.5545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0039188861846924, "rewards/margins": 0.43386000394821167, "rewards/rejected": -1.4377788305282593, "step": 12950 }, { "epoch": 2.2329427980702965, "grad_norm": 17.822969436645508, "learning_rate": 1.8628855791246323e-08, "logits/chosen": -1.964392066001892, "logits/rejected": -1.9295297861099243, "logps/chosen": -160.80946350097656, "logps/rejected": -186.71607971191406, "loss": 0.6034, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.023686170578003, "rewards/margins": 0.3135334253311157, "rewards/rejected": -1.3372195959091187, "step": 12960 }, { "epoch": 2.2346657477601655, "grad_norm": 16.112186431884766, "learning_rate": 1.8550862371459457e-08, "logits/chosen": -1.9407732486724854, "logits/rejected": -1.907962441444397, "logps/chosen": -150.91415405273438, "logps/rejected": -186.69329833984375, "loss": 0.5681, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9820247888565063, "rewards/margins": 0.39889007806777954, "rewards/rejected": -1.3809149265289307, "step": 12970 }, { "epoch": 2.2363886974500344, "grad_norm": 15.690720558166504, "learning_rate": 1.8472995358446646e-08, "logits/chosen": -1.9751226902008057, "logits/rejected": -1.9606326818466187, "logps/chosen": -148.14199829101562, "logps/rejected": -185.90708923339844, "loss": 0.5739, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9735643267631531, "rewards/margins": 0.34938764572143555, "rewards/rejected": -1.3229520320892334, "step": 12980 }, { "epoch": 2.2381116471399034, "grad_norm": 15.577566146850586, "learning_rate": 1.8395255065186804e-08, "logits/chosen": -2.0838000774383545, "logits/rejected": -2.0450470447540283, "logps/chosen": -163.3155975341797, "logps/rejected": -197.12139892578125, "loss": 0.5828, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0458481311798096, "rewards/margins": 0.3951478600502014, "rewards/rejected": -1.4409959316253662, "step": 12990 }, { "epoch": 2.2398345968297724, "grad_norm": 19.152341842651367, "learning_rate": 1.8317641804149575e-08, "logits/chosen": -2.0044026374816895, "logits/rejected": -1.9810245037078857, "logps/chosen": -157.32891845703125, "logps/rejected": -193.1121368408203, "loss": 0.5747, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.038910984992981, "rewards/margins": 0.37423938512802124, "rewards/rejected": -1.4131505489349365, "step": 13000 }, { "epoch": 2.2398345968297724, "eval_logits/chosen": -2.1380844116210938, "eval_logits/rejected": -2.126225233078003, "eval_logps/chosen": -144.889892578125, "eval_logps/rejected": -168.15794372558594, "eval_loss": 0.6391336917877197, "eval_rewards/accuracies": 0.6319702863693237, "eval_rewards/chosen": -0.861780047416687, "eval_rewards/margins": 0.18799810111522675, "eval_rewards/rejected": -1.0497781038284302, "eval_runtime": 384.3443, "eval_samples_per_second": 11.198, "eval_steps_per_second": 1.4, "step": 13000 }, { "epoch": 2.241557546519642, "grad_norm": 13.771430015563965, "learning_rate": 1.8240155887293938e-08, "logits/chosen": -1.9830691814422607, "logits/rejected": -1.9624723196029663, "logps/chosen": -153.43020629882812, "logps/rejected": -184.18038940429688, "loss": 0.6023, "rewards/accuracies": 0.65625, "rewards/chosen": -1.031475305557251, "rewards/margins": 0.3135696053504944, "rewards/rejected": -1.3450448513031006, "step": 13010 }, { "epoch": 2.2432804962095108, "grad_norm": 16.51658058166504, "learning_rate": 1.8162797626067072e-08, "logits/chosen": -1.9825899600982666, "logits/rejected": -1.956128478050232, "logps/chosen": -149.4762420654297, "logps/rejected": -179.665283203125, "loss": 0.5959, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9774316549301147, "rewards/margins": 0.3195692002773285, "rewards/rejected": -1.297000765800476, "step": 13020 }, { "epoch": 2.2450034458993797, "grad_norm": 16.61245346069336, "learning_rate": 1.808556733140306e-08, "logits/chosen": -2.0801331996917725, "logits/rejected": -2.0455222129821777, "logps/chosen": -155.20437622070312, "logps/rejected": -183.345947265625, "loss": 0.6036, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0056549310684204, "rewards/margins": 0.3047105669975281, "rewards/rejected": -1.3103654384613037, "step": 13030 }, { "epoch": 2.2467263955892487, "grad_norm": 26.6284236907959, "learning_rate": 1.800846531372161e-08, "logits/chosen": -2.087765693664551, "logits/rejected": -2.061800479888916, "logps/chosen": -153.1880645751953, "logps/rejected": -199.19517517089844, "loss": 0.5608, "rewards/accuracies": 0.75, "rewards/chosen": -0.9964033365249634, "rewards/margins": 0.4374423921108246, "rewards/rejected": -1.4338457584381104, "step": 13040 }, { "epoch": 2.2484493452791177, "grad_norm": 19.713693618774414, "learning_rate": 1.7931491882926813e-08, "logits/chosen": -2.0577986240386963, "logits/rejected": -2.047492742538452, "logps/chosen": -152.95919799804688, "logps/rejected": -187.91717529296875, "loss": 0.5915, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0116349458694458, "rewards/margins": 0.3282049596309662, "rewards/rejected": -1.3398398160934448, "step": 13050 }, { "epoch": 2.250172294968987, "grad_norm": 19.350893020629883, "learning_rate": 1.7854647348405993e-08, "logits/chosen": -2.0712578296661377, "logits/rejected": -2.052372455596924, "logps/chosen": -152.1072235107422, "logps/rejected": -190.4071044921875, "loss": 0.5861, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9930790662765503, "rewards/margins": 0.36402827501296997, "rewards/rejected": -1.3571072816848755, "step": 13060 }, { "epoch": 2.251895244658856, "grad_norm": 26.568830490112305, "learning_rate": 1.7777932019028314e-08, "logits/chosen": -1.9899612665176392, "logits/rejected": -1.9575088024139404, "logps/chosen": -155.0738983154297, "logps/rejected": -181.45294189453125, "loss": 0.6129, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9760218858718872, "rewards/margins": 0.3040178418159485, "rewards/rejected": -1.2800395488739014, "step": 13070 }, { "epoch": 2.253618194348725, "grad_norm": 17.207115173339844, "learning_rate": 1.770134620314363e-08, "logits/chosen": -1.9565757513046265, "logits/rejected": -1.9381873607635498, "logps/chosen": -151.86679077148438, "logps/rejected": -192.55300903320312, "loss": 0.5657, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0117337703704834, "rewards/margins": 0.3967505097389221, "rewards/rejected": -1.4084843397140503, "step": 13080 }, { "epoch": 2.255341144038594, "grad_norm": 21.145254135131836, "learning_rate": 1.762489020858125e-08, "logits/chosen": -2.0829334259033203, "logits/rejected": -2.048189401626587, "logps/chosen": -160.76290893554688, "logps/rejected": -191.86343383789062, "loss": 0.6047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.067816972732544, "rewards/margins": 0.328390508890152, "rewards/rejected": -1.396207571029663, "step": 13090 }, { "epoch": 2.257064093728463, "grad_norm": 12.942140579223633, "learning_rate": 1.754856434264869e-08, "logits/chosen": -2.118272066116333, "logits/rejected": -2.0803005695343018, "logps/chosen": -155.13015747070312, "logps/rejected": -188.33303833007812, "loss": 0.5788, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0082849264144897, "rewards/margins": 0.3701656758785248, "rewards/rejected": -1.378450632095337, "step": 13100 }, { "epoch": 2.257064093728463, "eval_logits/chosen": -2.1334829330444336, "eval_logits/rejected": -2.1215929985046387, "eval_logps/chosen": -144.7845458984375, "eval_logps/rejected": -168.07273864746094, "eval_loss": 0.6391546726226807, "eval_rewards/accuracies": 0.6331319808959961, "eval_rewards/chosen": -0.860726535320282, "eval_rewards/margins": 0.18819954991340637, "eval_rewards/rejected": -1.0489259958267212, "eval_runtime": 384.5396, "eval_samples_per_second": 11.193, "eval_steps_per_second": 1.399, "step": 13100 }, { "epoch": 2.2587870434183324, "grad_norm": 13.660865783691406, "learning_rate": 1.7472368912130365e-08, "logits/chosen": -1.9685924053192139, "logits/rejected": -1.931777000427246, "logps/chosen": -157.3345184326172, "logps/rejected": -195.2469482421875, "loss": 0.586, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0349032878875732, "rewards/margins": 0.40200504660606384, "rewards/rejected": -1.4369083642959595, "step": 13110 }, { "epoch": 2.2605099931082013, "grad_norm": 16.980867385864258, "learning_rate": 1.7396304223286484e-08, "logits/chosen": -2.0967485904693604, "logits/rejected": -2.0817317962646484, "logps/chosen": -158.95480346679688, "logps/rejected": -191.6622772216797, "loss": 0.6029, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0445785522460938, "rewards/margins": 0.3416438698768616, "rewards/rejected": -1.3862224817276, "step": 13120 }, { "epoch": 2.2622329427980703, "grad_norm": 14.808295249938965, "learning_rate": 1.73203705818517e-08, "logits/chosen": -2.1211535930633545, "logits/rejected": -2.097015380859375, "logps/chosen": -153.87583923339844, "logps/rejected": -201.82652282714844, "loss": 0.5618, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9995344877243042, "rewards/margins": 0.43660640716552734, "rewards/rejected": -1.4361408948898315, "step": 13130 }, { "epoch": 2.2639558924879393, "grad_norm": 15.327201843261719, "learning_rate": 1.724456829303399e-08, "logits/chosen": -1.9677460193634033, "logits/rejected": -1.941516637802124, "logps/chosen": -148.75718688964844, "logps/rejected": -185.95790100097656, "loss": 0.5735, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9504595994949341, "rewards/margins": 0.36352819204330444, "rewards/rejected": -1.3139877319335938, "step": 13140 }, { "epoch": 2.2656788421778082, "grad_norm": 16.996061325073242, "learning_rate": 1.71688976615133e-08, "logits/chosen": -1.9891726970672607, "logits/rejected": -1.961618185043335, "logps/chosen": -151.58334350585938, "logps/rejected": -194.8992919921875, "loss": 0.5568, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9639384150505066, "rewards/margins": 0.4504380226135254, "rewards/rejected": -1.4143764972686768, "step": 13150 }, { "epoch": 2.2674017918676777, "grad_norm": 17.835615158081055, "learning_rate": 1.7093358991440466e-08, "logits/chosen": -1.9766948223114014, "logits/rejected": -1.945793867111206, "logps/chosen": -164.67056274414062, "logps/rejected": -198.68304443359375, "loss": 0.5775, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0814361572265625, "rewards/margins": 0.3656962215900421, "rewards/rejected": -1.4471323490142822, "step": 13160 }, { "epoch": 2.2691247415575466, "grad_norm": 21.727357864379883, "learning_rate": 1.7017952586435874e-08, "logits/chosen": -2.0284652709960938, "logits/rejected": -2.014500856399536, "logps/chosen": -157.46298217773438, "logps/rejected": -185.32229614257812, "loss": 0.6068, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0364782810211182, "rewards/margins": 0.29229897260665894, "rewards/rejected": -1.3287770748138428, "step": 13170 }, { "epoch": 2.2708476912474156, "grad_norm": 17.38833236694336, "learning_rate": 1.6942678749588263e-08, "logits/chosen": -2.0014171600341797, "logits/rejected": -1.9726762771606445, "logps/chosen": -154.9274444580078, "logps/rejected": -192.7602081298828, "loss": 0.5681, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.015880823135376, "rewards/margins": 0.3989528715610504, "rewards/rejected": -1.414833664894104, "step": 13180 }, { "epoch": 2.2725706409372846, "grad_norm": 18.264333724975586, "learning_rate": 1.686753778345359e-08, "logits/chosen": -2.0150949954986572, "logits/rejected": -1.9915109872817993, "logps/chosen": -159.1856689453125, "logps/rejected": -185.82431030273438, "loss": 0.6098, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0348490476608276, "rewards/margins": 0.3016083240509033, "rewards/rejected": -1.3364574909210205, "step": 13190 }, { "epoch": 2.2742935906271535, "grad_norm": 18.49651527404785, "learning_rate": 1.6792529990053715e-08, "logits/chosen": -2.008836269378662, "logits/rejected": -1.9799654483795166, "logps/chosen": -153.99813842773438, "logps/rejected": -182.87411499023438, "loss": 0.6091, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9913076162338257, "rewards/margins": 0.3085794448852539, "rewards/rejected": -1.2998870611190796, "step": 13200 }, { "epoch": 2.2742935906271535, "eval_logits/chosen": -2.1296310424804688, "eval_logits/rejected": -2.1177053451538086, "eval_logps/chosen": -144.74273681640625, "eval_logps/rejected": -168.11962890625, "eval_loss": 0.6390330791473389, "eval_rewards/accuracies": 0.6326673030853271, "eval_rewards/chosen": -0.8603084087371826, "eval_rewards/margins": 0.18908649682998657, "eval_rewards/rejected": -1.049394965171814, "eval_runtime": 384.5272, "eval_samples_per_second": 11.193, "eval_steps_per_second": 1.399, "step": 13200 }, { "epoch": 2.2760165403170225, "grad_norm": 19.388731002807617, "learning_rate": 1.671765567087523e-08, "logits/chosen": -2.097153663635254, "logits/rejected": -2.0835232734680176, "logps/chosen": -147.59706115722656, "logps/rejected": -180.18310546875, "loss": 0.6095, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9546787142753601, "rewards/margins": 0.3158749043941498, "rewards/rejected": -1.2705535888671875, "step": 13210 }, { "epoch": 2.277739490006892, "grad_norm": 16.894678115844727, "learning_rate": 1.6642915126868203e-08, "logits/chosen": -2.018634080886841, "logits/rejected": -2.002504587173462, "logps/chosen": -155.10250854492188, "logps/rejected": -184.6492462158203, "loss": 0.606, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0010809898376465, "rewards/margins": 0.285371333360672, "rewards/rejected": -1.286452293395996, "step": 13220 }, { "epoch": 2.279462439696761, "grad_norm": 16.523439407348633, "learning_rate": 1.6568308658445064e-08, "logits/chosen": -2.0169358253479004, "logits/rejected": -1.9910701513290405, "logps/chosen": -141.59849548339844, "logps/rejected": -183.8115997314453, "loss": 0.5612, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9070154428482056, "rewards/margins": 0.3991144299507141, "rewards/rejected": -1.306129813194275, "step": 13230 }, { "epoch": 2.28118538938663, "grad_norm": 16.436738967895508, "learning_rate": 1.6493836565479324e-08, "logits/chosen": -2.0390543937683105, "logits/rejected": -2.016165256500244, "logps/chosen": -154.6096649169922, "logps/rejected": -189.32794189453125, "loss": 0.585, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.032711148262024, "rewards/margins": 0.33145707845687866, "rewards/rejected": -1.3641681671142578, "step": 13240 }, { "epoch": 2.282908339076499, "grad_norm": 17.20087432861328, "learning_rate": 1.6419499147304366e-08, "logits/chosen": -2.011890411376953, "logits/rejected": -2.0017952919006348, "logps/chosen": -151.43869018554688, "logps/rejected": -193.16395568847656, "loss": 0.5857, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9976789355278015, "rewards/margins": 0.3742561936378479, "rewards/rejected": -1.3719351291656494, "step": 13250 }, { "epoch": 2.2846312887663682, "grad_norm": 17.669790267944336, "learning_rate": 1.634529670271224e-08, "logits/chosen": -2.1008923053741455, "logits/rejected": -2.0809853076934814, "logps/chosen": -153.43072509765625, "logps/rejected": -199.7514190673828, "loss": 0.5602, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.053886890411377, "rewards/margins": 0.41757315397262573, "rewards/rejected": -1.471459984779358, "step": 13260 }, { "epoch": 2.286354238456237, "grad_norm": 19.183948516845703, "learning_rate": 1.6271229529952563e-08, "logits/chosen": -1.9490960836410522, "logits/rejected": -1.9322376251220703, "logps/chosen": -159.45132446289062, "logps/rejected": -198.96778869628906, "loss": 0.5838, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0490272045135498, "rewards/margins": 0.3933560252189636, "rewards/rejected": -1.4423832893371582, "step": 13270 }, { "epoch": 2.288077188146106, "grad_norm": 16.367591857910156, "learning_rate": 1.619729792673114e-08, "logits/chosen": -2.027101755142212, "logits/rejected": -1.9953157901763916, "logps/chosen": -152.5106201171875, "logps/rejected": -184.47877502441406, "loss": 0.6077, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9881440997123718, "rewards/margins": 0.3434295058250427, "rewards/rejected": -1.331573724746704, "step": 13280 }, { "epoch": 2.289800137835975, "grad_norm": 19.335756301879883, "learning_rate": 1.6123502190208944e-08, "logits/chosen": -2.016515016555786, "logits/rejected": -1.994356393814087, "logps/chosen": -150.08535766601562, "logps/rejected": -188.57496643066406, "loss": 0.5723, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9626877903938293, "rewards/margins": 0.393248051404953, "rewards/rejected": -1.3559359312057495, "step": 13290 }, { "epoch": 2.291523087525844, "grad_norm": 19.086881637573242, "learning_rate": 1.6049842617000826e-08, "logits/chosen": -2.03844952583313, "logits/rejected": -2.0162439346313477, "logps/chosen": -155.529052734375, "logps/rejected": -183.83062744140625, "loss": 0.6213, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0434372425079346, "rewards/margins": 0.27912062406539917, "rewards/rejected": -1.3225579261779785, "step": 13300 }, { "epoch": 2.291523087525844, "eval_logits/chosen": -2.1260769367218018, "eval_logits/rejected": -2.1141231060028076, "eval_logps/chosen": -144.87376403808594, "eval_logps/rejected": -168.20582580566406, "eval_loss": 0.6392676830291748, "eval_rewards/accuracies": 0.6301115155220032, "eval_rewards/chosen": -0.8616187572479248, "eval_rewards/margins": 0.18863816559314728, "eval_rewards/rejected": -1.0502568483352661, "eval_runtime": 384.4074, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.4, "step": 13300 }, { "epoch": 2.293246037215713, "grad_norm": 15.481252670288086, "learning_rate": 1.5976319503174313e-08, "logits/chosen": -2.002896547317505, "logits/rejected": -1.981095552444458, "logps/chosen": -163.01007080078125, "logps/rejected": -194.62033081054688, "loss": 0.5993, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0587632656097412, "rewards/margins": 0.34795278310775757, "rewards/rejected": -1.406715989112854, "step": 13310 }, { "epoch": 2.2949689869055825, "grad_norm": 21.10832405090332, "learning_rate": 1.590293314424846e-08, "logits/chosen": -2.08011531829834, "logits/rejected": -2.041942596435547, "logps/chosen": -163.9292755126953, "logps/rejected": -176.40731811523438, "loss": 0.662, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.0805408954620361, "rewards/margins": 0.20480044186115265, "rewards/rejected": -1.2853412628173828, "step": 13320 }, { "epoch": 2.2966919365954515, "grad_norm": 25.467117309570312, "learning_rate": 1.582968383519267e-08, "logits/chosen": -1.9539167881011963, "logits/rejected": -1.9279472827911377, "logps/chosen": -158.21173095703125, "logps/rejected": -188.57064819335938, "loss": 0.6174, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0213114023208618, "rewards/margins": 0.3650321066379547, "rewards/rejected": -1.3863435983657837, "step": 13330 }, { "epoch": 2.2984148862853204, "grad_norm": 20.097253799438477, "learning_rate": 1.5756571870425485e-08, "logits/chosen": -2.0935399532318115, "logits/rejected": -2.076225519180298, "logps/chosen": -160.75918579101562, "logps/rejected": -187.359375, "loss": 0.6327, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0616037845611572, "rewards/margins": 0.27013906836509705, "rewards/rejected": -1.3317428827285767, "step": 13340 }, { "epoch": 2.3001378359751894, "grad_norm": 18.155202865600586, "learning_rate": 1.568359754381337e-08, "logits/chosen": -2.0476527214050293, "logits/rejected": -2.004943370819092, "logps/chosen": -153.63682556152344, "logps/rejected": -186.45098876953125, "loss": 0.5812, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9670025110244751, "rewards/margins": 0.38068169355392456, "rewards/rejected": -1.3476841449737549, "step": 13350 }, { "epoch": 2.301860785665059, "grad_norm": 22.0352725982666, "learning_rate": 1.5610761148669588e-08, "logits/chosen": -2.046896457672119, "logits/rejected": -2.0261380672454834, "logps/chosen": -162.08412170410156, "logps/rejected": -195.78102111816406, "loss": 0.589, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.047942042350769, "rewards/margins": 0.3743414878845215, "rewards/rejected": -1.4222838878631592, "step": 13360 }, { "epoch": 2.3035837353549278, "grad_norm": 14.853294372558594, "learning_rate": 1.5538062977753007e-08, "logits/chosen": -1.9917491674423218, "logits/rejected": -1.959970474243164, "logps/chosen": -156.3326416015625, "logps/rejected": -190.05731201171875, "loss": 0.6061, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0209949016571045, "rewards/margins": 0.3512216806411743, "rewards/rejected": -1.3722165822982788, "step": 13370 }, { "epoch": 2.3053066850447967, "grad_norm": 17.56020736694336, "learning_rate": 1.5465503323266933e-08, "logits/chosen": -1.962013602256775, "logits/rejected": -1.9290409088134766, "logps/chosen": -160.27578735351562, "logps/rejected": -186.50778198242188, "loss": 0.6128, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0338425636291504, "rewards/margins": 0.29664623737335205, "rewards/rejected": -1.3304888010025024, "step": 13380 }, { "epoch": 2.3070296347346657, "grad_norm": 15.716300964355469, "learning_rate": 1.539308247685787e-08, "logits/chosen": -1.937011957168579, "logits/rejected": -1.8979556560516357, "logps/chosen": -151.38088989257812, "logps/rejected": -182.62875366210938, "loss": 0.5913, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9424049258232117, "rewards/margins": 0.3646990656852722, "rewards/rejected": -1.3071041107177734, "step": 13390 }, { "epoch": 2.3087525844245347, "grad_norm": 13.886823654174805, "learning_rate": 1.532080072961442e-08, "logits/chosen": -1.9972772598266602, "logits/rejected": -1.9663184881210327, "logps/chosen": -148.74501037597656, "logps/rejected": -189.42501831054688, "loss": 0.5545, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9377354383468628, "rewards/margins": 0.44995102286338806, "rewards/rejected": -1.3876866102218628, "step": 13400 }, { "epoch": 2.3087525844245347, "eval_logits/chosen": -2.1350319385528564, "eval_logits/rejected": -2.123056411743164, "eval_logps/chosen": -142.32138061523438, "eval_logps/rejected": -165.27003479003906, "eval_loss": 0.6397269368171692, "eval_rewards/accuracies": 0.6310408711433411, "eval_rewards/chosen": -0.8360949754714966, "eval_rewards/margins": 0.18480420112609863, "eval_rewards/rejected": -1.0208991765975952, "eval_runtime": 384.4278, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.399, "step": 13400 }, { "epoch": 2.3104755341144037, "grad_norm": 13.393156051635742, "learning_rate": 1.5248658372066107e-08, "logits/chosen": -2.0323429107666016, "logits/rejected": -2.0077059268951416, "logps/chosen": -162.0485076904297, "logps/rejected": -192.8089141845703, "loss": 0.6096, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0806329250335693, "rewards/margins": 0.334946870803833, "rewards/rejected": -1.4155797958374023, "step": 13410 }, { "epoch": 2.312198483804273, "grad_norm": 22.341718673706055, "learning_rate": 1.5176655694182156e-08, "logits/chosen": -2.022022008895874, "logits/rejected": -2.008124351501465, "logps/chosen": -155.74832153320312, "logps/rejected": -200.64682006835938, "loss": 0.5683, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0213615894317627, "rewards/margins": 0.421269029378891, "rewards/rejected": -1.4426305294036865, "step": 13420 }, { "epoch": 2.313921433494142, "grad_norm": 17.91521453857422, "learning_rate": 1.5104792985370406e-08, "logits/chosen": -2.0967748165130615, "logits/rejected": -2.0678954124450684, "logps/chosen": -156.0296630859375, "logps/rejected": -193.19207763671875, "loss": 0.5899, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0051010847091675, "rewards/margins": 0.3536093831062317, "rewards/rejected": -1.358710527420044, "step": 13430 }, { "epoch": 2.315644383184011, "grad_norm": 18.987171173095703, "learning_rate": 1.5033070534476055e-08, "logits/chosen": -1.9611173868179321, "logits/rejected": -1.9431546926498413, "logps/chosen": -145.51051330566406, "logps/rejected": -169.82398986816406, "loss": 0.6395, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9330219030380249, "rewards/margins": 0.25060832500457764, "rewards/rejected": -1.1836302280426025, "step": 13440 }, { "epoch": 2.31736733287388, "grad_norm": 14.591676712036133, "learning_rate": 1.4961488629780604e-08, "logits/chosen": -1.9967823028564453, "logits/rejected": -1.9735647439956665, "logps/chosen": -147.49501037597656, "logps/rejected": -179.7422637939453, "loss": 0.6089, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9708964228630066, "rewards/margins": 0.3026094138622284, "rewards/rejected": -1.2735059261322021, "step": 13450 }, { "epoch": 2.3190902825637494, "grad_norm": 14.98277759552002, "learning_rate": 1.489004755900058e-08, "logits/chosen": -2.0261378288269043, "logits/rejected": -1.9909276962280273, "logps/chosen": -142.7022705078125, "logps/rejected": -182.27566528320312, "loss": 0.5517, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8867403268814087, "rewards/margins": 0.4272170960903168, "rewards/rejected": -1.3139574527740479, "step": 13460 }, { "epoch": 2.3208132322536184, "grad_norm": 19.073030471801758, "learning_rate": 1.4818747609286486e-08, "logits/chosen": -1.907549500465393, "logits/rejected": -1.8753025531768799, "logps/chosen": -156.7547149658203, "logps/rejected": -184.81369018554688, "loss": 0.6003, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0169099569320679, "rewards/margins": 0.3250698447227478, "rewards/rejected": -1.34197998046875, "step": 13470 }, { "epoch": 2.3225361819434873, "grad_norm": 26.79694366455078, "learning_rate": 1.4747589067221627e-08, "logits/chosen": -2.0537400245666504, "logits/rejected": -2.0167617797851562, "logps/chosen": -148.76553344726562, "logps/rejected": -182.18597412109375, "loss": 0.5803, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9594322443008423, "rewards/margins": 0.34842368960380554, "rewards/rejected": -1.3078558444976807, "step": 13480 }, { "epoch": 2.3242591316333563, "grad_norm": 15.910707473754883, "learning_rate": 1.4676572218820831e-08, "logits/chosen": -2.019862174987793, "logits/rejected": -1.9907255172729492, "logps/chosen": -160.828857421875, "logps/rejected": -200.10427856445312, "loss": 0.5812, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.069057583808899, "rewards/margins": 0.39549151062965393, "rewards/rejected": -1.46454918384552, "step": 13490 }, { "epoch": 2.3259820813232253, "grad_norm": 15.987600326538086, "learning_rate": 1.4605697349529494e-08, "logits/chosen": -2.0376546382904053, "logits/rejected": -2.003347873687744, "logps/chosen": -157.43568420410156, "logps/rejected": -195.1698455810547, "loss": 0.5633, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0176358222961426, "rewards/margins": 0.415839821100235, "rewards/rejected": -1.4334756135940552, "step": 13500 }, { "epoch": 2.3259820813232253, "eval_logits/chosen": -2.130098819732666, "eval_logits/rejected": -2.118140697479248, "eval_logps/chosen": -143.9754638671875, "eval_logps/rejected": -167.23570251464844, "eval_loss": 0.639247715473175, "eval_rewards/accuracies": 0.633596658706665, "eval_rewards/chosen": -0.8526357412338257, "eval_rewards/margins": 0.18792006373405457, "eval_rewards/rejected": -1.0405558347702026, "eval_runtime": 384.9137, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 13500 }, { "epoch": 2.3277050310130942, "grad_norm": 16.240617752075195, "learning_rate": 1.4534964744222339e-08, "logits/chosen": -1.972324013710022, "logits/rejected": -1.963090181350708, "logps/chosen": -145.39578247070312, "logps/rejected": -183.06991577148438, "loss": 0.5923, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9605167508125305, "rewards/margins": 0.33515670895576477, "rewards/rejected": -1.2956736087799072, "step": 13510 }, { "epoch": 2.3294279807029636, "grad_norm": 16.749074935913086, "learning_rate": 1.4464374687202224e-08, "logits/chosen": -1.8951972723007202, "logits/rejected": -1.8748648166656494, "logps/chosen": -153.66552734375, "logps/rejected": -194.826416015625, "loss": 0.5727, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0282341241836548, "rewards/margins": 0.40179547667503357, "rewards/rejected": -1.4300296306610107, "step": 13520 }, { "epoch": 2.3311509303928326, "grad_norm": 18.259716033935547, "learning_rate": 1.4393927462199062e-08, "logits/chosen": -2.0294575691223145, "logits/rejected": -1.999010682106018, "logps/chosen": -157.4952392578125, "logps/rejected": -179.57333374023438, "loss": 0.6207, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0256043672561646, "rewards/margins": 0.24991169571876526, "rewards/rejected": -1.275516152381897, "step": 13530 }, { "epoch": 2.3328738800827016, "grad_norm": 15.883688926696777, "learning_rate": 1.4323623352368691e-08, "logits/chosen": -1.942350149154663, "logits/rejected": -1.9115841388702393, "logps/chosen": -156.84133911132812, "logps/rejected": -184.58383178710938, "loss": 0.5947, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0108706951141357, "rewards/margins": 0.3316653370857239, "rewards/rejected": -1.3425359725952148, "step": 13540 }, { "epoch": 2.3345968297725705, "grad_norm": 15.075319290161133, "learning_rate": 1.4253462640291708e-08, "logits/chosen": -2.0009243488311768, "logits/rejected": -1.9776980876922607, "logps/chosen": -153.23745727539062, "logps/rejected": -186.21145629882812, "loss": 0.5978, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0216188430786133, "rewards/margins": 0.3358477056026459, "rewards/rejected": -1.357466459274292, "step": 13550 }, { "epoch": 2.3363197794624395, "grad_norm": 23.074697494506836, "learning_rate": 1.4183445607972299e-08, "logits/chosen": -2.0485198497772217, "logits/rejected": -2.037168025970459, "logps/chosen": -151.03939819335938, "logps/rejected": -179.73341369628906, "loss": 0.6318, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9932451248168945, "rewards/margins": 0.27533403038978577, "rewards/rejected": -1.268579125404358, "step": 13560 }, { "epoch": 2.338042729152309, "grad_norm": 19.367040634155273, "learning_rate": 1.4113572536837192e-08, "logits/chosen": -1.9444500207901, "logits/rejected": -1.9214109182357788, "logps/chosen": -154.20669555664062, "logps/rejected": -191.89852905273438, "loss": 0.5783, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0325627326965332, "rewards/margins": 0.37548479437828064, "rewards/rejected": -1.4080475568771362, "step": 13570 }, { "epoch": 2.339765678842178, "grad_norm": 18.08199119567871, "learning_rate": 1.4043843707734448e-08, "logits/chosen": -1.989943265914917, "logits/rejected": -1.9770567417144775, "logps/chosen": -146.2432403564453, "logps/rejected": -182.78079223632812, "loss": 0.5861, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9466387629508972, "rewards/margins": 0.34606269001960754, "rewards/rejected": -1.292701244354248, "step": 13580 }, { "epoch": 2.341488628532047, "grad_norm": 16.135709762573242, "learning_rate": 1.3974259400932348e-08, "logits/chosen": -1.960688829421997, "logits/rejected": -1.9561231136322021, "logps/chosen": -155.49755859375, "logps/rejected": -193.28964233398438, "loss": 0.586, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0300614833831787, "rewards/margins": 0.36471375823020935, "rewards/rejected": -1.394775152206421, "step": 13590 }, { "epoch": 2.343211578221916, "grad_norm": 26.572298049926758, "learning_rate": 1.3904819896118314e-08, "logits/chosen": -1.9783565998077393, "logits/rejected": -1.9559013843536377, "logps/chosen": -164.37818908691406, "logps/rejected": -193.47348022460938, "loss": 0.5982, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0572301149368286, "rewards/margins": 0.3041311800479889, "rewards/rejected": -1.3613612651824951, "step": 13600 }, { "epoch": 2.343211578221916, "eval_logits/chosen": -2.1255154609680176, "eval_logits/rejected": -2.113429069519043, "eval_logps/chosen": -144.1549072265625, "eval_logps/rejected": -167.4862060546875, "eval_loss": 0.639102041721344, "eval_rewards/accuracies": 0.6319702863693237, "eval_rewards/chosen": -0.8544301986694336, "eval_rewards/margins": 0.18863078951835632, "eval_rewards/rejected": -1.0430610179901123, "eval_runtime": 384.7333, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 13600 }, { "epoch": 2.344934527911785, "grad_norm": 18.47875213623047, "learning_rate": 1.3835525472397747e-08, "logits/chosen": -2.1500205993652344, "logits/rejected": -2.1270811557769775, "logps/chosen": -148.36859130859375, "logps/rejected": -180.4636688232422, "loss": 0.606, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9556807279586792, "rewards/margins": 0.31611624360084534, "rewards/rejected": -1.2717970609664917, "step": 13610 }, { "epoch": 2.346657477601654, "grad_norm": 18.58573341369629, "learning_rate": 1.376637640829289e-08, "logits/chosen": -2.0150363445281982, "logits/rejected": -1.9701687097549438, "logps/chosen": -155.78707885742188, "logps/rejected": -190.22085571289062, "loss": 0.5752, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9964166879653931, "rewards/margins": 0.3892660439014435, "rewards/rejected": -1.3856825828552246, "step": 13620 }, { "epoch": 2.348380427291523, "grad_norm": 18.710498809814453, "learning_rate": 1.3697372981741707e-08, "logits/chosen": -1.9564098119735718, "logits/rejected": -1.913854956626892, "logps/chosen": -158.7633514404297, "logps/rejected": -196.12355041503906, "loss": 0.5628, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0405049324035645, "rewards/margins": 0.41902294754981995, "rewards/rejected": -1.4595280885696411, "step": 13630 }, { "epoch": 2.350103376981392, "grad_norm": 13.309480667114258, "learning_rate": 1.362851547009684e-08, "logits/chosen": -1.9671580791473389, "logits/rejected": -1.94460928440094, "logps/chosen": -155.10699462890625, "logps/rejected": -191.30538940429688, "loss": 0.5832, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0025789737701416, "rewards/margins": 0.36596331000328064, "rewards/rejected": -1.3685424327850342, "step": 13640 }, { "epoch": 2.351826326671261, "grad_norm": 17.40332794189453, "learning_rate": 1.3559804150124421e-08, "logits/chosen": -2.048649311065674, "logits/rejected": -2.0223517417907715, "logps/chosen": -155.4260711669922, "logps/rejected": -190.3490447998047, "loss": 0.5773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0141645669937134, "rewards/margins": 0.3723183572292328, "rewards/rejected": -1.3864829540252686, "step": 13650 }, { "epoch": 2.35354927636113, "grad_norm": 17.542123794555664, "learning_rate": 1.3491239298002954e-08, "logits/chosen": -1.9272973537445068, "logits/rejected": -1.9010194540023804, "logps/chosen": -151.72872924804688, "logps/rejected": -183.93618774414062, "loss": 0.6019, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9869184494018555, "rewards/margins": 0.31352168321609497, "rewards/rejected": -1.3004400730133057, "step": 13660 }, { "epoch": 2.3552722260509995, "grad_norm": 26.70223617553711, "learning_rate": 1.3422821189322231e-08, "logits/chosen": -2.041295289993286, "logits/rejected": -2.0157735347747803, "logps/chosen": -157.9037628173828, "logps/rejected": -189.9707489013672, "loss": 0.6024, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.031822681427002, "rewards/margins": 0.32137739658355713, "rewards/rejected": -1.353200078010559, "step": 13670 }, { "epoch": 2.3569951757408685, "grad_norm": 16.37574577331543, "learning_rate": 1.3354550099082256e-08, "logits/chosen": -2.0462825298309326, "logits/rejected": -2.0164105892181396, "logps/chosen": -151.8605499267578, "logps/rejected": -177.0863037109375, "loss": 0.6039, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9721083641052246, "rewards/margins": 0.2905394434928894, "rewards/rejected": -1.2626478672027588, "step": 13680 }, { "epoch": 2.3587181254307374, "grad_norm": 14.60448169708252, "learning_rate": 1.3286426301692105e-08, "logits/chosen": -2.019700527191162, "logits/rejected": -1.9868907928466797, "logps/chosen": -151.9207000732422, "logps/rejected": -192.3456268310547, "loss": 0.5928, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9810474514961243, "rewards/margins": 0.3942745327949524, "rewards/rejected": -1.3753221035003662, "step": 13690 }, { "epoch": 2.3604410751206064, "grad_norm": 17.9910831451416, "learning_rate": 1.321845007096879e-08, "logits/chosen": -2.0158298015594482, "logits/rejected": -1.9829661846160889, "logps/chosen": -157.7763671875, "logps/rejected": -181.74862670898438, "loss": 0.6165, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9876109957695007, "rewards/margins": 0.2986479103565216, "rewards/rejected": -1.2862588167190552, "step": 13700 }, { "epoch": 2.3604410751206064, "eval_logits/chosen": -2.122077226638794, "eval_logits/rejected": -2.109849214553833, "eval_logps/chosen": -144.521728515625, "eval_logps/rejected": -167.927734375, "eval_loss": 0.6390398144721985, "eval_rewards/accuracies": 0.633596658706665, "eval_rewards/chosen": -0.8580982685089111, "eval_rewards/margins": 0.18937797844409943, "eval_rewards/rejected": -1.0474762916564941, "eval_runtime": 384.5775, "eval_samples_per_second": 11.192, "eval_steps_per_second": 1.399, "step": 13700 }, { "epoch": 2.3621640248104754, "grad_norm": 19.00145149230957, "learning_rate": 1.3150621680136197e-08, "logits/chosen": -1.9748674631118774, "logits/rejected": -1.9406923055648804, "logps/chosen": -149.32943725585938, "logps/rejected": -188.4552459716797, "loss": 0.5713, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9583326578140259, "rewards/margins": 0.3872007131576538, "rewards/rejected": -1.3455334901809692, "step": 13710 }, { "epoch": 2.3638869745003444, "grad_norm": 20.39362907409668, "learning_rate": 1.3082941401824027e-08, "logits/chosen": -1.9518674612045288, "logits/rejected": -1.9131473302841187, "logps/chosen": -148.49801635742188, "logps/rejected": -179.29251098632812, "loss": 0.6003, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9409820437431335, "rewards/margins": 0.3274959623813629, "rewards/rejected": -1.2684780359268188, "step": 13720 }, { "epoch": 2.3656099241902138, "grad_norm": 21.57988739013672, "learning_rate": 1.30154095080666e-08, "logits/chosen": -2.104325294494629, "logits/rejected": -2.075873851776123, "logps/chosen": -151.66932678222656, "logps/rejected": -196.09866333007812, "loss": 0.5697, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9694169759750366, "rewards/margins": 0.40838488936424255, "rewards/rejected": -1.3778020143508911, "step": 13730 }, { "epoch": 2.3673328738800827, "grad_norm": 14.81605052947998, "learning_rate": 1.2948026270301853e-08, "logits/chosen": -2.0773396492004395, "logits/rejected": -2.0453059673309326, "logps/chosen": -150.8756103515625, "logps/rejected": -189.0476837158203, "loss": 0.5544, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.931871235370636, "rewards/margins": 0.4239043593406677, "rewards/rejected": -1.3557754755020142, "step": 13740 }, { "epoch": 2.3690558235699517, "grad_norm": 20.967859268188477, "learning_rate": 1.2880791959370235e-08, "logits/chosen": -2.028982639312744, "logits/rejected": -2.0015342235565186, "logps/chosen": -163.1125030517578, "logps/rejected": -199.9310760498047, "loss": 0.5864, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0688209533691406, "rewards/margins": 0.38359832763671875, "rewards/rejected": -1.4524192810058594, "step": 13750 }, { "epoch": 2.3707787732598207, "grad_norm": 24.959444046020508, "learning_rate": 1.2813706845513556e-08, "logits/chosen": -2.0903074741363525, "logits/rejected": -2.0592691898345947, "logps/chosen": -151.02227783203125, "logps/rejected": -181.03770446777344, "loss": 0.601, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9661828875541687, "rewards/margins": 0.3374853730201721, "rewards/rejected": -1.3036682605743408, "step": 13760 }, { "epoch": 2.37250172294969, "grad_norm": 18.076208114624023, "learning_rate": 1.274677119837393e-08, "logits/chosen": -2.0862984657287598, "logits/rejected": -2.0729031562805176, "logps/chosen": -152.1778564453125, "logps/rejected": -192.85093688964844, "loss": 0.5773, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0140810012817383, "rewards/margins": 0.36892393231391907, "rewards/rejected": -1.383004903793335, "step": 13770 }, { "epoch": 2.374224672639559, "grad_norm": 13.1267671585083, "learning_rate": 1.2679985286992762e-08, "logits/chosen": -2.140918016433716, "logits/rejected": -2.0959179401397705, "logps/chosen": -160.25247192382812, "logps/rejected": -189.6563262939453, "loss": 0.5856, "rewards/accuracies": 0.75, "rewards/chosen": -1.0143978595733643, "rewards/margins": 0.36376023292541504, "rewards/rejected": -1.3781578540802002, "step": 13780 }, { "epoch": 2.375947622329428, "grad_norm": 16.864328384399414, "learning_rate": 1.2613349379809596e-08, "logits/chosen": -2.0300323963165283, "logits/rejected": -2.0034685134887695, "logps/chosen": -158.2185516357422, "logps/rejected": -191.93429565429688, "loss": 0.5928, "rewards/accuracies": 0.6875, "rewards/chosen": -1.017733097076416, "rewards/margins": 0.36832427978515625, "rewards/rejected": -1.3860573768615723, "step": 13790 }, { "epoch": 2.377670572019297, "grad_norm": 22.573379516601562, "learning_rate": 1.2546863744660975e-08, "logits/chosen": -2.0864007472991943, "logits/rejected": -2.051849842071533, "logps/chosen": -160.9171142578125, "logps/rejected": -192.89340209960938, "loss": 0.5863, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0360805988311768, "rewards/margins": 0.36966273188591003, "rewards/rejected": -1.4057433605194092, "step": 13800 }, { "epoch": 2.377670572019297, "eval_logits/chosen": -2.123253345489502, "eval_logits/rejected": -2.1111674308776855, "eval_logps/chosen": -143.51416015625, "eval_logps/rejected": -166.79006958007812, "eval_loss": 0.6392779350280762, "eval_rewards/accuracies": 0.6322026252746582, "eval_rewards/chosen": -0.8480228185653687, "eval_rewards/margins": 0.18807663023471832, "eval_rewards/rejected": -1.0360993146896362, "eval_runtime": 384.1952, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 13800 }, { "epoch": 2.379393521709166, "grad_norm": 16.58545684814453, "learning_rate": 1.2480528648779532e-08, "logits/chosen": -1.9805755615234375, "logits/rejected": -1.9594581127166748, "logps/chosen": -146.25344848632812, "logps/rejected": -185.53028869628906, "loss": 0.5842, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9401553869247437, "rewards/margins": 0.37683534622192383, "rewards/rejected": -1.3169907331466675, "step": 13810 }, { "epoch": 2.381116471399035, "grad_norm": 20.78591537475586, "learning_rate": 1.2414344358792784e-08, "logits/chosen": -2.0741939544677734, "logits/rejected": -2.042043685913086, "logps/chosen": -165.21682739257812, "logps/rejected": -190.59774780273438, "loss": 0.6068, "rewards/accuracies": 0.6875, "rewards/chosen": -1.046770453453064, "rewards/margins": 0.32175150513648987, "rewards/rejected": -1.368522047996521, "step": 13820 }, { "epoch": 2.3828394210889043, "grad_norm": 16.735506057739258, "learning_rate": 1.2348311140722079e-08, "logits/chosen": -2.09040904045105, "logits/rejected": -2.074305295944214, "logps/chosen": -151.49021911621094, "logps/rejected": -180.3469696044922, "loss": 0.5934, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9646391868591309, "rewards/margins": 0.3140520453453064, "rewards/rejected": -1.2786911725997925, "step": 13830 }, { "epoch": 2.3845623707787733, "grad_norm": 19.62781524658203, "learning_rate": 1.2282429259981597e-08, "logits/chosen": -2.054668426513672, "logits/rejected": -2.0238184928894043, "logps/chosen": -157.5996856689453, "logps/rejected": -180.1737060546875, "loss": 0.6173, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0088542699813843, "rewards/margins": 0.27592530846595764, "rewards/rejected": -1.2847795486450195, "step": 13840 }, { "epoch": 2.3862853204686423, "grad_norm": 17.908798217773438, "learning_rate": 1.221669898137716e-08, "logits/chosen": -1.9645780324935913, "logits/rejected": -1.9352012872695923, "logps/chosen": -155.58758544921875, "logps/rejected": -181.04794311523438, "loss": 0.6164, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.014997959136963, "rewards/margins": 0.29814934730529785, "rewards/rejected": -1.3131473064422607, "step": 13850 }, { "epoch": 2.3880082701585112, "grad_norm": 19.479312896728516, "learning_rate": 1.2151120569105316e-08, "logits/chosen": -1.9867212772369385, "logits/rejected": -1.9653675556182861, "logps/chosen": -152.99612426757812, "logps/rejected": -188.34559631347656, "loss": 0.5814, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0142691135406494, "rewards/margins": 0.34864503145217896, "rewards/rejected": -1.362913966178894, "step": 13860 }, { "epoch": 2.3897312198483807, "grad_norm": 18.023334503173828, "learning_rate": 1.208569428675214e-08, "logits/chosen": -2.048905849456787, "logits/rejected": -2.0258002281188965, "logps/chosen": -164.04400634765625, "logps/rejected": -199.03273010253906, "loss": 0.5957, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0920708179473877, "rewards/margins": 0.37155622243881226, "rewards/rejected": -1.4636269807815552, "step": 13870 }, { "epoch": 2.3914541695382496, "grad_norm": 19.91959571838379, "learning_rate": 1.2020420397292285e-08, "logits/chosen": -1.9922428131103516, "logits/rejected": -1.9501075744628906, "logps/chosen": -157.5734100341797, "logps/rejected": -192.80783081054688, "loss": 0.5984, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0311752557754517, "rewards/margins": 0.37342900037765503, "rewards/rejected": -1.404604196548462, "step": 13880 }, { "epoch": 2.3931771192281186, "grad_norm": 14.596778869628906, "learning_rate": 1.1955299163087818e-08, "logits/chosen": -2.017054796218872, "logits/rejected": -1.9956953525543213, "logps/chosen": -160.2853240966797, "logps/rejected": -188.4384765625, "loss": 0.6084, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0466159582138062, "rewards/margins": 0.3001475930213928, "rewards/rejected": -1.3467636108398438, "step": 13890 }, { "epoch": 2.3949000689179876, "grad_norm": 15.902459144592285, "learning_rate": 1.1890330845887292e-08, "logits/chosen": -1.9512453079223633, "logits/rejected": -1.924924612045288, "logps/chosen": -150.42723083496094, "logps/rejected": -176.5133819580078, "loss": 0.6023, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9697845578193665, "rewards/margins": 0.29802370071411133, "rewards/rejected": -1.2678083181381226, "step": 13900 }, { "epoch": 2.3949000689179876, "eval_logits/chosen": -2.1268839836120605, "eval_logits/rejected": -2.1148407459259033, "eval_logps/chosen": -142.16603088378906, "eval_logps/rejected": -165.2496795654297, "eval_loss": 0.6394680142402649, "eval_rewards/accuracies": 0.6322026252746582, "eval_rewards/chosen": -0.8345414400100708, "eval_rewards/margins": 0.1861540526151657, "eval_rewards/rejected": -1.0206955671310425, "eval_runtime": 384.2999, "eval_samples_per_second": 11.2, "eval_steps_per_second": 1.4, "step": 13900 }, { "epoch": 2.3966230186078565, "grad_norm": 14.8185453414917, "learning_rate": 1.1825515706824563e-08, "logits/chosen": -1.986510992050171, "logits/rejected": -1.961790680885315, "logps/chosen": -150.84820556640625, "logps/rejected": -178.35220336914062, "loss": 0.5935, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.94154292345047, "rewards/margins": 0.3133096396923065, "rewards/rejected": -1.2548526525497437, "step": 13910 }, { "epoch": 2.3983459682977255, "grad_norm": 18.360509872436523, "learning_rate": 1.1760854006417848e-08, "logits/chosen": -2.035714626312256, "logits/rejected": -1.9925167560577393, "logps/chosen": -159.39645385742188, "logps/rejected": -189.31198120117188, "loss": 0.5693, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.991812527179718, "rewards/margins": 0.38306179642677307, "rewards/rejected": -1.3748743534088135, "step": 13920 }, { "epoch": 2.400068917987595, "grad_norm": 18.657060623168945, "learning_rate": 1.1696346004568597e-08, "logits/chosen": -1.9965635538101196, "logits/rejected": -1.9670436382293701, "logps/chosen": -143.44058227539062, "logps/rejected": -172.3385009765625, "loss": 0.6085, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9190446734428406, "rewards/margins": 0.2936355173587799, "rewards/rejected": -1.2126802206039429, "step": 13930 }, { "epoch": 2.401791867677464, "grad_norm": 16.701251983642578, "learning_rate": 1.1631991960560494e-08, "logits/chosen": -1.94890558719635, "logits/rejected": -1.9219639301300049, "logps/chosen": -143.83834838867188, "logps/rejected": -192.53492736816406, "loss": 0.5591, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9068156480789185, "rewards/margins": 0.4580385088920593, "rewards/rejected": -1.364854097366333, "step": 13940 }, { "epoch": 2.403514817367333, "grad_norm": 18.006498336791992, "learning_rate": 1.1567792133058418e-08, "logits/chosen": -2.0381360054016113, "logits/rejected": -2.00884747505188, "logps/chosen": -154.49293518066406, "logps/rejected": -183.24496459960938, "loss": 0.6173, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0042588710784912, "rewards/margins": 0.30468565225601196, "rewards/rejected": -1.3089444637298584, "step": 13950 }, { "epoch": 2.405237767057202, "grad_norm": 17.899837493896484, "learning_rate": 1.1503746780107394e-08, "logits/chosen": -1.8896019458770752, "logits/rejected": -1.881241798400879, "logps/chosen": -154.44520568847656, "logps/rejected": -178.41546630859375, "loss": 0.6192, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.010117530822754, "rewards/margins": 0.2451154738664627, "rewards/rejected": -1.2552330493927002, "step": 13960 }, { "epoch": 2.406960716747071, "grad_norm": 14.456962585449219, "learning_rate": 1.1439856159131528e-08, "logits/chosen": -1.9930963516235352, "logits/rejected": -1.9641220569610596, "logps/chosen": -159.3346710205078, "logps/rejected": -188.92762756347656, "loss": 0.597, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0080400705337524, "rewards/margins": 0.3513377010822296, "rewards/rejected": -1.3593778610229492, "step": 13970 }, { "epoch": 2.40868366643694, "grad_norm": 16.631418228149414, "learning_rate": 1.1376120526932987e-08, "logits/chosen": -1.9666211605072021, "logits/rejected": -1.9483652114868164, "logps/chosen": -151.54281616210938, "logps/rejected": -191.87307739257812, "loss": 0.5646, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9695194363594055, "rewards/margins": 0.39907148480415344, "rewards/rejected": -1.368591070175171, "step": 13980 }, { "epoch": 2.410406616126809, "grad_norm": 26.594629287719727, "learning_rate": 1.1312540139691012e-08, "logits/chosen": -2.0047948360443115, "logits/rejected": -1.9720453023910522, "logps/chosen": -156.214599609375, "logps/rejected": -186.14004516601562, "loss": 0.5926, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9914728999137878, "rewards/margins": 0.3394834101200104, "rewards/rejected": -1.330956220626831, "step": 13990 }, { "epoch": 2.412129565816678, "grad_norm": 19.11248207092285, "learning_rate": 1.1249115252960845e-08, "logits/chosen": -1.9239094257354736, "logits/rejected": -1.8979803323745728, "logps/chosen": -151.89559936523438, "logps/rejected": -196.10821533203125, "loss": 0.551, "rewards/accuracies": 0.75, "rewards/chosen": -1.0082547664642334, "rewards/margins": 0.4158460199832916, "rewards/rejected": -1.4241007566452026, "step": 14000 }, { "epoch": 2.412129565816678, "eval_logits/chosen": -2.1225693225860596, "eval_logits/rejected": -2.1104423999786377, "eval_logps/chosen": -143.11300659179688, "eval_logps/rejected": -166.4650115966797, "eval_loss": 0.6389403343200684, "eval_rewards/accuracies": 0.6331319808959961, "eval_rewards/chosen": -0.8440110683441162, "eval_rewards/margins": 0.188837930560112, "eval_rewards/rejected": -1.0328489542007446, "eval_runtime": 384.3006, "eval_samples_per_second": 11.2, "eval_steps_per_second": 1.4, "step": 14000 }, { "epoch": 2.413852515506547, "grad_norm": 20.82906723022461, "learning_rate": 1.1185846121672677e-08, "logits/chosen": -1.9620907306671143, "logits/rejected": -1.944170355796814, "logps/chosen": -156.39244079589844, "logps/rejected": -181.0838623046875, "loss": 0.635, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0100760459899902, "rewards/margins": 0.27003151178359985, "rewards/rejected": -1.2801077365875244, "step": 14010 }, { "epoch": 2.415575465196416, "grad_norm": 16.65471839904785, "learning_rate": 1.1122733000130697e-08, "logits/chosen": -1.9725452661514282, "logits/rejected": -1.9616895914077759, "logps/chosen": -160.29861450195312, "logps/rejected": -179.5715789794922, "loss": 0.6501, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0577062368392944, "rewards/margins": 0.22781126201152802, "rewards/rejected": -1.2855174541473389, "step": 14020 }, { "epoch": 2.4172984148862855, "grad_norm": 20.01896095275879, "learning_rate": 1.1059776142011995e-08, "logits/chosen": -1.9646918773651123, "logits/rejected": -1.9402368068695068, "logps/chosen": -159.73370361328125, "logps/rejected": -183.8528594970703, "loss": 0.6203, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.058144211769104, "rewards/margins": 0.287528395652771, "rewards/rejected": -1.345672845840454, "step": 14030 }, { "epoch": 2.4190213645761545, "grad_norm": 22.854854583740234, "learning_rate": 1.0996975800365577e-08, "logits/chosen": -1.9474046230316162, "logits/rejected": -1.897743582725525, "logps/chosen": -154.50302124023438, "logps/rejected": -187.38052368164062, "loss": 0.5706, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9782811999320984, "rewards/margins": 0.39642828702926636, "rewards/rejected": -1.3747094869613647, "step": 14040 }, { "epoch": 2.4207443142660234, "grad_norm": 15.219006538391113, "learning_rate": 1.0934332227611365e-08, "logits/chosen": -2.026057481765747, "logits/rejected": -1.990783452987671, "logps/chosen": -146.36672973632812, "logps/rejected": -170.61947631835938, "loss": 0.6094, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9137172698974609, "rewards/margins": 0.28696709871292114, "rewards/rejected": -1.2006843090057373, "step": 14050 }, { "epoch": 2.4224672639558924, "grad_norm": 15.002602577209473, "learning_rate": 1.0871845675539166e-08, "logits/chosen": -2.052274227142334, "logits/rejected": -2.0151865482330322, "logps/chosen": -149.68687438964844, "logps/rejected": -188.11338806152344, "loss": 0.5632, "rewards/accuracies": 0.71875, "rewards/chosen": -0.944464385509491, "rewards/margins": 0.41930341720581055, "rewards/rejected": -1.3637678623199463, "step": 14060 }, { "epoch": 2.4241902136457614, "grad_norm": 18.0310001373291, "learning_rate": 1.0809516395307644e-08, "logits/chosen": -1.9927473068237305, "logits/rejected": -1.9632790088653564, "logps/chosen": -155.531982421875, "logps/rejected": -196.14749145507812, "loss": 0.572, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0092281103134155, "rewards/margins": 0.3949144184589386, "rewards/rejected": -1.4041424989700317, "step": 14070 }, { "epoch": 2.425913163335631, "grad_norm": 15.924246788024902, "learning_rate": 1.07473446374433e-08, "logits/chosen": -1.9561008214950562, "logits/rejected": -1.9251344203948975, "logps/chosen": -168.65504455566406, "logps/rejected": -191.71324157714844, "loss": 0.6314, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1003005504608154, "rewards/margins": 0.2860051691532135, "rewards/rejected": -1.386305570602417, "step": 14080 }, { "epoch": 2.4276361130254998, "grad_norm": 17.289323806762695, "learning_rate": 1.0685330651839542e-08, "logits/chosen": -1.970725655555725, "logits/rejected": -1.9323031902313232, "logps/chosen": -157.32803344726562, "logps/rejected": -181.59481811523438, "loss": 0.616, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0144267082214355, "rewards/margins": 0.2824392020702362, "rewards/rejected": -1.2968658208847046, "step": 14090 }, { "epoch": 2.4293590627153687, "grad_norm": 14.066096305847168, "learning_rate": 1.0623474687755607e-08, "logits/chosen": -2.043682336807251, "logits/rejected": -2.005585193634033, "logps/chosen": -155.64065551757812, "logps/rejected": -193.1170654296875, "loss": 0.565, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9713875651359558, "rewards/margins": 0.41372281312942505, "rewards/rejected": -1.3851104974746704, "step": 14100 }, { "epoch": 2.4293590627153687, "eval_logits/chosen": -2.12381649017334, "eval_logits/rejected": -2.111618757247925, "eval_logps/chosen": -142.63906860351562, "eval_logps/rejected": -165.84359741210938, "eval_loss": 0.6394081711769104, "eval_rewards/accuracies": 0.6322026252746582, "eval_rewards/chosen": -0.839271605014801, "eval_rewards/margins": 0.1873631775379181, "eval_rewards/rejected": -1.026634931564331, "eval_runtime": 384.4292, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.399, "step": 14100 }, { "epoch": 2.4310820124052377, "grad_norm": 17.91411018371582, "learning_rate": 1.0561776993815563e-08, "logits/chosen": -2.015458345413208, "logits/rejected": -1.9985685348510742, "logps/chosen": -148.95152282714844, "logps/rejected": -183.5349884033203, "loss": 0.6011, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9989355206489563, "rewards/margins": 0.321287602186203, "rewards/rejected": -1.320223093032837, "step": 14110 }, { "epoch": 2.4328049620951067, "grad_norm": 20.556859970092773, "learning_rate": 1.0500237818007318e-08, "logits/chosen": -2.011336326599121, "logits/rejected": -1.984691858291626, "logps/chosen": -150.89601135253906, "logps/rejected": -180.69509887695312, "loss": 0.6038, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9859260320663452, "rewards/margins": 0.2964496612548828, "rewards/rejected": -1.282375693321228, "step": 14120 }, { "epoch": 2.4345279117849756, "grad_norm": 22.44071388244629, "learning_rate": 1.0438857407681683e-08, "logits/chosen": -2.0367536544799805, "logits/rejected": -2.0129051208496094, "logps/chosen": -151.6571044921875, "logps/rejected": -173.5016632080078, "loss": 0.624, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9739078283309937, "rewards/margins": 0.24007806181907654, "rewards/rejected": -1.2139859199523926, "step": 14130 }, { "epoch": 2.436250861474845, "grad_norm": 18.632469177246094, "learning_rate": 1.0377636009551271e-08, "logits/chosen": -2.0937840938568115, "logits/rejected": -2.0704216957092285, "logps/chosen": -156.42971801757812, "logps/rejected": -189.91531372070312, "loss": 0.5969, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0387742519378662, "rewards/margins": 0.34860479831695557, "rewards/rejected": -1.3873790502548218, "step": 14140 }, { "epoch": 2.437973811164714, "grad_norm": 21.75985336303711, "learning_rate": 1.0316573869689605e-08, "logits/chosen": -2.05063796043396, "logits/rejected": -2.0307650566101074, "logps/chosen": -154.0839080810547, "logps/rejected": -191.13221740722656, "loss": 0.5835, "rewards/accuracies": 0.71875, "rewards/chosen": -1.001259446144104, "rewards/margins": 0.35263198614120483, "rewards/rejected": -1.3538914918899536, "step": 14150 }, { "epoch": 2.439696760854583, "grad_norm": 15.552751541137695, "learning_rate": 1.025567123353004e-08, "logits/chosen": -1.9948619604110718, "logits/rejected": -1.9734423160552979, "logps/chosen": -153.45443725585938, "logps/rejected": -192.1945343017578, "loss": 0.5816, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0015238523483276, "rewards/margins": 0.35632625222206116, "rewards/rejected": -1.3578500747680664, "step": 14160 }, { "epoch": 2.441419710544452, "grad_norm": 26.254636764526367, "learning_rate": 1.0194928345864867e-08, "logits/chosen": -1.9262346029281616, "logits/rejected": -1.8959357738494873, "logps/chosen": -159.6402130126953, "logps/rejected": -183.0159149169922, "loss": 0.6367, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0405998229980469, "rewards/margins": 0.2549907863140106, "rewards/rejected": -1.2955906391143799, "step": 14170 }, { "epoch": 2.4431426602343214, "grad_norm": 21.57680320739746, "learning_rate": 1.0134345450844245e-08, "logits/chosen": -1.9975277185440063, "logits/rejected": -1.9621978998184204, "logps/chosen": -162.57911682128906, "logps/rejected": -185.11489868164062, "loss": 0.6114, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0512542724609375, "rewards/margins": 0.28474700450897217, "rewards/rejected": -1.3360012769699097, "step": 14180 }, { "epoch": 2.4448656099241903, "grad_norm": 20.588884353637695, "learning_rate": 1.0073922791975276e-08, "logits/chosen": -2.049851417541504, "logits/rejected": -2.0274546146392822, "logps/chosen": -165.40493774414062, "logps/rejected": -189.0166015625, "loss": 0.6264, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0652960538864136, "rewards/margins": 0.28536009788513184, "rewards/rejected": -1.3506559133529663, "step": 14190 }, { "epoch": 2.4465885596140593, "grad_norm": 17.678224563598633, "learning_rate": 1.0013660612121034e-08, "logits/chosen": -1.8968816995620728, "logits/rejected": -1.8697046041488647, "logps/chosen": -148.82977294921875, "logps/rejected": -191.06394958496094, "loss": 0.555, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.952100932598114, "rewards/margins": 0.4132204055786133, "rewards/rejected": -1.3653212785720825, "step": 14200 }, { "epoch": 2.4465885596140593, "eval_logits/chosen": -2.1250712871551514, "eval_logits/rejected": -2.1129205226898193, "eval_logps/chosen": -142.16831970214844, "eval_logps/rejected": -165.29055786132812, "eval_loss": 0.6395566463470459, "eval_rewards/accuracies": 0.6317379474639893, "eval_rewards/chosen": -0.834564208984375, "eval_rewards/margins": 0.18654029071331024, "eval_rewards/rejected": -1.0211044549942017, "eval_runtime": 384.1959, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 14200 }, { "epoch": 2.4483115093039283, "grad_norm": 23.573081970214844, "learning_rate": 9.953559153499509e-09, "logits/chosen": -2.0430827140808105, "logits/rejected": -2.0197272300720215, "logps/chosen": -156.13330078125, "logps/rejected": -186.78826904296875, "loss": 0.6046, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0321298837661743, "rewards/margins": 0.3123827874660492, "rewards/rejected": -1.344512701034546, "step": 14210 }, { "epoch": 2.4500344589937972, "grad_norm": 18.056289672851562, "learning_rate": 9.893618657682712e-09, "logits/chosen": -2.0957090854644775, "logits/rejected": -2.0685744285583496, "logps/chosen": -157.59153747558594, "logps/rejected": -186.17889404296875, "loss": 0.5984, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9941093325614929, "rewards/margins": 0.3160152733325958, "rewards/rejected": -1.3101245164871216, "step": 14220 }, { "epoch": 2.451757408683666, "grad_norm": 13.619145393371582, "learning_rate": 9.833839365595686e-09, "logits/chosen": -1.9443687200546265, "logits/rejected": -1.9069139957427979, "logps/chosen": -151.8792266845703, "logps/rejected": -189.014404296875, "loss": 0.5697, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9614906311035156, "rewards/margins": 0.40907591581344604, "rewards/rejected": -1.370566725730896, "step": 14230 }, { "epoch": 2.4534803583735356, "grad_norm": 14.615938186645508, "learning_rate": 9.774221517515563e-09, "logits/chosen": -1.9454753398895264, "logits/rejected": -1.9278638362884521, "logps/chosen": -150.30819702148438, "logps/rejected": -193.07888793945312, "loss": 0.5767, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9628839492797852, "rewards/margins": 0.42012038826942444, "rewards/rejected": -1.3830043077468872, "step": 14240 }, { "epoch": 2.4552033080634046, "grad_norm": 16.2730712890625, "learning_rate": 9.71476535307047e-09, "logits/chosen": -1.93965744972229, "logits/rejected": -1.9185863733291626, "logps/chosen": -158.58848571777344, "logps/rejected": -184.422607421875, "loss": 0.612, "rewards/accuracies": 0.65625, "rewards/chosen": -1.036377191543579, "rewards/margins": 0.2775997221469879, "rewards/rejected": -1.3139768838882446, "step": 14250 }, { "epoch": 2.4569262577532736, "grad_norm": 24.47321128845215, "learning_rate": 9.65547111123875e-09, "logits/chosen": -2.0460410118103027, "logits/rejected": -2.000666379928589, "logps/chosen": -158.0436553955078, "logps/rejected": -183.562744140625, "loss": 0.6094, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.010170340538025, "rewards/margins": 0.324282705783844, "rewards/rejected": -1.3344532251358032, "step": 14260 }, { "epoch": 2.4586492074431425, "grad_norm": 13.774347305297852, "learning_rate": 9.596339030347906e-09, "logits/chosen": -2.011545181274414, "logits/rejected": -1.9771140813827515, "logps/chosen": -151.81991577148438, "logps/rejected": -189.76376342773438, "loss": 0.5683, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9760403633117676, "rewards/margins": 0.41053280234336853, "rewards/rejected": -1.3865731954574585, "step": 14270 }, { "epoch": 2.460372157133012, "grad_norm": 17.94484519958496, "learning_rate": 9.537369348073598e-09, "logits/chosen": -1.9080841541290283, "logits/rejected": -1.8955433368682861, "logps/chosen": -155.3813934326172, "logps/rejected": -180.8621826171875, "loss": 0.6406, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0353655815124512, "rewards/margins": 0.24347825348377228, "rewards/rejected": -1.278843641281128, "step": 14280 }, { "epoch": 2.462095106822881, "grad_norm": 15.351448059082031, "learning_rate": 9.478562301438809e-09, "logits/chosen": -1.9359334707260132, "logits/rejected": -1.903080940246582, "logps/chosen": -165.85658264160156, "logps/rejected": -189.58993530273438, "loss": 0.6116, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0559364557266235, "rewards/margins": 0.3225387632846832, "rewards/rejected": -1.3784751892089844, "step": 14290 }, { "epoch": 2.46381805651275, "grad_norm": 18.01544189453125, "learning_rate": 9.419918126812748e-09, "logits/chosen": -2.0868797302246094, "logits/rejected": -2.063182830810547, "logps/chosen": -143.3059844970703, "logps/rejected": -195.7191925048828, "loss": 0.5303, "rewards/accuracies": 0.75, "rewards/chosen": -0.8930876851081848, "rewards/margins": 0.4974890351295471, "rewards/rejected": -1.3905766010284424, "step": 14300 }, { "epoch": 2.46381805651275, "eval_logits/chosen": -2.120243787765503, "eval_logits/rejected": -2.1079373359680176, "eval_logps/chosen": -143.3938751220703, "eval_logps/rejected": -166.7382049560547, "eval_loss": 0.639214038848877, "eval_rewards/accuracies": 0.6312732100486755, "eval_rewards/chosen": -0.8468197584152222, "eval_rewards/margins": 0.18876135349273682, "eval_rewards/rejected": -1.0355809926986694, "eval_runtime": 384.6489, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 14300 }, { "epoch": 2.465541006202619, "grad_norm": 17.958723068237305, "learning_rate": 9.361437059910055e-09, "logits/chosen": -1.9490585327148438, "logits/rejected": -1.918474555015564, "logps/chosen": -152.93356323242188, "logps/rejected": -181.9292755126953, "loss": 0.602, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9822160005569458, "rewards/margins": 0.3143574595451355, "rewards/rejected": -1.2965734004974365, "step": 14310 }, { "epoch": 2.467263955892488, "grad_norm": 14.172632217407227, "learning_rate": 9.303119335789705e-09, "logits/chosen": -1.9744739532470703, "logits/rejected": -1.9451096057891846, "logps/chosen": -142.07589721679688, "logps/rejected": -181.09783935546875, "loss": 0.5662, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8652330636978149, "rewards/margins": 0.39316755533218384, "rewards/rejected": -1.258400559425354, "step": 14320 }, { "epoch": 2.468986905582357, "grad_norm": 20.68508529663086, "learning_rate": 9.244965188854186e-09, "logits/chosen": -2.1301205158233643, "logits/rejected": -2.0914273262023926, "logps/chosen": -156.94268798828125, "logps/rejected": -209.26144409179688, "loss": 0.5365, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0194352865219116, "rewards/margins": 0.5314199924468994, "rewards/rejected": -1.5508553981781006, "step": 14330 }, { "epoch": 2.470709855272226, "grad_norm": 16.806840896606445, "learning_rate": 9.186974852848467e-09, "logits/chosen": -1.9855438470840454, "logits/rejected": -1.9752496480941772, "logps/chosen": -154.06163024902344, "logps/rejected": -197.087646484375, "loss": 0.5689, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9917511940002441, "rewards/margins": 0.41639596223831177, "rewards/rejected": -1.4081472158432007, "step": 14340 }, { "epoch": 2.472432804962095, "grad_norm": 15.451340675354004, "learning_rate": 9.129148560859102e-09, "logits/chosen": -2.0121288299560547, "logits/rejected": -1.9863446950912476, "logps/chosen": -153.16099548339844, "logps/rejected": -187.49951171875, "loss": 0.595, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9934865832328796, "rewards/margins": 0.33748453855514526, "rewards/rejected": -1.330971121788025, "step": 14350 }, { "epoch": 2.474155754651964, "grad_norm": 19.930023193359375, "learning_rate": 9.0714865453133e-09, "logits/chosen": -2.0086300373077393, "logits/rejected": -1.9849426746368408, "logps/chosen": -157.56985473632812, "logps/rejected": -183.5955810546875, "loss": 0.6101, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.00942063331604, "rewards/margins": 0.2993680238723755, "rewards/rejected": -1.308788537979126, "step": 14360 }, { "epoch": 2.475878704341833, "grad_norm": 22.79810333251953, "learning_rate": 9.013989037977977e-09, "logits/chosen": -1.9906909465789795, "logits/rejected": -1.9567381143569946, "logps/chosen": -158.33157348632812, "logps/rejected": -186.85699462890625, "loss": 0.6028, "rewards/accuracies": 0.65625, "rewards/chosen": -1.029017686843872, "rewards/margins": 0.3354260325431824, "rewards/rejected": -1.3644436597824097, "step": 14370 }, { "epoch": 2.4776016540317025, "grad_norm": 18.240234375, "learning_rate": 8.956656269958812e-09, "logits/chosen": -2.113480567932129, "logits/rejected": -2.0854239463806152, "logps/chosen": -151.90036010742188, "logps/rejected": -186.75308227539062, "loss": 0.578, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9992541074752808, "rewards/margins": 0.3485099971294403, "rewards/rejected": -1.347764253616333, "step": 14380 }, { "epoch": 2.4793246037215715, "grad_norm": 14.757161140441895, "learning_rate": 8.899488471699312e-09, "logits/chosen": -1.9855690002441406, "logits/rejected": -1.957798719406128, "logps/chosen": -150.4571075439453, "logps/rejected": -186.6510772705078, "loss": 0.5863, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9845079183578491, "rewards/margins": 0.3711647391319275, "rewards/rejected": -1.3556725978851318, "step": 14390 }, { "epoch": 2.4810475534114405, "grad_norm": 20.438915252685547, "learning_rate": 8.842485872979944e-09, "logits/chosen": -2.040426731109619, "logits/rejected": -2.006819248199463, "logps/chosen": -155.3217315673828, "logps/rejected": -183.2150115966797, "loss": 0.5998, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9849168658256531, "rewards/margins": 0.3500329554080963, "rewards/rejected": -1.3349497318267822, "step": 14400 }, { "epoch": 2.4810475534114405, "eval_logits/chosen": -2.116072416305542, "eval_logits/rejected": -2.1038036346435547, "eval_logps/chosen": -144.01414489746094, "eval_logps/rejected": -167.4716033935547, "eval_loss": 0.6390359401702881, "eval_rewards/accuracies": 0.6349906921386719, "eval_rewards/chosen": -0.8530225157737732, "eval_rewards/margins": 0.18989242613315582, "eval_rewards/rejected": -1.0429149866104126, "eval_runtime": 384.8846, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 14400 }, { "epoch": 2.4827705031013094, "grad_norm": 14.78596305847168, "learning_rate": 8.785648702917164e-09, "logits/chosen": -2.0402984619140625, "logits/rejected": -2.016493320465088, "logps/chosen": -146.95201110839844, "logps/rejected": -187.06517028808594, "loss": 0.5708, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9579975008964539, "rewards/margins": 0.3841494023799896, "rewards/rejected": -1.3421467542648315, "step": 14410 }, { "epoch": 2.4844934527911784, "grad_norm": 17.67999839782715, "learning_rate": 8.728977189962484e-09, "logits/chosen": -2.0537607669830322, "logits/rejected": -2.028657913208008, "logps/chosen": -157.0430908203125, "logps/rejected": -185.3281707763672, "loss": 0.6275, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0535532236099243, "rewards/margins": 0.2764922082424164, "rewards/rejected": -1.330045461654663, "step": 14420 }, { "epoch": 2.4862164024810474, "grad_norm": 17.697603225708008, "learning_rate": 8.672471561901563e-09, "logits/chosen": -1.9936397075653076, "logits/rejected": -1.9668951034545898, "logps/chosen": -149.4518280029297, "logps/rejected": -186.0017852783203, "loss": 0.5786, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9713097810745239, "rewards/margins": 0.381035715341568, "rewards/rejected": -1.3523452281951904, "step": 14430 }, { "epoch": 2.4879393521709168, "grad_norm": 17.021329879760742, "learning_rate": 8.616132045853341e-09, "logits/chosen": -1.9929511547088623, "logits/rejected": -1.9607025384902954, "logps/chosen": -146.68467712402344, "logps/rejected": -190.72293090820312, "loss": 0.5497, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9224915504455566, "rewards/margins": 0.4398232400417328, "rewards/rejected": -1.3623147010803223, "step": 14440 }, { "epoch": 2.4896623018607857, "grad_norm": 17.291061401367188, "learning_rate": 8.559958868269058e-09, "logits/chosen": -1.9776551723480225, "logits/rejected": -1.9558746814727783, "logps/chosen": -160.75479125976562, "logps/rejected": -186.2601776123047, "loss": 0.6064, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0511369705200195, "rewards/margins": 0.28932785987854004, "rewards/rejected": -1.3404648303985596, "step": 14450 }, { "epoch": 2.4913852515506547, "grad_norm": 13.858305931091309, "learning_rate": 8.50395225493138e-09, "logits/chosen": -1.9648939371109009, "logits/rejected": -1.9434897899627686, "logps/chosen": -158.81265258789062, "logps/rejected": -192.18408203125, "loss": 0.597, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0494775772094727, "rewards/margins": 0.3382503390312195, "rewards/rejected": -1.387727975845337, "step": 14460 }, { "epoch": 2.4931082012405237, "grad_norm": 15.126280784606934, "learning_rate": 8.448112430953502e-09, "logits/chosen": -2.1291208267211914, "logits/rejected": -2.08988881111145, "logps/chosen": -158.90528869628906, "logps/rejected": -188.93397521972656, "loss": 0.5766, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9993063807487488, "rewards/margins": 0.38702961802482605, "rewards/rejected": -1.386336088180542, "step": 14470 }, { "epoch": 2.4948311509303926, "grad_norm": 16.606204986572266, "learning_rate": 8.392439620778197e-09, "logits/chosen": -2.01015305519104, "logits/rejected": -2.000311851501465, "logps/chosen": -157.5637664794922, "logps/rejected": -195.15850830078125, "loss": 0.5788, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0419816970825195, "rewards/margins": 0.35848504304885864, "rewards/rejected": -1.4004666805267334, "step": 14480 }, { "epoch": 2.496554100620262, "grad_norm": 16.415185928344727, "learning_rate": 8.336934048176935e-09, "logits/chosen": -1.9853851795196533, "logits/rejected": -1.9662622213363647, "logps/chosen": -149.88446044921875, "logps/rejected": -185.765869140625, "loss": 0.5849, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9933239817619324, "rewards/margins": 0.34891748428344727, "rewards/rejected": -1.3422415256500244, "step": 14490 }, { "epoch": 2.498277050310131, "grad_norm": 14.806930541992188, "learning_rate": 8.281595936249031e-09, "logits/chosen": -1.9811630249023438, "logits/rejected": -1.9375073909759521, "logps/chosen": -152.71762084960938, "logps/rejected": -190.4132537841797, "loss": 0.5688, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9578202962875366, "rewards/margins": 0.4063640534877777, "rewards/rejected": -1.3641846179962158, "step": 14500 }, { "epoch": 2.498277050310131, "eval_logits/chosen": -2.113724946975708, "eval_logits/rejected": -2.1013731956481934, "eval_logps/chosen": -144.60885620117188, "eval_logps/rejected": -168.23812866210938, "eval_loss": 0.6387121677398682, "eval_rewards/accuracies": 0.6338289976119995, "eval_rewards/chosen": -0.8589696884155273, "eval_rewards/margins": 0.1916102170944214, "eval_rewards/rejected": -1.0505800247192383, "eval_runtime": 384.7364, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 14500 }, { "epoch": 2.5, "grad_norm": 21.338985443115234, "learning_rate": 8.226425507420687e-09, "logits/chosen": -2.025588035583496, "logits/rejected": -2.0008339881896973, "logps/chosen": -164.10328674316406, "logps/rejected": -182.0420379638672, "loss": 0.6536, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0853534936904907, "rewards/margins": 0.21822969615459442, "rewards/rejected": -1.3035831451416016, "step": 14510 }, { "epoch": 2.501722949689869, "grad_norm": 18.622758865356445, "learning_rate": 8.171422983444116e-09, "logits/chosen": -2.0074095726013184, "logits/rejected": -1.975572943687439, "logps/chosen": -162.18734741210938, "logps/rejected": -195.61839294433594, "loss": 0.6055, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0662273168563843, "rewards/margins": 0.33798521757125854, "rewards/rejected": -1.4042125940322876, "step": 14520 }, { "epoch": 2.503445899379738, "grad_norm": 24.270904541015625, "learning_rate": 8.11658858539664e-09, "logits/chosen": -2.01389741897583, "logits/rejected": -1.985365629196167, "logps/chosen": -164.20822143554688, "logps/rejected": -198.39657592773438, "loss": 0.5903, "rewards/accuracies": 0.6875, "rewards/chosen": -1.065456748008728, "rewards/margins": 0.36622780561447144, "rewards/rejected": -1.4316847324371338, "step": 14530 }, { "epoch": 2.505168849069607, "grad_norm": 13.432212829589844, "learning_rate": 8.061922533679838e-09, "logits/chosen": -1.9972903728485107, "logits/rejected": -1.9680439233779907, "logps/chosen": -153.31585693359375, "logps/rejected": -190.34976196289062, "loss": 0.5788, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9934379458427429, "rewards/margins": 0.38658642768859863, "rewards/rejected": -1.3800244331359863, "step": 14540 }, { "epoch": 2.5068917987594763, "grad_norm": 12.799301147460938, "learning_rate": 8.007425048018652e-09, "logits/chosen": -1.9891287088394165, "logits/rejected": -1.9558525085449219, "logps/chosen": -151.38922119140625, "logps/rejected": -182.28224182128906, "loss": 0.5924, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9464647173881531, "rewards/margins": 0.3441544473171234, "rewards/rejected": -1.2906192541122437, "step": 14550 }, { "epoch": 2.5086147484493453, "grad_norm": 17.504226684570312, "learning_rate": 7.953096347460442e-09, "logits/chosen": -1.986769437789917, "logits/rejected": -1.9548423290252686, "logps/chosen": -154.15313720703125, "logps/rejected": -198.58419799804688, "loss": 0.5638, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0116616487503052, "rewards/margins": 0.42689648270606995, "rewards/rejected": -1.4385582208633423, "step": 14560 }, { "epoch": 2.5103376981392143, "grad_norm": 16.162752151489258, "learning_rate": 7.898936650374177e-09, "logits/chosen": -1.9076499938964844, "logits/rejected": -1.9039417505264282, "logps/chosen": -151.77023315429688, "logps/rejected": -183.071533203125, "loss": 0.6153, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9947423934936523, "rewards/margins": 0.2983546853065491, "rewards/rejected": -1.2930970191955566, "step": 14570 }, { "epoch": 2.5120606478290832, "grad_norm": 24.360637664794922, "learning_rate": 7.844946174449552e-09, "logits/chosen": -2.031590461730957, "logits/rejected": -2.010911464691162, "logps/chosen": -147.8909149169922, "logps/rejected": -181.05914306640625, "loss": 0.6036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9838373064994812, "rewards/margins": 0.3161866068840027, "rewards/rejected": -1.3000237941741943, "step": 14580 }, { "epoch": 2.5137835975189526, "grad_norm": 17.314136505126953, "learning_rate": 7.791125136696053e-09, "logits/chosen": -1.965696096420288, "logits/rejected": -1.9488036632537842, "logps/chosen": -149.4911651611328, "logps/rejected": -180.62423706054688, "loss": 0.6136, "rewards/accuracies": 0.625, "rewards/chosen": -0.970258355140686, "rewards/margins": 0.3082885146141052, "rewards/rejected": -1.2785468101501465, "step": 14590 }, { "epoch": 2.5155065472088216, "grad_norm": 18.38401222229004, "learning_rate": 7.737473753442175e-09, "logits/chosen": -1.9814138412475586, "logits/rejected": -1.9383974075317383, "logps/chosen": -154.36642456054688, "logps/rejected": -190.03427124023438, "loss": 0.5601, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9964590072631836, "rewards/margins": 0.3978864550590515, "rewards/rejected": -1.3943455219268799, "step": 14600 }, { "epoch": 2.5155065472088216, "eval_logits/chosen": -2.115797996520996, "eval_logits/rejected": -2.1035284996032715, "eval_logps/chosen": -143.91217041015625, "eval_logps/rejected": -167.47149658203125, "eval_loss": 0.6386252641677856, "eval_rewards/accuracies": 0.634061336517334, "eval_rewards/chosen": -0.8520029187202454, "eval_rewards/margins": 0.19091065227985382, "eval_rewards/rejected": -1.042913556098938, "eval_runtime": 384.709, "eval_samples_per_second": 11.188, "eval_steps_per_second": 1.398, "step": 14600 }, { "epoch": 2.5172294968986906, "grad_norm": 23.114534378051758, "learning_rate": 7.683992240334442e-09, "logits/chosen": -1.9740896224975586, "logits/rejected": -1.938331961631775, "logps/chosen": -157.10739135742188, "logps/rejected": -178.64077758789062, "loss": 0.6129, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.007470726966858, "rewards/margins": 0.2828449308872223, "rewards/rejected": -1.2903155088424683, "step": 14610 }, { "epoch": 2.5189524465885595, "grad_norm": 19.145803451538086, "learning_rate": 7.630680812336666e-09, "logits/chosen": -1.93508780002594, "logits/rejected": -1.9264802932739258, "logps/chosen": -154.36521911621094, "logps/rejected": -190.16876220703125, "loss": 0.6109, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0316916704177856, "rewards/margins": 0.33859580755233765, "rewards/rejected": -1.3702874183654785, "step": 14620 }, { "epoch": 2.5206753962784285, "grad_norm": 16.07571029663086, "learning_rate": 7.577539683728963e-09, "logits/chosen": -1.9057121276855469, "logits/rejected": -1.8875715732574463, "logps/chosen": -153.3982391357422, "logps/rejected": -183.86709594726562, "loss": 0.609, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0031397342681885, "rewards/margins": 0.3077995181083679, "rewards/rejected": -1.3109391927719116, "step": 14630 }, { "epoch": 2.5223983459682975, "grad_norm": 15.951969146728516, "learning_rate": 7.524569068106984e-09, "logits/chosen": -1.9527273178100586, "logits/rejected": -1.9341843128204346, "logps/chosen": -149.197509765625, "logps/rejected": -186.84393310546875, "loss": 0.5871, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9737323522567749, "rewards/margins": 0.3649931848049164, "rewards/rejected": -1.3387255668640137, "step": 14640 }, { "epoch": 2.524121295658167, "grad_norm": 29.141252517700195, "learning_rate": 7.471769178381032e-09, "logits/chosen": -2.019270420074463, "logits/rejected": -1.9985889196395874, "logps/chosen": -163.63470458984375, "logps/rejected": -177.01535034179688, "loss": 0.6798, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0939924716949463, "rewards/margins": 0.1794067770242691, "rewards/rejected": -1.2733992338180542, "step": 14650 }, { "epoch": 2.525844245348036, "grad_norm": 15.75715446472168, "learning_rate": 7.419140226775117e-09, "logits/chosen": -1.9905866384506226, "logits/rejected": -1.9431712627410889, "logps/chosen": -147.61099243164062, "logps/rejected": -188.6812744140625, "loss": 0.5509, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9468952417373657, "rewards/margins": 0.4186995029449463, "rewards/rejected": -1.365594744682312, "step": 14660 }, { "epoch": 2.527567195037905, "grad_norm": 18.846126556396484, "learning_rate": 7.366682424826259e-09, "logits/chosen": -1.9177242517471313, "logits/rejected": -1.896928071975708, "logps/chosen": -146.061767578125, "logps/rejected": -183.46351623535156, "loss": 0.5812, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9555428624153137, "rewards/margins": 0.36532577872276306, "rewards/rejected": -1.320868730545044, "step": 14670 }, { "epoch": 2.529290144727774, "grad_norm": 20.36220932006836, "learning_rate": 7.314395983383548e-09, "logits/chosen": -2.0085368156433105, "logits/rejected": -1.9800879955291748, "logps/chosen": -144.07467651367188, "logps/rejected": -178.85032653808594, "loss": 0.5955, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9582074880599976, "rewards/margins": 0.3068663477897644, "rewards/rejected": -1.2650738954544067, "step": 14680 }, { "epoch": 2.531013094417643, "grad_norm": 17.909120559692383, "learning_rate": 7.262281112607266e-09, "logits/chosen": -2.003288507461548, "logits/rejected": -1.9820172786712646, "logps/chosen": -153.15850830078125, "logps/rejected": -196.28207397460938, "loss": 0.5759, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0177866220474243, "rewards/margins": 0.4071931838989258, "rewards/rejected": -1.42497980594635, "step": 14690 }, { "epoch": 2.532736044107512, "grad_norm": 14.401308059692383, "learning_rate": 7.210338021968099e-09, "logits/chosen": -2.0783095359802246, "logits/rejected": -2.05433988571167, "logps/chosen": -161.2674560546875, "logps/rejected": -207.3699493408203, "loss": 0.5694, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.065330147743225, "rewards/margins": 0.44841083884239197, "rewards/rejected": -1.5137410163879395, "step": 14700 }, { "epoch": 2.532736044107512, "eval_logits/chosen": -2.114840507507324, "eval_logits/rejected": -2.10246205329895, "eval_logps/chosen": -144.20339965820312, "eval_logps/rejected": -167.83787536621094, "eval_loss": 0.6385432481765747, "eval_rewards/accuracies": 0.633596658706665, "eval_rewards/chosen": -0.8549151420593262, "eval_rewards/margins": 0.19166231155395508, "eval_rewards/rejected": -1.0465774536132812, "eval_runtime": 384.5031, "eval_samples_per_second": 11.194, "eval_steps_per_second": 1.399, "step": 14700 }, { "epoch": 2.534458993797381, "grad_norm": 19.330596923828125, "learning_rate": 7.158566920246306e-09, "logits/chosen": -2.004446506500244, "logits/rejected": -1.9851405620574951, "logps/chosen": -151.1990966796875, "logps/rejected": -186.05673217773438, "loss": 0.5836, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9772917032241821, "rewards/margins": 0.3510001599788666, "rewards/rejected": -1.3282917737960815, "step": 14710 }, { "epoch": 2.53618194348725, "grad_norm": 18.70694351196289, "learning_rate": 7.1069680155308455e-09, "logits/chosen": -1.9518972635269165, "logits/rejected": -1.9193109273910522, "logps/chosen": -158.89915466308594, "logps/rejected": -191.0203399658203, "loss": 0.5923, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0122705698013306, "rewards/margins": 0.36014801263809204, "rewards/rejected": -1.3724186420440674, "step": 14720 }, { "epoch": 2.537904893177119, "grad_norm": 18.281707763671875, "learning_rate": 7.055541515218505e-09, "logits/chosen": -2.0466084480285645, "logits/rejected": -2.017833948135376, "logps/chosen": -150.62435913085938, "logps/rejected": -186.28201293945312, "loss": 0.5952, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9747387170791626, "rewards/margins": 0.38200193643569946, "rewards/rejected": -1.3567407131195068, "step": 14730 }, { "epoch": 2.539627842866988, "grad_norm": 18.14553451538086, "learning_rate": 7.004287626013167e-09, "logits/chosen": -1.951456069946289, "logits/rejected": -1.9448543787002563, "logps/chosen": -162.8822479248047, "logps/rejected": -198.805419921875, "loss": 0.5896, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0774930715560913, "rewards/margins": 0.35956871509552, "rewards/rejected": -1.4370617866516113, "step": 14740 }, { "epoch": 2.5413507925568575, "grad_norm": 18.924482345581055, "learning_rate": 6.9532065539248785e-09, "logits/chosen": -1.9971873760223389, "logits/rejected": -1.9622485637664795, "logps/chosen": -150.1236114501953, "logps/rejected": -181.3514404296875, "loss": 0.5857, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.973942756652832, "rewards/margins": 0.3488963544368744, "rewards/rejected": -1.3228389024734497, "step": 14750 }, { "epoch": 2.5430737422467264, "grad_norm": 15.385965347290039, "learning_rate": 6.902298504269089e-09, "logits/chosen": -2.018479585647583, "logits/rejected": -1.9901787042617798, "logps/chosen": -148.24203491210938, "logps/rejected": -187.40792846679688, "loss": 0.5726, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9536944627761841, "rewards/margins": 0.38367682695388794, "rewards/rejected": -1.3373712301254272, "step": 14760 }, { "epoch": 2.5447966919365954, "grad_norm": 16.74427032470703, "learning_rate": 6.851563681665778e-09, "logits/chosen": -2.018254280090332, "logits/rejected": -1.9969749450683594, "logps/chosen": -159.3584747314453, "logps/rejected": -194.02919006347656, "loss": 0.5971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0322730541229248, "rewards/margins": 0.3494979739189148, "rewards/rejected": -1.3817710876464844, "step": 14770 }, { "epoch": 2.5465196416264644, "grad_norm": 15.392023086547852, "learning_rate": 6.801002290038687e-09, "logits/chosen": -1.9742482900619507, "logits/rejected": -1.9616191387176514, "logps/chosen": -146.07852172851562, "logps/rejected": -176.225830078125, "loss": 0.6166, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9681469798088074, "rewards/margins": 0.3012215495109558, "rewards/rejected": -1.2693684101104736, "step": 14780 }, { "epoch": 2.548242591316334, "grad_norm": 15.558985710144043, "learning_rate": 6.750614532614446e-09, "logits/chosen": -2.0351271629333496, "logits/rejected": -2.0066030025482178, "logps/chosen": -165.19740295410156, "logps/rejected": -192.03024291992188, "loss": 0.6226, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.117128610610962, "rewards/margins": 0.28926485776901245, "rewards/rejected": -1.4063934087753296, "step": 14790 }, { "epoch": 2.5499655410062028, "grad_norm": 20.31756019592285, "learning_rate": 6.7004006119217695e-09, "logits/chosen": -1.974603295326233, "logits/rejected": -1.9604946374893188, "logps/chosen": -159.23159790039062, "logps/rejected": -201.53890991210938, "loss": 0.5762, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.040924072265625, "rewards/margins": 0.40332651138305664, "rewards/rejected": -1.4442507028579712, "step": 14800 }, { "epoch": 2.5499655410062028, "eval_logits/chosen": -2.11506986618042, "eval_logits/rejected": -2.1026973724365234, "eval_logps/chosen": -143.8544158935547, "eval_logps/rejected": -167.41026306152344, "eval_loss": 0.6387512683868408, "eval_rewards/accuracies": 0.6326673030853271, "eval_rewards/chosen": -0.8514252305030823, "eval_rewards/margins": 0.19087636470794678, "eval_rewards/rejected": -1.0423015356063843, "eval_runtime": 384.6653, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 14800 }, { "epoch": 2.5516884906960717, "grad_norm": 21.273597717285156, "learning_rate": 6.650360729790677e-09, "logits/chosen": -2.015728712081909, "logits/rejected": -1.9715601205825806, "logps/chosen": -170.3062744140625, "logps/rejected": -190.282470703125, "loss": 0.622, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1154944896697998, "rewards/margins": 0.29905885457992554, "rewards/rejected": -1.4145534038543701, "step": 14810 }, { "epoch": 2.5534114403859407, "grad_norm": 23.828311920166016, "learning_rate": 6.600495087351654e-09, "logits/chosen": -2.173076629638672, "logits/rejected": -2.1389033794403076, "logps/chosen": -155.05343627929688, "logps/rejected": -191.4359130859375, "loss": 0.5759, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0160174369812012, "rewards/margins": 0.3760719895362854, "rewards/rejected": -1.3920894861221313, "step": 14820 }, { "epoch": 2.5551343900758097, "grad_norm": 15.176013946533203, "learning_rate": 6.550803885034833e-09, "logits/chosen": -1.9943411350250244, "logits/rejected": -1.9672164916992188, "logps/chosen": -157.0222930908203, "logps/rejected": -187.53219604492188, "loss": 0.6033, "rewards/accuracies": 0.65625, "rewards/chosen": -1.021923303604126, "rewards/margins": 0.34125617146492004, "rewards/rejected": -1.3631794452667236, "step": 14830 }, { "epoch": 2.5568573397656786, "grad_norm": 15.307422637939453, "learning_rate": 6.5012873225691875e-09, "logits/chosen": -2.034871816635132, "logits/rejected": -2.0009121894836426, "logps/chosen": -158.11953735351562, "logps/rejected": -199.80873107910156, "loss": 0.5631, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0207037925720215, "rewards/margins": 0.4128735661506653, "rewards/rejected": -1.433577299118042, "step": 14840 }, { "epoch": 2.558580289455548, "grad_norm": 15.78002643585205, "learning_rate": 6.451945598981784e-09, "logits/chosen": -1.9768062829971313, "logits/rejected": -1.9464294910430908, "logps/chosen": -163.91943359375, "logps/rejected": -197.85679626464844, "loss": 0.5964, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.106640100479126, "rewards/margins": 0.33005601167678833, "rewards/rejected": -1.4366960525512695, "step": 14850 }, { "epoch": 2.560303239145417, "grad_norm": 15.69031047821045, "learning_rate": 6.4027789125969286e-09, "logits/chosen": -1.9618046283721924, "logits/rejected": -1.9428672790527344, "logps/chosen": -148.9631805419922, "logps/rejected": -190.20620727539062, "loss": 0.5801, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9464788436889648, "rewards/margins": 0.43374577164649963, "rewards/rejected": -1.380224585533142, "step": 14860 }, { "epoch": 2.562026188835286, "grad_norm": 16.080961227416992, "learning_rate": 6.353787461035354e-09, "logits/chosen": -2.0421204566955566, "logits/rejected": -2.010331630706787, "logps/chosen": -154.61451721191406, "logps/rejected": -184.98825073242188, "loss": 0.6063, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9666212797164917, "rewards/margins": 0.35497865080833435, "rewards/rejected": -1.3215998411178589, "step": 14870 }, { "epoch": 2.563749138525155, "grad_norm": 16.047571182250977, "learning_rate": 6.304971441213469e-09, "logits/chosen": -1.9380992650985718, "logits/rejected": -1.9198198318481445, "logps/chosen": -150.46047973632812, "logps/rejected": -191.18447875976562, "loss": 0.5721, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9723681211471558, "rewards/margins": 0.4024737477302551, "rewards/rejected": -1.3748419284820557, "step": 14880 }, { "epoch": 2.5654720882150244, "grad_norm": 16.142362594604492, "learning_rate": 6.256331049342572e-09, "logits/chosen": -1.9472945928573608, "logits/rejected": -1.9313589334487915, "logps/chosen": -161.94583129882812, "logps/rejected": -188.8350067138672, "loss": 0.6247, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0478771924972534, "rewards/margins": 0.26567643880844116, "rewards/rejected": -1.3135535717010498, "step": 14890 }, { "epoch": 2.5671950379048933, "grad_norm": 14.584959983825684, "learning_rate": 6.207866480928003e-09, "logits/chosen": -1.8694270849227905, "logits/rejected": -1.8306652307510376, "logps/chosen": -151.4298553466797, "logps/rejected": -186.15223693847656, "loss": 0.5944, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9848653674125671, "rewards/margins": 0.36345672607421875, "rewards/rejected": -1.3483220338821411, "step": 14900 }, { "epoch": 2.5671950379048933, "eval_logits/chosen": -2.115122079849243, "eval_logits/rejected": -2.1027910709381104, "eval_logps/chosen": -143.68251037597656, "eval_logps/rejected": -167.210205078125, "eval_loss": 0.6387660503387451, "eval_rewards/accuracies": 0.6322026252746582, "eval_rewards/chosen": -0.8497059345245361, "eval_rewards/margins": 0.1905948668718338, "eval_rewards/rejected": -1.0403008460998535, "eval_runtime": 384.4846, "eval_samples_per_second": 11.194, "eval_steps_per_second": 1.399, "step": 14900 }, { "epoch": 2.5689179875947623, "grad_norm": 16.054061889648438, "learning_rate": 6.1595779307684334e-09, "logits/chosen": -1.9949613809585571, "logits/rejected": -1.974852204322815, "logps/chosen": -143.7773895263672, "logps/rejected": -183.30677795410156, "loss": 0.5804, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9319849014282227, "rewards/margins": 0.3644837439060211, "rewards/rejected": -1.2964686155319214, "step": 14910 }, { "epoch": 2.5706409372846313, "grad_norm": 16.666128158569336, "learning_rate": 6.11146559295504e-09, "logits/chosen": -1.9557081460952759, "logits/rejected": -1.9361931085586548, "logps/chosen": -152.83848571777344, "logps/rejected": -200.3578338623047, "loss": 0.5609, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0003231763839722, "rewards/margins": 0.4479997158050537, "rewards/rejected": -1.4483230113983154, "step": 14920 }, { "epoch": 2.5723638869745002, "grad_norm": 20.001590728759766, "learning_rate": 6.063529660870709e-09, "logits/chosen": -2.102743625640869, "logits/rejected": -2.077289581298828, "logps/chosen": -148.93499755859375, "logps/rejected": -193.69276428222656, "loss": 0.5607, "rewards/accuracies": 0.75, "rewards/chosen": -0.9604331851005554, "rewards/margins": 0.44644695520401, "rewards/rejected": -1.4068801403045654, "step": 14930 }, { "epoch": 2.574086836664369, "grad_norm": 19.786619186401367, "learning_rate": 6.015770327189285e-09, "logits/chosen": -2.0179364681243896, "logits/rejected": -1.9870408773422241, "logps/chosen": -147.46142578125, "logps/rejected": -182.6336669921875, "loss": 0.573, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9480903744697571, "rewards/margins": 0.3807341456413269, "rewards/rejected": -1.328824520111084, "step": 14940 }, { "epoch": 2.575809786354238, "grad_norm": 25.684480667114258, "learning_rate": 5.968187783874806e-09, "logits/chosen": -2.0968451499938965, "logits/rejected": -2.0597052574157715, "logps/chosen": -154.89224243164062, "logps/rejected": -180.0705108642578, "loss": 0.6134, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.990331768989563, "rewards/margins": 0.3064814507961273, "rewards/rejected": -1.2968132495880127, "step": 14950 }, { "epoch": 2.5775327360441076, "grad_norm": 20.116313934326172, "learning_rate": 5.920782222180748e-09, "logits/chosen": -1.9334981441497803, "logits/rejected": -1.9067522287368774, "logps/chosen": -159.70523071289062, "logps/rejected": -191.04324340820312, "loss": 0.6043, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0388565063476562, "rewards/margins": 0.3223384618759155, "rewards/rejected": -1.3611948490142822, "step": 14960 }, { "epoch": 2.5792556857339766, "grad_norm": 21.070634841918945, "learning_rate": 5.873553832649137e-09, "logits/chosen": -2.016244888305664, "logits/rejected": -1.983957052230835, "logps/chosen": -155.7210693359375, "logps/rejected": -194.26742553710938, "loss": 0.5864, "rewards/accuracies": 0.6875, "rewards/chosen": -1.018042802810669, "rewards/margins": 0.38634276390075684, "rewards/rejected": -1.4043855667114258, "step": 14970 }, { "epoch": 2.5809786354238455, "grad_norm": 19.42366600036621, "learning_rate": 5.826502805109956e-09, "logits/chosen": -2.0328056812286377, "logits/rejected": -1.9894578456878662, "logps/chosen": -158.17327880859375, "logps/rejected": -201.35845947265625, "loss": 0.5607, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0405741930007935, "rewards/margins": 0.4552154541015625, "rewards/rejected": -1.4957895278930664, "step": 14980 }, { "epoch": 2.582701585113715, "grad_norm": 17.0192813873291, "learning_rate": 5.779629328680275e-09, "logits/chosen": -2.049226999282837, "logits/rejected": -2.029764175415039, "logps/chosen": -150.0392608642578, "logps/rejected": -194.73074340820312, "loss": 0.5634, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9864412546157837, "rewards/margins": 0.4335744380950928, "rewards/rejected": -1.4200156927108765, "step": 14990 }, { "epoch": 2.584424534803584, "grad_norm": 18.224952697753906, "learning_rate": 5.732933591763495e-09, "logits/chosen": -2.0484097003936768, "logits/rejected": -2.028733491897583, "logps/chosen": -159.0215301513672, "logps/rejected": -194.0147247314453, "loss": 0.5766, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.02946937084198, "rewards/margins": 0.37221887707710266, "rewards/rejected": -1.4016883373260498, "step": 15000 }, { "epoch": 2.584424534803584, "eval_logits/chosen": -2.1130645275115967, "eval_logits/rejected": -2.1006994247436523, "eval_logps/chosen": -143.99179077148438, "eval_logps/rejected": -167.61849975585938, "eval_loss": 0.6385829448699951, "eval_rewards/accuracies": 0.6326673030853271, "eval_rewards/chosen": -0.8527990579605103, "eval_rewards/margins": 0.191584512591362, "eval_rewards/rejected": -1.0443834066390991, "eval_runtime": 384.4361, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.399, "step": 15000 }, { "epoch": 2.586147484493453, "grad_norm": 18.351774215698242, "learning_rate": 5.686415782048643e-09, "logits/chosen": -2.069554090499878, "logits/rejected": -2.046079397201538, "logps/chosen": -160.81370544433594, "logps/rejected": -189.15988159179688, "loss": 0.6338, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0584571361541748, "rewards/margins": 0.3008044958114624, "rewards/rejected": -1.3592617511749268, "step": 15010 }, { "epoch": 2.587870434183322, "grad_norm": 20.420055389404297, "learning_rate": 5.640076086509538e-09, "logits/chosen": -1.9658933877944946, "logits/rejected": -1.9583778381347656, "logps/chosen": -149.10696411132812, "logps/rejected": -189.40797424316406, "loss": 0.5805, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9872896075248718, "rewards/margins": 0.3756560683250427, "rewards/rejected": -1.3629456758499146, "step": 15020 }, { "epoch": 2.589593383873191, "grad_norm": 19.33962631225586, "learning_rate": 5.593914691404145e-09, "logits/chosen": -1.9469581842422485, "logits/rejected": -1.9218370914459229, "logps/chosen": -159.7363739013672, "logps/rejected": -191.67098999023438, "loss": 0.6289, "rewards/accuracies": 0.6875, "rewards/chosen": -1.060520887374878, "rewards/margins": 0.3050110340118408, "rewards/rejected": -1.3655316829681396, "step": 15030 }, { "epoch": 2.59131633356306, "grad_norm": 21.10177993774414, "learning_rate": 5.547931782273718e-09, "logits/chosen": -1.9997142553329468, "logits/rejected": -1.9780254364013672, "logps/chosen": -159.2946319580078, "logps/rejected": -188.8558349609375, "loss": 0.6053, "rewards/accuracies": 0.6875, "rewards/chosen": -1.053137183189392, "rewards/margins": 0.31626269221305847, "rewards/rejected": -1.3694000244140625, "step": 15040 }, { "epoch": 2.5930392832529288, "grad_norm": 15.66610336303711, "learning_rate": 5.5021275439421365e-09, "logits/chosen": -2.016746759414673, "logits/rejected": -1.9768556356430054, "logps/chosen": -154.64114379882812, "logps/rejected": -188.32803344726562, "loss": 0.5724, "rewards/accuracies": 0.71875, "rewards/chosen": -1.003481149673462, "rewards/margins": 0.38969749212265015, "rewards/rejected": -1.3931787014007568, "step": 15050 }, { "epoch": 2.594762232942798, "grad_norm": 16.66924476623535, "learning_rate": 5.456502160515097e-09, "logits/chosen": -1.9930648803710938, "logits/rejected": -1.969142198562622, "logps/chosen": -147.95089721679688, "logps/rejected": -183.54043579101562, "loss": 0.5983, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9373061060905457, "rewards/margins": 0.35400331020355225, "rewards/rejected": -1.2913094758987427, "step": 15060 }, { "epoch": 2.596485182632667, "grad_norm": 20.8486328125, "learning_rate": 5.411055815379451e-09, "logits/chosen": -2.0663139820098877, "logits/rejected": -2.0248055458068848, "logps/chosen": -157.30691528320312, "logps/rejected": -181.81141662597656, "loss": 0.6138, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0020601749420166, "rewards/margins": 0.30118146538734436, "rewards/rejected": -1.303241491317749, "step": 15070 }, { "epoch": 2.598208132322536, "grad_norm": 23.188222885131836, "learning_rate": 5.365788691202372e-09, "logits/chosen": -2.0324554443359375, "logits/rejected": -2.002934455871582, "logps/chosen": -151.79844665527344, "logps/rejected": -191.04129028320312, "loss": 0.5807, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9871308207511902, "rewards/margins": 0.38457822799682617, "rewards/rejected": -1.3717091083526611, "step": 15080 }, { "epoch": 2.599931082012405, "grad_norm": 16.64566421508789, "learning_rate": 5.320700969930708e-09, "logits/chosen": -2.025319814682007, "logits/rejected": -1.9949405193328857, "logps/chosen": -152.0048370361328, "logps/rejected": -183.31869506835938, "loss": 0.6014, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.973040759563446, "rewards/margins": 0.32706183195114136, "rewards/rejected": -1.3001024723052979, "step": 15090 }, { "epoch": 2.6016540317022745, "grad_norm": 16.85909652709961, "learning_rate": 5.2757928327902324e-09, "logits/chosen": -1.9706827402114868, "logits/rejected": -1.9501407146453857, "logps/chosen": -150.8965301513672, "logps/rejected": -179.4077911376953, "loss": 0.6066, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9991341829299927, "rewards/margins": 0.287818044424057, "rewards/rejected": -1.286952257156372, "step": 15100 }, { "epoch": 2.6016540317022745, "eval_logits/chosen": -2.1125121116638184, "eval_logits/rejected": -2.100074052810669, "eval_logps/chosen": -144.16322326660156, "eval_logps/rejected": -167.78355407714844, "eval_loss": 0.6386841535568237, "eval_rewards/accuracies": 0.6333643198013306, "eval_rewards/chosen": -0.8545132875442505, "eval_rewards/margins": 0.1915210634469986, "eval_rewards/rejected": -1.046034336090088, "eval_runtime": 384.4813, "eval_samples_per_second": 11.194, "eval_steps_per_second": 1.399, "step": 15100 }, { "epoch": 2.6033769813921435, "grad_norm": 26.44088363647461, "learning_rate": 5.231064460284818e-09, "logits/chosen": -2.0024333000183105, "logits/rejected": -1.979744553565979, "logps/chosen": -157.02423095703125, "logps/rejected": -184.419921875, "loss": 0.6213, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0399093627929688, "rewards/margins": 0.28710994124412537, "rewards/rejected": -1.327019453048706, "step": 15110 }, { "epoch": 2.6050999310820124, "grad_norm": 17.99903678894043, "learning_rate": 5.1865160321958646e-09, "logits/chosen": -1.9820826053619385, "logits/rejected": -1.960571050643921, "logps/chosen": -164.3445587158203, "logps/rejected": -196.52163696289062, "loss": 0.5953, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.06142258644104, "rewards/margins": 0.3397645056247711, "rewards/rejected": -1.4011870622634888, "step": 15120 }, { "epoch": 2.6068228807718814, "grad_norm": 16.144819259643555, "learning_rate": 5.142147727581498e-09, "logits/chosen": -1.9235252141952515, "logits/rejected": -1.8876409530639648, "logps/chosen": -151.076171875, "logps/rejected": -184.70401000976562, "loss": 0.5875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9811315536499023, "rewards/margins": 0.3692875802516937, "rewards/rejected": -1.3504191637039185, "step": 15130 }, { "epoch": 2.6085458304617504, "grad_norm": 14.799017906188965, "learning_rate": 5.097959724775819e-09, "logits/chosen": -1.9977667331695557, "logits/rejected": -1.9723072052001953, "logps/chosen": -157.48370361328125, "logps/rejected": -198.5254364013672, "loss": 0.5708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.018140435218811, "rewards/margins": 0.4037169814109802, "rewards/rejected": -1.421857476234436, "step": 15140 }, { "epoch": 2.6102687801516193, "grad_norm": 17.75762367248535, "learning_rate": 5.053952201388234e-09, "logits/chosen": -2.1366593837738037, "logits/rejected": -2.0961055755615234, "logps/chosen": -151.1990509033203, "logps/rejected": -184.6768798828125, "loss": 0.5903, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9766638875007629, "rewards/margins": 0.3412805497646332, "rewards/rejected": -1.3179444074630737, "step": 15150 }, { "epoch": 2.6119917298414888, "grad_norm": 14.596528053283691, "learning_rate": 5.010125334302745e-09, "logits/chosen": -1.9628021717071533, "logits/rejected": -1.9478782415390015, "logps/chosen": -148.9315643310547, "logps/rejected": -196.9246826171875, "loss": 0.5379, "rewards/accuracies": 0.75, "rewards/chosen": -0.9466155767440796, "rewards/margins": 0.4711712896823883, "rewards/rejected": -1.417786955833435, "step": 15160 }, { "epoch": 2.6137146795313577, "grad_norm": 15.247135162353516, "learning_rate": 4.9664792996772285e-09, "logits/chosen": -1.959249496459961, "logits/rejected": -1.9353053569793701, "logps/chosen": -145.77493286132812, "logps/rejected": -187.5343017578125, "loss": 0.5726, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.927897572517395, "rewards/margins": 0.4094495177268982, "rewards/rejected": -1.337347149848938, "step": 15170 }, { "epoch": 2.6154376292212267, "grad_norm": 16.38957977294922, "learning_rate": 4.923014272942688e-09, "logits/chosen": -2.0254440307617188, "logits/rejected": -2.01228666305542, "logps/chosen": -160.19168090820312, "logps/rejected": -200.9775390625, "loss": 0.5876, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0433802604675293, "rewards/margins": 0.40710577368736267, "rewards/rejected": -1.4504859447479248, "step": 15180 }, { "epoch": 2.6171605789110957, "grad_norm": 24.27712059020996, "learning_rate": 4.87973042880262e-09, "logits/chosen": -1.9452524185180664, "logits/rejected": -1.9317309856414795, "logps/chosen": -154.1034393310547, "logps/rejected": -183.8282928466797, "loss": 0.6039, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0143663883209229, "rewards/margins": 0.3002598285675049, "rewards/rejected": -1.3146262168884277, "step": 15190 }, { "epoch": 2.618883528600965, "grad_norm": 15.714237213134766, "learning_rate": 4.836627941232252e-09, "logits/chosen": -2.0148210525512695, "logits/rejected": -1.982872724533081, "logps/chosen": -156.9523162841797, "logps/rejected": -196.68821716308594, "loss": 0.557, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0139665603637695, "rewards/margins": 0.41901063919067383, "rewards/rejected": -1.4329770803451538, "step": 15200 }, { "epoch": 2.618883528600965, "eval_logits/chosen": -2.1104273796081543, "eval_logits/rejected": -2.0980310440063477, "eval_logps/chosen": -144.62359619140625, "eval_logps/rejected": -168.3309326171875, "eval_loss": 0.6384539604187012, "eval_rewards/accuracies": 0.6331319808959961, "eval_rewards/chosen": -0.8591170310974121, "eval_rewards/margins": 0.19239100813865662, "eval_rewards/rejected": -1.0515079498291016, "eval_runtime": 384.4424, "eval_samples_per_second": 11.195, "eval_steps_per_second": 1.399, "step": 15200 }, { "epoch": 2.620606478290834, "grad_norm": 18.513978958129883, "learning_rate": 4.793706983477869e-09, "logits/chosen": -1.9127235412597656, "logits/rejected": -1.8748849630355835, "logps/chosen": -161.8665008544922, "logps/rejected": -194.85324096679688, "loss": 0.5976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0613501071929932, "rewards/margins": 0.3695915639400482, "rewards/rejected": -1.4309414625167847, "step": 15210 }, { "epoch": 2.622329427980703, "grad_norm": 26.127431869506836, "learning_rate": 4.750967728056127e-09, "logits/chosen": -1.9632734060287476, "logits/rejected": -1.926553726196289, "logps/chosen": -147.77464294433594, "logps/rejected": -179.37680053710938, "loss": 0.5827, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9555865526199341, "rewards/margins": 0.34206050634384155, "rewards/rejected": -1.2976468801498413, "step": 15220 }, { "epoch": 2.624052377670572, "grad_norm": 16.548908233642578, "learning_rate": 4.7084103467533384e-09, "logits/chosen": -1.9632158279418945, "logits/rejected": -1.942214012145996, "logps/chosen": -159.75546264648438, "logps/rejected": -196.3821258544922, "loss": 0.5782, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0477434396743774, "rewards/margins": 0.37695199251174927, "rewards/rejected": -1.424695611000061, "step": 15230 }, { "epoch": 2.625775327360441, "grad_norm": 16.57700538635254, "learning_rate": 4.666035010624797e-09, "logits/chosen": -1.9485604763031006, "logits/rejected": -1.915534257888794, "logps/chosen": -163.38113403320312, "logps/rejected": -193.19944763183594, "loss": 0.5932, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.043493628501892, "rewards/margins": 0.3496318757534027, "rewards/rejected": -1.3931255340576172, "step": 15240 }, { "epoch": 2.62749827705031, "grad_norm": 22.92910385131836, "learning_rate": 4.623841889994057e-09, "logits/chosen": -1.978944182395935, "logits/rejected": -1.949502944946289, "logps/chosen": -146.48886108398438, "logps/rejected": -191.34922790527344, "loss": 0.5677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9369332194328308, "rewards/margins": 0.4371851980686188, "rewards/rejected": -1.374118447303772, "step": 15250 }, { "epoch": 2.6292212267401793, "grad_norm": 16.984521865844727, "learning_rate": 4.581831154452304e-09, "logits/chosen": -1.9686250686645508, "logits/rejected": -1.944267988204956, "logps/chosen": -156.27908325195312, "logps/rejected": -185.00186157226562, "loss": 0.5965, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.021148920059204, "rewards/margins": 0.32771751284599304, "rewards/rejected": -1.34886634349823, "step": 15260 }, { "epoch": 2.6309441764300483, "grad_norm": 19.00436019897461, "learning_rate": 4.540002972857654e-09, "logits/chosen": -2.0194756984710693, "logits/rejected": -1.9753892421722412, "logps/chosen": -166.2999725341797, "logps/rejected": -198.1792449951172, "loss": 0.6064, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1126136779785156, "rewards/margins": 0.3623610734939575, "rewards/rejected": -1.4749747514724731, "step": 15270 }, { "epoch": 2.6326671261199173, "grad_norm": 18.892955780029297, "learning_rate": 4.498357513334433e-09, "logits/chosen": -2.0918047428131104, "logits/rejected": -2.068206548690796, "logps/chosen": -153.55828857421875, "logps/rejected": -193.32286071777344, "loss": 0.569, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0138264894485474, "rewards/margins": 0.37680578231811523, "rewards/rejected": -1.390632152557373, "step": 15280 }, { "epoch": 2.6343900758097862, "grad_norm": 17.372209548950195, "learning_rate": 4.456894943272532e-09, "logits/chosen": -2.0045294761657715, "logits/rejected": -1.9611526727676392, "logps/chosen": -155.0989227294922, "logps/rejected": -195.74069213867188, "loss": 0.5726, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0035147666931152, "rewards/margins": 0.4420338273048401, "rewards/rejected": -1.4455485343933105, "step": 15290 }, { "epoch": 2.6361130254996556, "grad_norm": 16.459136962890625, "learning_rate": 4.415615429326769e-09, "logits/chosen": -1.889548659324646, "logits/rejected": -1.8532575368881226, "logps/chosen": -154.39035034179688, "logps/rejected": -193.02871704101562, "loss": 0.5819, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0240978002548218, "rewards/margins": 0.4086844325065613, "rewards/rejected": -1.4327824115753174, "step": 15300 }, { "epoch": 2.6361130254996556, "eval_logits/chosen": -2.1091508865356445, "eval_logits/rejected": -2.0966269969940186, "eval_logps/chosen": -144.9197540283203, "eval_logps/rejected": -168.6975555419922, "eval_loss": 0.638420581817627, "eval_rewards/accuracies": 0.6328996419906616, "eval_rewards/chosen": -0.8620786666870117, "eval_rewards/margins": 0.1930956095457077, "eval_rewards/rejected": -1.0551743507385254, "eval_runtime": 384.6842, "eval_samples_per_second": 11.188, "eval_steps_per_second": 1.399, "step": 15300 }, { "epoch": 2.6378359751895246, "grad_norm": 16.342561721801758, "learning_rate": 4.374519137416172e-09, "logits/chosen": -2.0601999759674072, "logits/rejected": -2.0281200408935547, "logps/chosen": -155.6795196533203, "logps/rejected": -196.3771209716797, "loss": 0.5631, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0153143405914307, "rewards/margins": 0.4135005474090576, "rewards/rejected": -1.4288151264190674, "step": 15310 }, { "epoch": 2.6395589248793936, "grad_norm": 22.074016571044922, "learning_rate": 4.333606232723308e-09, "logits/chosen": -1.9733701944351196, "logits/rejected": -1.9525983333587646, "logps/chosen": -153.31787109375, "logps/rejected": -191.0811309814453, "loss": 0.6041, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0211073160171509, "rewards/margins": 0.36150312423706055, "rewards/rejected": -1.3826103210449219, "step": 15320 }, { "epoch": 2.6412818745692626, "grad_norm": 18.97162628173828, "learning_rate": 4.292876879693646e-09, "logits/chosen": -1.9777858257293701, "logits/rejected": -1.9491751194000244, "logps/chosen": -156.2506561279297, "logps/rejected": -191.30422973632812, "loss": 0.5849, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0023764371871948, "rewards/margins": 0.36372584104537964, "rewards/rejected": -1.3661022186279297, "step": 15330 }, { "epoch": 2.6430048242591315, "grad_norm": 18.3383731842041, "learning_rate": 4.252331242034912e-09, "logits/chosen": -1.981006383895874, "logits/rejected": -1.959545373916626, "logps/chosen": -157.00186157226562, "logps/rejected": -193.48870849609375, "loss": 0.5926, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0228397846221924, "rewards/margins": 0.3593161404132843, "rewards/rejected": -1.3821560144424438, "step": 15340 }, { "epoch": 2.6447277739490005, "grad_norm": 24.032392501831055, "learning_rate": 4.211969482716354e-09, "logits/chosen": -1.8748499155044556, "logits/rejected": -1.857659935951233, "logps/chosen": -158.2872314453125, "logps/rejected": -195.51551818847656, "loss": 0.5856, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0382275581359863, "rewards/margins": 0.3772013783454895, "rewards/rejected": -1.4154289960861206, "step": 15350 }, { "epoch": 2.64645072363887, "grad_norm": 18.411428451538086, "learning_rate": 4.171791763968191e-09, "logits/chosen": -2.0040345191955566, "logits/rejected": -1.9880549907684326, "logps/chosen": -155.4371795654297, "logps/rejected": -197.0341033935547, "loss": 0.5916, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0386029481887817, "rewards/margins": 0.3698362708091736, "rewards/rejected": -1.408439040184021, "step": 15360 }, { "epoch": 2.648173673328739, "grad_norm": 20.20896339416504, "learning_rate": 4.131798247280882e-09, "logits/chosen": -2.026376247406006, "logits/rejected": -1.9910662174224854, "logps/chosen": -159.87014770507812, "logps/rejected": -187.96853637695312, "loss": 0.6119, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0375970602035522, "rewards/margins": 0.3267073631286621, "rewards/rejected": -1.3643044233322144, "step": 15370 }, { "epoch": 2.649896623018608, "grad_norm": 18.725107192993164, "learning_rate": 4.091989093404513e-09, "logits/chosen": -2.0294456481933594, "logits/rejected": -2.0028159618377686, "logps/chosen": -155.60501098632812, "logps/rejected": -190.8007354736328, "loss": 0.5836, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0139929056167603, "rewards/margins": 0.36987626552581787, "rewards/rejected": -1.383868932723999, "step": 15380 }, { "epoch": 2.651619572708477, "grad_norm": 14.653457641601562, "learning_rate": 4.052364462348118e-09, "logits/chosen": -2.0725111961364746, "logits/rejected": -2.051964282989502, "logps/chosen": -159.10177612304688, "logps/rejected": -195.51681518554688, "loss": 0.5952, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0287522077560425, "rewards/margins": 0.36905437707901, "rewards/rejected": -1.3978066444396973, "step": 15390 }, { "epoch": 2.6533425223983462, "grad_norm": 15.48855972290039, "learning_rate": 4.01292451337909e-09, "logits/chosen": -2.021852731704712, "logits/rejected": -1.9830955266952515, "logps/chosen": -166.8372344970703, "logps/rejected": -185.28648376464844, "loss": 0.6353, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0809361934661865, "rewards/margins": 0.2534196078777313, "rewards/rejected": -1.3343557119369507, "step": 15400 }, { "epoch": 2.6533425223983462, "eval_logits/chosen": -2.1090543270111084, "eval_logits/rejected": -2.096587657928467, "eval_logps/chosen": -144.8849639892578, "eval_logps/rejected": -168.6601104736328, "eval_loss": 0.638448178768158, "eval_rewards/accuracies": 0.6331319808959961, "eval_rewards/chosen": -0.8617305159568787, "eval_rewards/margins": 0.19306930899620056, "eval_rewards/rejected": -1.0547997951507568, "eval_runtime": 384.7054, "eval_samples_per_second": 11.188, "eval_steps_per_second": 1.398, "step": 15400 }, { "epoch": 2.655065472088215, "grad_norm": 15.053346633911133, "learning_rate": 3.973669405022518e-09, "logits/chosen": -2.004343032836914, "logits/rejected": -1.9556457996368408, "logps/chosen": -168.1739959716797, "logps/rejected": -189.31558227539062, "loss": 0.6202, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0859240293502808, "rewards/margins": 0.29226988554000854, "rewards/rejected": -1.378193974494934, "step": 15410 }, { "epoch": 2.656788421778084, "grad_norm": 19.721511840820312, "learning_rate": 3.934599295060481e-09, "logits/chosen": -1.973044991493225, "logits/rejected": -1.9537404775619507, "logps/chosen": -148.76483154296875, "logps/rejected": -197.9149169921875, "loss": 0.5517, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9632238149642944, "rewards/margins": 0.4792202115058899, "rewards/rejected": -1.4424442052841187, "step": 15420 }, { "epoch": 2.658511371467953, "grad_norm": 17.699020385742188, "learning_rate": 3.895714340531542e-09, "logits/chosen": -2.105011463165283, "logits/rejected": -2.06817889213562, "logps/chosen": -160.15054321289062, "logps/rejected": -195.86900329589844, "loss": 0.5579, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0172804594039917, "rewards/margins": 0.40506964921951294, "rewards/rejected": -1.4223501682281494, "step": 15430 }, { "epoch": 2.660234321157822, "grad_norm": 17.31113052368164, "learning_rate": 3.857014697730027e-09, "logits/chosen": -2.119237184524536, "logits/rejected": -2.093204975128174, "logps/chosen": -156.591796875, "logps/rejected": -188.33387756347656, "loss": 0.5907, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0234885215759277, "rewards/margins": 0.3261372447013855, "rewards/rejected": -1.349625825881958, "step": 15440 }, { "epoch": 2.661957270847691, "grad_norm": 17.5867862701416, "learning_rate": 3.818500522205392e-09, "logits/chosen": -1.837068796157837, "logits/rejected": -1.8134334087371826, "logps/chosen": -157.3140411376953, "logps/rejected": -188.42376708984375, "loss": 0.6071, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0475938320159912, "rewards/margins": 0.3060787618160248, "rewards/rejected": -1.353672742843628, "step": 15450 }, { "epoch": 2.66368022053756, "grad_norm": 17.964181900024414, "learning_rate": 3.7801719687616805e-09, "logits/chosen": -2.0408103466033936, "logits/rejected": -2.024808168411255, "logps/chosen": -160.98797607421875, "logps/rejected": -190.78469848632812, "loss": 0.6146, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0094503164291382, "rewards/margins": 0.32158786058425903, "rewards/rejected": -1.3310383558273315, "step": 15460 }, { "epoch": 2.6654031702274295, "grad_norm": 18.33719253540039, "learning_rate": 3.742029191456792e-09, "logits/chosen": -2.041928291320801, "logits/rejected": -2.014810085296631, "logps/chosen": -171.14865112304688, "logps/rejected": -198.5990447998047, "loss": 0.6128, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1157726049423218, "rewards/margins": 0.31600815057754517, "rewards/rejected": -1.4317806959152222, "step": 15470 }, { "epoch": 2.6671261199172984, "grad_norm": 16.115907669067383, "learning_rate": 3.704072343601955e-09, "logits/chosen": -2.0494565963745117, "logits/rejected": -2.0144147872924805, "logps/chosen": -150.64234924316406, "logps/rejected": -184.3823699951172, "loss": 0.5919, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9542343020439148, "rewards/margins": 0.3764815926551819, "rewards/rejected": -1.3307158946990967, "step": 15480 }, { "epoch": 2.6688490696071674, "grad_norm": 15.832812309265137, "learning_rate": 3.666301577761033e-09, "logits/chosen": -2.012834072113037, "logits/rejected": -1.9978433847427368, "logps/chosen": -158.44149780273438, "logps/rejected": -181.6676483154297, "loss": 0.6317, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0397393703460693, "rewards/margins": 0.258789598941803, "rewards/rejected": -1.2985289096832275, "step": 15490 }, { "epoch": 2.670572019297037, "grad_norm": 24.388607025146484, "learning_rate": 3.628717045750007e-09, "logits/chosen": -1.9957958459854126, "logits/rejected": -1.9769662618637085, "logps/chosen": -166.56466674804688, "logps/rejected": -193.4890899658203, "loss": 0.6352, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1249713897705078, "rewards/margins": 0.26610302925109863, "rewards/rejected": -1.3910744190216064, "step": 15500 }, { "epoch": 2.670572019297037, "eval_logits/chosen": -2.109842538833618, "eval_logits/rejected": -2.0973737239837646, "eval_logps/chosen": -144.62452697753906, "eval_logps/rejected": -168.334228515625, "eval_loss": 0.6385110020637512, "eval_rewards/accuracies": 0.634061336517334, "eval_rewards/chosen": -0.8591262698173523, "eval_rewards/margins": 0.1924147754907608, "eval_rewards/rejected": -1.0515410900115967, "eval_runtime": 384.4335, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.399, "step": 15500 }, { "epoch": 2.6722949689869058, "grad_norm": 18.42940330505371, "learning_rate": 3.591318898636253e-09, "logits/chosen": -1.9370839595794678, "logits/rejected": -1.906272530555725, "logps/chosen": -161.62200927734375, "logps/rejected": -194.5110626220703, "loss": 0.5933, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0326871871948242, "rewards/margins": 0.37472963333129883, "rewards/rejected": -1.4074170589447021, "step": 15510 }, { "epoch": 2.6740179186767747, "grad_norm": 17.27278709411621, "learning_rate": 3.5541072867380174e-09, "logits/chosen": -1.8746843338012695, "logits/rejected": -1.8480383157730103, "logps/chosen": -156.1479034423828, "logps/rejected": -181.21646118164062, "loss": 0.6105, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0187759399414062, "rewards/margins": 0.3010672926902771, "rewards/rejected": -1.3198431730270386, "step": 15520 }, { "epoch": 2.6757408683666437, "grad_norm": 13.988799095153809, "learning_rate": 3.5170823596237852e-09, "logits/chosen": -1.9102674722671509, "logits/rejected": -1.8853724002838135, "logps/chosen": -147.4460906982422, "logps/rejected": -187.87509155273438, "loss": 0.5551, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.934697151184082, "rewards/margins": 0.4121207296848297, "rewards/rejected": -1.3468178510665894, "step": 15530 }, { "epoch": 2.6774638180565127, "grad_norm": 20.964385986328125, "learning_rate": 3.480244266111687e-09, "logits/chosen": -1.9714910984039307, "logits/rejected": -1.9401180744171143, "logps/chosen": -155.9417266845703, "logps/rejected": -189.67864990234375, "loss": 0.6026, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0363017320632935, "rewards/margins": 0.33966827392578125, "rewards/rejected": -1.3759697675704956, "step": 15540 }, { "epoch": 2.6791867677463816, "grad_norm": 16.464771270751953, "learning_rate": 3.4435931542688813e-09, "logits/chosen": -2.0619921684265137, "logits/rejected": -2.027811288833618, "logps/chosen": -160.7606658935547, "logps/rejected": -199.37928771972656, "loss": 0.565, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0543911457061768, "rewards/margins": 0.4127693176269531, "rewards/rejected": -1.4671604633331299, "step": 15550 }, { "epoch": 2.6809097174362506, "grad_norm": 17.458330154418945, "learning_rate": 3.407129171410966e-09, "logits/chosen": -1.99468195438385, "logits/rejected": -1.9875829219818115, "logps/chosen": -155.78231811523438, "logps/rejected": -184.93692016601562, "loss": 0.627, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0481034517288208, "rewards/margins": 0.2652983069419861, "rewards/rejected": -1.3134015798568726, "step": 15560 }, { "epoch": 2.68263266712612, "grad_norm": 16.604339599609375, "learning_rate": 3.3708524641014034e-09, "logits/chosen": -2.024165630340576, "logits/rejected": -2.0003719329833984, "logps/chosen": -156.09683227539062, "logps/rejected": -192.0378875732422, "loss": 0.5906, "rewards/accuracies": 0.65625, "rewards/chosen": -1.036889672279358, "rewards/margins": 0.37345337867736816, "rewards/rejected": -1.4103432893753052, "step": 15570 }, { "epoch": 2.684355616815989, "grad_norm": 17.41901397705078, "learning_rate": 3.3347631781509344e-09, "logits/chosen": -2.0503830909729004, "logits/rejected": -2.0323243141174316, "logps/chosen": -157.08680725097656, "logps/rejected": -196.42340087890625, "loss": 0.585, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0138909816741943, "rewards/margins": 0.372868150472641, "rewards/rejected": -1.3867590427398682, "step": 15580 }, { "epoch": 2.686078566505858, "grad_norm": 18.323165893554688, "learning_rate": 3.298861458616947e-09, "logits/chosen": -2.004791736602783, "logits/rejected": -1.9842307567596436, "logps/chosen": -149.55050659179688, "logps/rejected": -181.10687255859375, "loss": 0.604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9501471519470215, "rewards/margins": 0.32001519203186035, "rewards/rejected": -1.2701623439788818, "step": 15590 }, { "epoch": 2.687801516195727, "grad_norm": 15.208952903747559, "learning_rate": 3.263147449802939e-09, "logits/chosen": -1.9950618743896484, "logits/rejected": -1.9686263799667358, "logps/chosen": -153.86642456054688, "logps/rejected": -191.716796875, "loss": 0.5882, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0199898481369019, "rewards/margins": 0.38182753324508667, "rewards/rejected": -1.4018173217773438, "step": 15600 }, { "epoch": 2.687801516195727, "eval_logits/chosen": -2.1096668243408203, "eval_logits/rejected": -2.0971925258636475, "eval_logps/chosen": -144.52291870117188, "eval_logps/rejected": -168.2864532470703, "eval_loss": 0.638420581817627, "eval_rewards/accuracies": 0.6328996419906616, "eval_rewards/chosen": -0.8581101894378662, "eval_rewards/margins": 0.19295310974121094, "eval_rewards/rejected": -1.0510632991790771, "eval_runtime": 384.764, "eval_samples_per_second": 11.186, "eval_steps_per_second": 1.398, "step": 15600 }, { "epoch": 2.6895244658855963, "grad_norm": 19.43353843688965, "learning_rate": 3.227621295257921e-09, "logits/chosen": -2.0761425495147705, "logits/rejected": -2.0522148609161377, "logps/chosen": -157.3712921142578, "logps/rejected": -191.91091918945312, "loss": 0.587, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0132091045379639, "rewards/margins": 0.3645768463611603, "rewards/rejected": -1.3777856826782227, "step": 15610 }, { "epoch": 2.6912474155754653, "grad_norm": 20.663684844970703, "learning_rate": 3.1922831377758586e-09, "logits/chosen": -1.9529736042022705, "logits/rejected": -1.943288803100586, "logps/chosen": -146.19735717773438, "logps/rejected": -188.9971160888672, "loss": 0.571, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.931942343711853, "rewards/margins": 0.40306201577186584, "rewards/rejected": -1.3350043296813965, "step": 15620 }, { "epoch": 2.6929703652653343, "grad_norm": 17.698686599731445, "learning_rate": 3.1571331193950444e-09, "logits/chosen": -1.9933078289031982, "logits/rejected": -1.9454660415649414, "logps/chosen": -156.48092651367188, "logps/rejected": -194.33956909179688, "loss": 0.5634, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0125987529754639, "rewards/margins": 0.42867451906204224, "rewards/rejected": -1.4412733316421509, "step": 15630 }, { "epoch": 2.6946933149552033, "grad_norm": 18.6409912109375, "learning_rate": 3.1221713813976037e-09, "logits/chosen": -1.9911502599716187, "logits/rejected": -1.9645764827728271, "logps/chosen": -154.6221923828125, "logps/rejected": -194.23597717285156, "loss": 0.578, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0123751163482666, "rewards/margins": 0.3959987461566925, "rewards/rejected": -1.4083738327026367, "step": 15640 }, { "epoch": 2.6964162646450722, "grad_norm": 20.507884979248047, "learning_rate": 3.0873980643088603e-09, "logits/chosen": -2.00331449508667, "logits/rejected": -1.9817699193954468, "logps/chosen": -150.68421936035156, "logps/rejected": -185.39425659179688, "loss": 0.5974, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9733768701553345, "rewards/margins": 0.3487773537635803, "rewards/rejected": -1.3221540451049805, "step": 15650 }, { "epoch": 2.698139214334941, "grad_norm": 26.322948455810547, "learning_rate": 3.052813307896801e-09, "logits/chosen": -2.054105043411255, "logits/rejected": -2.0374743938446045, "logps/chosen": -152.2039794921875, "logps/rejected": -189.50692749023438, "loss": 0.5849, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.995596706867218, "rewards/margins": 0.3572327494621277, "rewards/rejected": -1.3528294563293457, "step": 15660 }, { "epoch": 2.6998621640248106, "grad_norm": 19.634502410888672, "learning_rate": 3.018417251171529e-09, "logits/chosen": -1.923938512802124, "logits/rejected": -1.8853622674942017, "logps/chosen": -153.29351806640625, "logps/rejected": -183.82281494140625, "loss": 0.584, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9805344343185425, "rewards/margins": 0.34349122643470764, "rewards/rejected": -1.3240257501602173, "step": 15670 }, { "epoch": 2.7015851137146796, "grad_norm": 16.40585708618164, "learning_rate": 2.984210032384671e-09, "logits/chosen": -1.9571565389633179, "logits/rejected": -1.9332664012908936, "logps/chosen": -166.0120849609375, "logps/rejected": -201.9367218017578, "loss": 0.5888, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1163781881332397, "rewards/margins": 0.35976526141166687, "rewards/rejected": -1.476143479347229, "step": 15680 }, { "epoch": 2.7033080634045485, "grad_norm": 14.34471607208252, "learning_rate": 2.9501917890288387e-09, "logits/chosen": -1.9870684146881104, "logits/rejected": -1.966143012046814, "logps/chosen": -152.28228759765625, "logps/rejected": -194.99449157714844, "loss": 0.5781, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9983369708061218, "rewards/margins": 0.4102358818054199, "rewards/rejected": -1.408572793006897, "step": 15690 }, { "epoch": 2.7050310130944175, "grad_norm": 13.25967788696289, "learning_rate": 2.9163626578370736e-09, "logits/chosen": -2.0216376781463623, "logits/rejected": -1.993023157119751, "logps/chosen": -155.1454315185547, "logps/rejected": -197.2701416015625, "loss": 0.5698, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.031031847000122, "rewards/margins": 0.4151713252067566, "rewards/rejected": -1.4462032318115234, "step": 15700 }, { "epoch": 2.7050310130944175, "eval_logits/chosen": -2.109771728515625, "eval_logits/rejected": -2.0972399711608887, "eval_logps/chosen": -144.4972381591797, "eval_logps/rejected": -168.24269104003906, "eval_loss": 0.6383943557739258, "eval_rewards/accuracies": 0.6333643198013306, "eval_rewards/chosen": -0.857853353023529, "eval_rewards/margins": 0.1927722692489624, "eval_rewards/rejected": -1.0506255626678467, "eval_runtime": 384.7532, "eval_samples_per_second": 11.186, "eval_steps_per_second": 1.398, "step": 15700 }, { "epoch": 2.706753962784287, "grad_norm": 17.831119537353516, "learning_rate": 2.882722774782315e-09, "logits/chosen": -1.9956130981445312, "logits/rejected": -1.96504807472229, "logps/chosen": -159.65982055664062, "logps/rejected": -201.6159210205078, "loss": 0.5675, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0215837955474854, "rewards/margins": 0.43286770582199097, "rewards/rejected": -1.454451560974121, "step": 15710 }, { "epoch": 2.708476912474156, "grad_norm": 19.918376922607422, "learning_rate": 2.8492722750768305e-09, "logits/chosen": -2.017159938812256, "logits/rejected": -2.002784252166748, "logps/chosen": -159.04013061523438, "logps/rejected": -183.9481658935547, "loss": 0.6265, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0623589754104614, "rewards/margins": 0.2547416090965271, "rewards/rejected": -1.3171006441116333, "step": 15720 }, { "epoch": 2.710199862164025, "grad_norm": 19.61235809326172, "learning_rate": 2.8160112931716663e-09, "logits/chosen": -2.087789297103882, "logits/rejected": -2.0597333908081055, "logps/chosen": -153.22732543945312, "logps/rejected": -190.8411865234375, "loss": 0.5885, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0079309940338135, "rewards/margins": 0.36658722162246704, "rewards/rejected": -1.3745182752609253, "step": 15730 }, { "epoch": 2.711922811853894, "grad_norm": 17.492158889770508, "learning_rate": 2.782939962756126e-09, "logits/chosen": -2.007355213165283, "logits/rejected": -1.9738855361938477, "logps/chosen": -161.60629272460938, "logps/rejected": -185.51075744628906, "loss": 0.6456, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.085443377494812, "rewards/margins": 0.27533307671546936, "rewards/rejected": -1.3607765436172485, "step": 15740 }, { "epoch": 2.713645761543763, "grad_norm": 21.138893127441406, "learning_rate": 2.750058416757245e-09, "logits/chosen": -2.009204149246216, "logits/rejected": -1.983215570449829, "logps/chosen": -164.07150268554688, "logps/rejected": -203.10690307617188, "loss": 0.5728, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1145939826965332, "rewards/margins": 0.36329421401023865, "rewards/rejected": -1.4778882265090942, "step": 15750 }, { "epoch": 2.7153687112336318, "grad_norm": 20.663358688354492, "learning_rate": 2.717366787339209e-09, "logits/chosen": -1.9199457168579102, "logits/rejected": -1.898943305015564, "logps/chosen": -151.90272521972656, "logps/rejected": -178.9022674560547, "loss": 0.6301, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0139811038970947, "rewards/margins": 0.2717815637588501, "rewards/rejected": -1.2857627868652344, "step": 15760 }, { "epoch": 2.717091660923501, "grad_norm": 12.943525314331055, "learning_rate": 2.684865205902881e-09, "logits/chosen": -1.9830385446548462, "logits/rejected": -1.9589884281158447, "logps/chosen": -140.87332153320312, "logps/rejected": -188.70501708984375, "loss": 0.5333, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.890873908996582, "rewards/margins": 0.46456870436668396, "rewards/rejected": -1.3554425239562988, "step": 15770 }, { "epoch": 2.71881461061337, "grad_norm": 16.82303810119629, "learning_rate": 2.6525538030852223e-09, "logits/chosen": -2.1187119483947754, "logits/rejected": -2.1015076637268066, "logps/chosen": -165.7129364013672, "logps/rejected": -183.18618774414062, "loss": 0.6511, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1133878231048584, "rewards/margins": 0.19369569420814514, "rewards/rejected": -1.3070834875106812, "step": 15780 }, { "epoch": 2.720537560303239, "grad_norm": 17.936561584472656, "learning_rate": 2.620432708758802e-09, "logits/chosen": -1.9201587438583374, "logits/rejected": -1.8993772268295288, "logps/chosen": -157.0485382080078, "logps/rejected": -184.74427795410156, "loss": 0.6056, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0300787687301636, "rewards/margins": 0.2902488112449646, "rewards/rejected": -1.3203277587890625, "step": 15790 }, { "epoch": 2.722260509993108, "grad_norm": 19.42104148864746, "learning_rate": 2.5885020520312604e-09, "logits/chosen": -2.056487560272217, "logits/rejected": -2.0100300312042236, "logps/chosen": -156.46908569335938, "logps/rejected": -188.34872436523438, "loss": 0.5774, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9822174310684204, "rewards/margins": 0.3947606682777405, "rewards/rejected": -1.3769781589508057, "step": 15800 }, { "epoch": 2.722260509993108, "eval_logits/chosen": -2.1095082759857178, "eval_logits/rejected": -2.096958875656128, "eval_logps/chosen": -144.47366333007812, "eval_logps/rejected": -168.249755859375, "eval_loss": 0.6382918953895569, "eval_rewards/accuracies": 0.6317379474639893, "eval_rewards/chosen": -0.8576175570487976, "eval_rewards/margins": 0.19307869672775269, "eval_rewards/rejected": -1.0506962537765503, "eval_runtime": 384.649, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 15800 }, { "epoch": 2.7239834596829775, "grad_norm": 22.536474227905273, "learning_rate": 2.5567619612447854e-09, "logits/chosen": -2.051762104034424, "logits/rejected": -2.040144920349121, "logps/chosen": -153.88951110839844, "logps/rejected": -187.16441345214844, "loss": 0.6137, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.051102876663208, "rewards/margins": 0.2977401316165924, "rewards/rejected": -1.3488430976867676, "step": 15810 }, { "epoch": 2.7257064093728465, "grad_norm": 16.64583969116211, "learning_rate": 2.5252125639756207e-09, "logits/chosen": -1.9144083261489868, "logits/rejected": -1.889653205871582, "logps/chosen": -160.00820922851562, "logps/rejected": -192.99110412597656, "loss": 0.6064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0836801528930664, "rewards/margins": 0.3282696306705475, "rewards/rejected": -1.4119497537612915, "step": 15820 }, { "epoch": 2.7274293590627154, "grad_norm": 17.02457618713379, "learning_rate": 2.493853987033523e-09, "logits/chosen": -2.0397231578826904, "logits/rejected": -2.0211498737335205, "logps/chosen": -148.70372009277344, "logps/rejected": -185.701416015625, "loss": 0.5975, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9890076518058777, "rewards/margins": 0.35797032713890076, "rewards/rejected": -1.346977949142456, "step": 15830 }, { "epoch": 2.7291523087525844, "grad_norm": 21.867494583129883, "learning_rate": 2.4626863564612467e-09, "logits/chosen": -2.0498619079589844, "logits/rejected": -2.0379576683044434, "logps/chosen": -169.33444213867188, "logps/rejected": -208.8885498046875, "loss": 0.5975, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.160496473312378, "rewards/margins": 0.3727339208126068, "rewards/rejected": -1.5332304239273071, "step": 15840 }, { "epoch": 2.7308752584424534, "grad_norm": 19.149599075317383, "learning_rate": 2.4317097975340985e-09, "logits/chosen": -2.0424647331237793, "logits/rejected": -2.0164361000061035, "logps/chosen": -147.47756958007812, "logps/rejected": -184.2928466796875, "loss": 0.5879, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9420193433761597, "rewards/margins": 0.3545106053352356, "rewards/rejected": -1.29653000831604, "step": 15850 }, { "epoch": 2.7325982081323223, "grad_norm": 20.371034622192383, "learning_rate": 2.4009244347593604e-09, "logits/chosen": -2.0006725788116455, "logits/rejected": -1.9715356826782227, "logps/chosen": -155.3318634033203, "logps/rejected": -177.36599731445312, "loss": 0.642, "rewards/accuracies": 0.625, "rewards/chosen": -1.0221067667007446, "rewards/margins": 0.24764354526996613, "rewards/rejected": -1.2697503566741943, "step": 15860 }, { "epoch": 2.7343211578221913, "grad_norm": 16.468414306640625, "learning_rate": 2.370330391875819e-09, "logits/chosen": -1.9960463047027588, "logits/rejected": -1.9584850072860718, "logps/chosen": -163.8518524169922, "logps/rejected": -204.46884155273438, "loss": 0.5685, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0862796306610107, "rewards/margins": 0.4320918917655945, "rewards/rejected": -1.51837158203125, "step": 15870 }, { "epoch": 2.7360441075120607, "grad_norm": 17.82473373413086, "learning_rate": 2.3399277918532854e-09, "logits/chosen": -1.972062110900879, "logits/rejected": -1.955783486366272, "logps/chosen": -153.34884643554688, "logps/rejected": -190.64968872070312, "loss": 0.5771, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0146900415420532, "rewards/margins": 0.3780738115310669, "rewards/rejected": -1.3927638530731201, "step": 15880 }, { "epoch": 2.7377670572019297, "grad_norm": 19.587627410888672, "learning_rate": 2.309716756892083e-09, "logits/chosen": -2.03208065032959, "logits/rejected": -1.993096947669983, "logps/chosen": -148.71401977539062, "logps/rejected": -183.1342315673828, "loss": 0.5725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9605844616889954, "rewards/margins": 0.38707417249679565, "rewards/rejected": -1.3476585149765015, "step": 15890 }, { "epoch": 2.7394900068917987, "grad_norm": 24.50597381591797, "learning_rate": 2.2796974084225373e-09, "logits/chosen": -2.016630172729492, "logits/rejected": -1.9673206806182861, "logps/chosen": -167.5557403564453, "logps/rejected": -195.54690551757812, "loss": 0.5948, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.077521562576294, "rewards/margins": 0.35842639207839966, "rewards/rejected": -1.4359480142593384, "step": 15900 }, { "epoch": 2.7394900068917987, "eval_logits/chosen": -2.1088409423828125, "eval_logits/rejected": -2.096285820007324, "eval_logps/chosen": -144.5435791015625, "eval_logps/rejected": -168.2884979248047, "eval_loss": 0.6384817361831665, "eval_rewards/accuracies": 0.6328996419906616, "eval_rewards/chosen": -0.8583167791366577, "eval_rewards/margins": 0.19276687502861023, "eval_rewards/rejected": -1.0510836839675903, "eval_runtime": 385.0149, "eval_samples_per_second": 11.179, "eval_steps_per_second": 1.397, "step": 15900 }, { "epoch": 2.741212956581668, "grad_norm": 17.732595443725586, "learning_rate": 2.249869867104537e-09, "logits/chosen": -1.9359805583953857, "logits/rejected": -1.9071578979492188, "logps/chosen": -150.79183959960938, "logps/rejected": -177.77679443359375, "loss": 0.617, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9779221415519714, "rewards/margins": 0.30262452363967896, "rewards/rejected": -1.2805465459823608, "step": 15910 }, { "epoch": 2.742935906271537, "grad_norm": 23.250537872314453, "learning_rate": 2.220234252826991e-09, "logits/chosen": -1.9514859914779663, "logits/rejected": -1.933777093887329, "logps/chosen": -163.66555786132812, "logps/rejected": -196.0319366455078, "loss": 0.6128, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1111425161361694, "rewards/margins": 0.30749160051345825, "rewards/rejected": -1.4186341762542725, "step": 15920 }, { "epoch": 2.744658855961406, "grad_norm": 17.208866119384766, "learning_rate": 2.190790684707411e-09, "logits/chosen": -1.8933515548706055, "logits/rejected": -1.8573405742645264, "logps/chosen": -149.70046997070312, "logps/rejected": -180.9241180419922, "loss": 0.5765, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9450284242630005, "rewards/margins": 0.36780503392219543, "rewards/rejected": -1.312833309173584, "step": 15930 }, { "epoch": 2.746381805651275, "grad_norm": 16.14655113220215, "learning_rate": 2.161539281091351e-09, "logits/chosen": -1.9840434789657593, "logits/rejected": -1.9504657983779907, "logps/chosen": -154.9799346923828, "logps/rejected": -204.12124633789062, "loss": 0.5441, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9581102132797241, "rewards/margins": 0.49415311217308044, "rewards/rejected": -1.4522634744644165, "step": 15940 }, { "epoch": 2.748104755341144, "grad_norm": 17.73128318786621, "learning_rate": 2.1324801595520357e-09, "logits/chosen": -2.0518643856048584, "logits/rejected": -2.0237386226654053, "logps/chosen": -152.72918701171875, "logps/rejected": -185.44696044921875, "loss": 0.5762, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9485227465629578, "rewards/margins": 0.37847551703453064, "rewards/rejected": -1.326998233795166, "step": 15950 }, { "epoch": 2.749827705031013, "grad_norm": 15.017594337463379, "learning_rate": 2.1036134368897785e-09, "logits/chosen": -2.02626371383667, "logits/rejected": -1.9976743459701538, "logps/chosen": -156.98794555664062, "logps/rejected": -189.72183227539062, "loss": 0.5933, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0053889751434326, "rewards/margins": 0.3594733476638794, "rewards/rejected": -1.3648624420166016, "step": 15960 }, { "epoch": 2.751550654720882, "grad_norm": 17.680301666259766, "learning_rate": 2.0749392291315894e-09, "logits/chosen": -2.0000274181365967, "logits/rejected": -1.972936987876892, "logps/chosen": -155.306396484375, "logps/rejected": -201.07473754882812, "loss": 0.5476, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.001050591468811, "rewards/margins": 0.451840877532959, "rewards/rejected": -1.4528913497924805, "step": 15970 }, { "epoch": 2.7532736044107513, "grad_norm": 19.339937210083008, "learning_rate": 2.046457651530686e-09, "logits/chosen": -1.9725288152694702, "logits/rejected": -1.9468179941177368, "logps/chosen": -158.36660766601562, "logps/rejected": -186.2929229736328, "loss": 0.6126, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0180027484893799, "rewards/margins": 0.288787841796875, "rewards/rejected": -1.3067904710769653, "step": 15980 }, { "epoch": 2.7549965541006203, "grad_norm": 18.81197738647461, "learning_rate": 2.0181688185660183e-09, "logits/chosen": -2.130431652069092, "logits/rejected": -2.1292641162872314, "logps/chosen": -155.34317016601562, "logps/rejected": -187.73655700683594, "loss": 0.6071, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0065585374832153, "rewards/margins": 0.3134516477584839, "rewards/rejected": -1.3200101852416992, "step": 15990 }, { "epoch": 2.7567195037904892, "grad_norm": 17.611356735229492, "learning_rate": 1.99007284394182e-09, "logits/chosen": -1.9634910821914673, "logits/rejected": -1.9308748245239258, "logps/chosen": -156.92080688476562, "logps/rejected": -188.16241455078125, "loss": 0.5977, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0308951139450073, "rewards/margins": 0.33166080713272095, "rewards/rejected": -1.3625560998916626, "step": 16000 }, { "epoch": 2.7567195037904892, "eval_logits/chosen": -2.108428716659546, "eval_logits/rejected": -2.0958633422851562, "eval_logps/chosen": -144.63160705566406, "eval_logps/rejected": -168.45057678222656, "eval_loss": 0.6381997466087341, "eval_rewards/accuracies": 0.6342936754226685, "eval_rewards/chosen": -0.8591971397399902, "eval_rewards/margins": 0.19350731372833252, "eval_rewards/rejected": -1.0527044534683228, "eval_runtime": 384.7568, "eval_samples_per_second": 11.186, "eval_steps_per_second": 1.398, "step": 16000 }, { "epoch": 2.758442453480358, "grad_norm": 28.459753036499023, "learning_rate": 1.9621698405871466e-09, "logits/chosen": -2.0574021339416504, "logits/rejected": -2.039811849594116, "logps/chosen": -162.7482147216797, "logps/rejected": -195.13963317871094, "loss": 0.6146, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.075080156326294, "rewards/margins": 0.33885112404823303, "rewards/rejected": -1.4139313697814941, "step": 16010 }, { "epoch": 2.7601654031702276, "grad_norm": 13.295467376708984, "learning_rate": 1.934459920655429e-09, "logits/chosen": -2.096675157546997, "logits/rejected": -2.075190305709839, "logps/chosen": -160.48056030273438, "logps/rejected": -192.89895629882812, "loss": 0.6157, "rewards/accuracies": 0.625, "rewards/chosen": -1.073760747909546, "rewards/margins": 0.3442454934120178, "rewards/rejected": -1.418006181716919, "step": 16020 }, { "epoch": 2.7618883528600966, "grad_norm": 21.12601089477539, "learning_rate": 1.90694319552403e-09, "logits/chosen": -2.0830025672912598, "logits/rejected": -2.0588667392730713, "logps/chosen": -157.102294921875, "logps/rejected": -191.44021606445312, "loss": 0.6006, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0407729148864746, "rewards/margins": 0.34737643599510193, "rewards/rejected": -1.3881492614746094, "step": 16030 }, { "epoch": 2.7636113025499656, "grad_norm": 21.42576026916504, "learning_rate": 1.879619775793756e-09, "logits/chosen": -2.023306369781494, "logits/rejected": -2.002659559249878, "logps/chosen": -155.07742309570312, "logps/rejected": -195.07296752929688, "loss": 0.5789, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0129282474517822, "rewards/margins": 0.41328755021095276, "rewards/rejected": -1.4262158870697021, "step": 16040 }, { "epoch": 2.7653342522398345, "grad_norm": 16.90118408203125, "learning_rate": 1.8524897712884514e-09, "logits/chosen": -1.9849954843521118, "logits/rejected": -1.9626471996307373, "logps/chosen": -154.33963012695312, "logps/rejected": -197.6629180908203, "loss": 0.5528, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9994105100631714, "rewards/margins": 0.43908247351646423, "rewards/rejected": -1.4384931325912476, "step": 16050 }, { "epoch": 2.7670572019297035, "grad_norm": 17.3250675201416, "learning_rate": 1.8255532910545657e-09, "logits/chosen": -2.029186964035034, "logits/rejected": -2.0078043937683105, "logps/chosen": -152.3870391845703, "logps/rejected": -187.66786193847656, "loss": 0.5805, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.966718316078186, "rewards/margins": 0.35933464765548706, "rewards/rejected": -1.3260529041290283, "step": 16060 }, { "epoch": 2.7687801516195725, "grad_norm": 14.801129341125488, "learning_rate": 1.798810443360671e-09, "logits/chosen": -1.931652307510376, "logits/rejected": -1.901280403137207, "logps/chosen": -156.13992309570312, "logps/rejected": -188.70851135253906, "loss": 0.5804, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9906762838363647, "rewards/margins": 0.3544227182865143, "rewards/rejected": -1.3450990915298462, "step": 16070 }, { "epoch": 2.770503101309442, "grad_norm": 19.460756301879883, "learning_rate": 1.7722613356970728e-09, "logits/chosen": -2.025620937347412, "logits/rejected": -1.983106017112732, "logps/chosen": -162.57888793945312, "logps/rejected": -200.30520629882812, "loss": 0.5703, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0547617673873901, "rewards/margins": 0.4325917661190033, "rewards/rejected": -1.4873535633087158, "step": 16080 }, { "epoch": 2.772226050999311, "grad_norm": 19.991775512695312, "learning_rate": 1.745906074775344e-09, "logits/chosen": -2.0023186206817627, "logits/rejected": -1.9706628322601318, "logps/chosen": -146.0303955078125, "logps/rejected": -183.83497619628906, "loss": 0.5711, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.931313157081604, "rewards/margins": 0.3988117277622223, "rewards/rejected": -1.3301247358322144, "step": 16090 }, { "epoch": 2.77394900068918, "grad_norm": 19.24905014038086, "learning_rate": 1.7197447665279142e-09, "logits/chosen": -2.0427939891815186, "logits/rejected": -2.022566556930542, "logps/chosen": -150.73016357421875, "logps/rejected": -203.5421600341797, "loss": 0.5412, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9851313829421997, "rewards/margins": 0.5049887299537659, "rewards/rejected": -1.4901201725006104, "step": 16100 }, { "epoch": 2.77394900068918, "eval_logits/chosen": -2.108128309249878, "eval_logits/rejected": -2.095674753189087, "eval_logps/chosen": -144.78477478027344, "eval_logps/rejected": -168.52581787109375, "eval_loss": 0.6385473608970642, "eval_rewards/accuracies": 0.634061336517334, "eval_rewards/chosen": -0.8607288002967834, "eval_rewards/margins": 0.19272837042808533, "eval_rewards/rejected": -1.0534571409225464, "eval_runtime": 384.9125, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 16100 }, { "epoch": 2.775671950379049, "grad_norm": 21.999513626098633, "learning_rate": 1.6937775161076251e-09, "logits/chosen": -1.8801167011260986, "logits/rejected": -1.8405097723007202, "logps/chosen": -151.8373565673828, "logps/rejected": -191.4863739013672, "loss": 0.5687, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9473932385444641, "rewards/margins": 0.42907577753067017, "rewards/rejected": -1.3764688968658447, "step": 16110 }, { "epoch": 2.777394900068918, "grad_norm": 19.513912200927734, "learning_rate": 1.6680044278873428e-09, "logits/chosen": -1.9779163599014282, "logits/rejected": -1.961771011352539, "logps/chosen": -144.7937469482422, "logps/rejected": -179.3964080810547, "loss": 0.5964, "rewards/accuracies": 0.71875, "rewards/chosen": -0.942004382610321, "rewards/margins": 0.3237936794757843, "rewards/rejected": -1.2657980918884277, "step": 16120 }, { "epoch": 2.779117849758787, "grad_norm": 19.13042640686035, "learning_rate": 1.6424256054595187e-09, "logits/chosen": -1.9808464050292969, "logits/rejected": -1.9533973932266235, "logps/chosen": -157.46209716796875, "logps/rejected": -189.03805541992188, "loss": 0.6093, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0505516529083252, "rewards/margins": 0.32341963052749634, "rewards/rejected": -1.3739712238311768, "step": 16130 }, { "epoch": 2.780840799448656, "grad_norm": 18.319740295410156, "learning_rate": 1.6170411516357563e-09, "logits/chosen": -2.0916028022766113, "logits/rejected": -2.0648059844970703, "logps/chosen": -149.63636779785156, "logps/rejected": -188.07699584960938, "loss": 0.5718, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9687054753303528, "rewards/margins": 0.3894537389278412, "rewards/rejected": -1.3581594228744507, "step": 16140 }, { "epoch": 2.782563749138525, "grad_norm": 21.89604377746582, "learning_rate": 1.5918511684464008e-09, "logits/chosen": -2.04638409614563, "logits/rejected": -2.0222058296203613, "logps/chosen": -160.7626495361328, "logps/rejected": -193.0424041748047, "loss": 0.5972, "rewards/accuracies": 0.625, "rewards/chosen": -1.0631176233291626, "rewards/margins": 0.343940794467926, "rewards/rejected": -1.4070584774017334, "step": 16150 }, { "epoch": 2.784286698828394, "grad_norm": 20.658361434936523, "learning_rate": 1.5668557571401786e-09, "logits/chosen": -2.0624663829803467, "logits/rejected": -2.0287413597106934, "logps/chosen": -150.15054321289062, "logps/rejected": -195.22293090820312, "loss": 0.5459, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9648138284683228, "rewards/margins": 0.45158594846725464, "rewards/rejected": -1.416399598121643, "step": 16160 }, { "epoch": 2.786009648518263, "grad_norm": 14.938721656799316, "learning_rate": 1.5420550181837245e-09, "logits/chosen": -1.9357715845108032, "logits/rejected": -1.9069770574569702, "logps/chosen": -156.93417358398438, "logps/rejected": -191.391357421875, "loss": 0.5867, "rewards/accuracies": 0.6875, "rewards/chosen": -1.029882788658142, "rewards/margins": 0.346625953912735, "rewards/rejected": -1.3765085935592651, "step": 16170 }, { "epoch": 2.7877325982081325, "grad_norm": 21.733293533325195, "learning_rate": 1.517449051261227e-09, "logits/chosen": -1.9778528213500977, "logits/rejected": -1.9300695657730103, "logps/chosen": -168.5574188232422, "logps/rejected": -203.21047973632812, "loss": 0.5801, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0970590114593506, "rewards/margins": 0.4118477702140808, "rewards/rejected": -1.508906602859497, "step": 16180 }, { "epoch": 2.7894555478980014, "grad_norm": 15.982828140258789, "learning_rate": 1.4930379552739791e-09, "logits/chosen": -1.9926446676254272, "logits/rejected": -1.9684278964996338, "logps/chosen": -160.42800903320312, "logps/rejected": -195.6782684326172, "loss": 0.6044, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0718891620635986, "rewards/margins": 0.33643192052841187, "rewards/rejected": -1.4083211421966553, "step": 16190 }, { "epoch": 2.7911784975878704, "grad_norm": 15.929803848266602, "learning_rate": 1.4688218283400334e-09, "logits/chosen": -1.925018072128296, "logits/rejected": -1.882367730140686, "logps/chosen": -155.81529235839844, "logps/rejected": -187.7417755126953, "loss": 0.6015, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0091586112976074, "rewards/margins": 0.3661991059780121, "rewards/rejected": -1.3753578662872314, "step": 16200 }, { "epoch": 2.7911784975878704, "eval_logits/chosen": -2.1086368560791016, "eval_logits/rejected": -2.0961124897003174, "eval_logps/chosen": -144.7053985595703, "eval_logps/rejected": -168.44850158691406, "eval_loss": 0.638496458530426, "eval_rewards/accuracies": 0.6319702863693237, "eval_rewards/chosen": -0.8599350452423096, "eval_rewards/margins": 0.19274887442588806, "eval_rewards/rejected": -1.0526838302612305, "eval_runtime": 384.9545, "eval_samples_per_second": 11.181, "eval_steps_per_second": 1.398, "step": 16200 }, { "epoch": 2.7929014472777394, "grad_norm": 18.865571975708008, "learning_rate": 1.4448007677937746e-09, "logits/chosen": -1.8919999599456787, "logits/rejected": -1.8706039190292358, "logps/chosen": -152.81573486328125, "logps/rejected": -187.83419799804688, "loss": 0.601, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0088460445404053, "rewards/margins": 0.3328995406627655, "rewards/rejected": -1.3417456150054932, "step": 16210 }, { "epoch": 2.794624396967609, "grad_norm": 16.069080352783203, "learning_rate": 1.420974870185543e-09, "logits/chosen": -1.9647763967514038, "logits/rejected": -1.938943862915039, "logps/chosen": -151.71485900878906, "logps/rejected": -194.41685485839844, "loss": 0.5638, "rewards/accuracies": 0.75, "rewards/chosen": -0.9527828097343445, "rewards/margins": 0.43891993165016174, "rewards/rejected": -1.3917028903961182, "step": 16220 }, { "epoch": 2.7963473466574778, "grad_norm": 15.390767097473145, "learning_rate": 1.3973442312812278e-09, "logits/chosen": -2.0187325477600098, "logits/rejected": -1.9925727844238281, "logps/chosen": -154.63693237304688, "logps/rejected": -191.4229736328125, "loss": 0.5864, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0308500528335571, "rewards/margins": 0.36913809180259705, "rewards/rejected": -1.399988055229187, "step": 16230 }, { "epoch": 2.7980702963473467, "grad_norm": 17.4558162689209, "learning_rate": 1.373908946061908e-09, "logits/chosen": -1.9780066013336182, "logits/rejected": -1.9576774835586548, "logps/chosen": -158.3379364013672, "logps/rejected": -191.57717895507812, "loss": 0.6197, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.067734718322754, "rewards/margins": 0.31724998354911804, "rewards/rejected": -1.3849847316741943, "step": 16240 }, { "epoch": 2.7997932460372157, "grad_norm": 19.38920021057129, "learning_rate": 1.3506691087234457e-09, "logits/chosen": -2.012394905090332, "logits/rejected": -1.986419677734375, "logps/chosen": -158.73910522460938, "logps/rejected": -189.1681365966797, "loss": 0.5957, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0126402378082275, "rewards/margins": 0.3302934467792511, "rewards/rejected": -1.3429334163665771, "step": 16250 }, { "epoch": 2.8015161957270847, "grad_norm": 20.637210845947266, "learning_rate": 1.3276248126761259e-09, "logits/chosen": -1.9926544427871704, "logits/rejected": -1.9646838903427124, "logps/chosen": -160.13150024414062, "logps/rejected": -203.32470703125, "loss": 0.591, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0744736194610596, "rewards/margins": 0.40492677688598633, "rewards/rejected": -1.479400396347046, "step": 16260 }, { "epoch": 2.8032391454169536, "grad_norm": 18.54698371887207, "learning_rate": 1.304776150544279e-09, "logits/chosen": -2.032370090484619, "logits/rejected": -2.0050148963928223, "logps/chosen": -159.2360076904297, "logps/rejected": -193.20437622070312, "loss": 0.5965, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0389238595962524, "rewards/margins": 0.3588295876979828, "rewards/rejected": -1.397753357887268, "step": 16270 }, { "epoch": 2.804962095106823, "grad_norm": 18.229055404663086, "learning_rate": 1.2821232141658866e-09, "logits/chosen": -2.099778652191162, "logits/rejected": -2.07720685005188, "logps/chosen": -163.57701110839844, "logps/rejected": -193.65328979492188, "loss": 0.6071, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0794527530670166, "rewards/margins": 0.3316441476345062, "rewards/rejected": -1.4110968112945557, "step": 16280 }, { "epoch": 2.806685044796692, "grad_norm": 14.595271110534668, "learning_rate": 1.2596660945922433e-09, "logits/chosen": -2.0501091480255127, "logits/rejected": -2.0319061279296875, "logps/chosen": -150.327392578125, "logps/rejected": -194.3672637939453, "loss": 0.5694, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9848111867904663, "rewards/margins": 0.4135316014289856, "rewards/rejected": -1.3983427286148071, "step": 16290 }, { "epoch": 2.808407994486561, "grad_norm": 16.280427932739258, "learning_rate": 1.2374048820875893e-09, "logits/chosen": -2.0588290691375732, "logits/rejected": -2.0250751972198486, "logps/chosen": -158.71620178222656, "logps/rejected": -193.8098907470703, "loss": 0.5921, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0488011837005615, "rewards/margins": 0.3753056526184082, "rewards/rejected": -1.4241067171096802, "step": 16300 }, { "epoch": 2.808407994486561, "eval_logits/chosen": -2.108409881591797, "eval_logits/rejected": -2.095869779586792, "eval_logps/chosen": -144.73362731933594, "eval_logps/rejected": -168.55264282226562, "eval_loss": 0.6382446885108948, "eval_rewards/accuracies": 0.6338289976119995, "eval_rewards/chosen": -0.8602172136306763, "eval_rewards/margins": 0.1935078501701355, "eval_rewards/rejected": -1.0537251234054565, "eval_runtime": 384.8236, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 16300 }, { "epoch": 2.81013094417643, "grad_norm": 16.540910720825195, "learning_rate": 1.2153396661287007e-09, "logits/chosen": -1.9610528945922852, "logits/rejected": -1.9452483654022217, "logps/chosen": -156.6735076904297, "logps/rejected": -189.00901794433594, "loss": 0.6119, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0357961654663086, "rewards/margins": 0.3104301393032074, "rewards/rejected": -1.346226453781128, "step": 16310 }, { "epoch": 2.8118538938662994, "grad_norm": 24.500307083129883, "learning_rate": 1.1934705354045894e-09, "logits/chosen": -1.992566466331482, "logits/rejected": -1.968497633934021, "logps/chosen": -157.59774780273438, "logps/rejected": -196.737060546875, "loss": 0.5799, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0343701839447021, "rewards/margins": 0.3823673129081726, "rewards/rejected": -1.4167375564575195, "step": 16320 }, { "epoch": 2.8135768435561683, "grad_norm": 14.111525535583496, "learning_rate": 1.1717975778161193e-09, "logits/chosen": -2.036130428314209, "logits/rejected": -2.010246515274048, "logps/chosen": -158.2937774658203, "logps/rejected": -184.37525939941406, "loss": 0.6179, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0042508840560913, "rewards/margins": 0.2979941964149475, "rewards/rejected": -1.302245020866394, "step": 16330 }, { "epoch": 2.8152997932460373, "grad_norm": 16.498504638671875, "learning_rate": 1.1503208804756526e-09, "logits/chosen": -1.914899468421936, "logits/rejected": -1.892896056175232, "logps/chosen": -151.40765380859375, "logps/rejected": -193.80078125, "loss": 0.5526, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9697238802909851, "rewards/margins": 0.43620944023132324, "rewards/rejected": -1.4059332609176636, "step": 16340 }, { "epoch": 2.8170227429359063, "grad_norm": 14.339289665222168, "learning_rate": 1.1290405297066984e-09, "logits/chosen": -2.111816883087158, "logits/rejected": -2.077700614929199, "logps/chosen": -161.0608367919922, "logps/rejected": -192.53443908691406, "loss": 0.5923, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0527642965316772, "rewards/margins": 0.33008676767349243, "rewards/rejected": -1.3828508853912354, "step": 16350 }, { "epoch": 2.8187456926257752, "grad_norm": 25.71321678161621, "learning_rate": 1.1079566110435812e-09, "logits/chosen": -1.9828803539276123, "logits/rejected": -1.9541343450546265, "logps/chosen": -149.381591796875, "logps/rejected": -188.98953247070312, "loss": 0.5666, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9693704843521118, "rewards/margins": 0.41178736090660095, "rewards/rejected": -1.381157636642456, "step": 16360 }, { "epoch": 2.820468642315644, "grad_norm": 23.42551040649414, "learning_rate": 1.0870692092310674e-09, "logits/chosen": -1.9621902704238892, "logits/rejected": -1.9371719360351562, "logps/chosen": -152.97149658203125, "logps/rejected": -177.88674926757812, "loss": 0.6181, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9902588725090027, "rewards/margins": 0.28649386763572693, "rewards/rejected": -1.2767528295516968, "step": 16370 }, { "epoch": 2.822191592005513, "grad_norm": 18.16157341003418, "learning_rate": 1.0663784082240556e-09, "logits/chosen": -2.0195364952087402, "logits/rejected": -2.001188278198242, "logps/chosen": -151.81814575195312, "logps/rejected": -191.2435760498047, "loss": 0.5744, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9752138257026672, "rewards/margins": 0.38728389143943787, "rewards/rejected": -1.3624976873397827, "step": 16380 }, { "epoch": 2.8239145416953826, "grad_norm": 15.820906639099121, "learning_rate": 1.0458842911872213e-09, "logits/chosen": -1.944976568222046, "logits/rejected": -1.9171807765960693, "logps/chosen": -148.1985321044922, "logps/rejected": -185.3063201904297, "loss": 0.5975, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9576689600944519, "rewards/margins": 0.373207688331604, "rewards/rejected": -1.3308767080307007, "step": 16390 }, { "epoch": 2.8256374913852516, "grad_norm": 15.738819122314453, "learning_rate": 1.0255869404947049e-09, "logits/chosen": -1.9565589427947998, "logits/rejected": -1.936244010925293, "logps/chosen": -153.927978515625, "logps/rejected": -188.249755859375, "loss": 0.5958, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9944507479667664, "rewards/margins": 0.34070777893066406, "rewards/rejected": -1.3351585865020752, "step": 16400 }, { "epoch": 2.8256374913852516, "eval_logits/chosen": -2.1078200340270996, "eval_logits/rejected": -2.095283269882202, "eval_logps/chosen": -144.73094177246094, "eval_logps/rejected": -168.52134704589844, "eval_loss": 0.638367772102356, "eval_rewards/accuracies": 0.6322026252746582, "eval_rewards/chosen": -0.8601904511451721, "eval_rewards/margins": 0.19322170317173004, "eval_rewards/rejected": -1.0534123182296753, "eval_runtime": 384.8196, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 16400 }, { "epoch": 2.8273604410751205, "grad_norm": 18.241600036621094, "learning_rate": 1.0054864377297357e-09, "logits/chosen": -2.0109992027282715, "logits/rejected": -1.9712998867034912, "logps/chosen": -165.53500366210938, "logps/rejected": -191.2516632080078, "loss": 0.6037, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1030426025390625, "rewards/margins": 0.32026731967926025, "rewards/rejected": -1.4233099222183228, "step": 16410 }, { "epoch": 2.82908339076499, "grad_norm": 20.470531463623047, "learning_rate": 9.855828636843422e-10, "logits/chosen": -1.892657995223999, "logits/rejected": -1.876413345336914, "logps/chosen": -159.25308227539062, "logps/rejected": -191.30711364746094, "loss": 0.6226, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0499417781829834, "rewards/margins": 0.29584425687789917, "rewards/rejected": -1.3457859754562378, "step": 16420 }, { "epoch": 2.830806340454859, "grad_norm": 23.88309669494629, "learning_rate": 9.65876298359025e-10, "logits/chosen": -1.9049713611602783, "logits/rejected": -1.8888660669326782, "logps/chosen": -162.74562072753906, "logps/rejected": -189.8667449951172, "loss": 0.6336, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0874176025390625, "rewards/margins": 0.2742784023284912, "rewards/rejected": -1.3616960048675537, "step": 16430 }, { "epoch": 2.832529290144728, "grad_norm": 17.05344009399414, "learning_rate": 9.463668209624298e-10, "logits/chosen": -1.9784685373306274, "logits/rejected": -1.9553331136703491, "logps/chosen": -152.5343780517578, "logps/rejected": -191.367431640625, "loss": 0.5915, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0134600400924683, "rewards/margins": 0.3821795582771301, "rewards/rejected": -1.3956396579742432, "step": 16440 }, { "epoch": 2.834252239834597, "grad_norm": 17.71872329711914, "learning_rate": 9.270545099110072e-10, "logits/chosen": -2.0652825832366943, "logits/rejected": -2.032459259033203, "logps/chosen": -155.99710083007812, "logps/rejected": -203.09974670410156, "loss": 0.5299, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9963186383247375, "rewards/margins": 0.483388751745224, "rewards/rejected": -1.4797072410583496, "step": 16450 }, { "epoch": 2.835975189524466, "grad_norm": 21.26156997680664, "learning_rate": 9.079394428287312e-10, "logits/chosen": -1.8806241750717163, "logits/rejected": -1.866742491722107, "logps/chosen": -142.49191284179688, "logps/rejected": -188.5543212890625, "loss": 0.5601, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9312785267829895, "rewards/margins": 0.4463452696800232, "rewards/rejected": -1.3776237964630127, "step": 16460 }, { "epoch": 2.837698139214335, "grad_norm": 15.324095726013184, "learning_rate": 8.890216965467656e-10, "logits/chosen": -1.9877513647079468, "logits/rejected": -1.9621587991714478, "logps/chosen": -146.3279571533203, "logps/rejected": -190.43270874023438, "loss": 0.5681, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9536417722702026, "rewards/margins": 0.4272822439670563, "rewards/rejected": -1.3809239864349365, "step": 16470 }, { "epoch": 2.8394210889042037, "grad_norm": 21.810314178466797, "learning_rate": 8.70301347103175e-10, "logits/chosen": -1.8843555450439453, "logits/rejected": -1.8597116470336914, "logps/chosen": -150.2646026611328, "logps/rejected": -193.82888793945312, "loss": 0.5544, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9751816987991333, "rewards/margins": 0.4222782254219055, "rewards/rejected": -1.3974599838256836, "step": 16480 }, { "epoch": 2.841144038594073, "grad_norm": 20.657318115234375, "learning_rate": 8.517784697425978e-10, "logits/chosen": -1.9926307201385498, "logits/rejected": -1.9721381664276123, "logps/chosen": -148.2007598876953, "logps/rejected": -175.26956176757812, "loss": 0.6114, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9608391523361206, "rewards/margins": 0.2820325195789337, "rewards/rejected": -1.2428717613220215, "step": 16490 }, { "epoch": 2.842866988283942, "grad_norm": 16.048295974731445, "learning_rate": 8.334531389159349e-10, "logits/chosen": -2.030116558074951, "logits/rejected": -1.979917287826538, "logps/chosen": -153.33575439453125, "logps/rejected": -177.27943420410156, "loss": 0.5977, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9533479809761047, "rewards/margins": 0.320335328578949, "rewards/rejected": -1.2736831903457642, "step": 16500 }, { "epoch": 2.842866988283942, "eval_logits/chosen": -2.1077017784118652, "eval_logits/rejected": -2.0952095985412598, "eval_logps/chosen": -144.718017578125, "eval_logps/rejected": -168.49497985839844, "eval_loss": 0.63838791847229, "eval_rewards/accuracies": 0.6333643198013306, "eval_rewards/chosen": -0.8600613474845886, "eval_rewards/margins": 0.19308704137802124, "eval_rewards/rejected": -1.0531483888626099, "eval_runtime": 384.9223, "eval_samples_per_second": 11.181, "eval_steps_per_second": 1.398, "step": 16500 }, { "epoch": 2.844589937973811, "grad_norm": 19.646753311157227, "learning_rate": 8.153254282801114e-10, "logits/chosen": -2.0270133018493652, "logits/rejected": -2.005885601043701, "logps/chosen": -156.2147979736328, "logps/rejected": -188.6864013671875, "loss": 0.594, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9929972887039185, "rewards/margins": 0.3382185101509094, "rewards/rejected": -1.3312156200408936, "step": 16510 }, { "epoch": 2.84631288766368, "grad_norm": 17.678985595703125, "learning_rate": 7.973954106976876e-10, "logits/chosen": -2.102579116821289, "logits/rejected": -2.065721273422241, "logps/chosen": -163.9256134033203, "logps/rejected": -207.6787109375, "loss": 0.5568, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0788968801498413, "rewards/margins": 0.46170586347579956, "rewards/rejected": -1.540602684020996, "step": 16520 }, { "epoch": 2.8480358373535495, "grad_norm": 18.857799530029297, "learning_rate": 7.796631582366486e-10, "logits/chosen": -1.9477729797363281, "logits/rejected": -1.9222466945648193, "logps/chosen": -157.16702270507812, "logps/rejected": -178.43026733398438, "loss": 0.623, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0100066661834717, "rewards/margins": 0.24954771995544434, "rewards/rejected": -1.2595542669296265, "step": 16530 }, { "epoch": 2.8497587870434185, "grad_norm": 17.755603790283203, "learning_rate": 7.621287421700762e-10, "logits/chosen": -1.9886562824249268, "logits/rejected": -1.966334342956543, "logps/chosen": -160.26039123535156, "logps/rejected": -185.20523071289062, "loss": 0.6259, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0719877481460571, "rewards/margins": 0.28443509340286255, "rewards/rejected": -1.356422781944275, "step": 16540 }, { "epoch": 2.8514817367332874, "grad_norm": 20.178653717041016, "learning_rate": 7.447922329758605e-10, "logits/chosen": -1.929281234741211, "logits/rejected": -1.9075992107391357, "logps/chosen": -159.108642578125, "logps/rejected": -203.34298706054688, "loss": 0.5629, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0542900562286377, "rewards/margins": 0.43164771795272827, "rewards/rejected": -1.4859378337860107, "step": 16550 }, { "epoch": 2.8532046864231564, "grad_norm": 17.284324645996094, "learning_rate": 7.276537003364225e-10, "logits/chosen": -2.0200042724609375, "logits/rejected": -1.9907087087631226, "logps/chosen": -152.72119140625, "logps/rejected": -194.5865936279297, "loss": 0.5733, "rewards/accuracies": 0.71875, "rewards/chosen": -0.985319972038269, "rewards/margins": 0.3901628851890564, "rewards/rejected": -1.3754827976226807, "step": 16560 }, { "epoch": 2.8549276361130254, "grad_norm": 20.773216247558594, "learning_rate": 7.107132131384475e-10, "logits/chosen": -2.0711119174957275, "logits/rejected": -2.037046194076538, "logps/chosen": -153.63104248046875, "logps/rejected": -189.26712036132812, "loss": 0.5775, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0029370784759521, "rewards/margins": 0.3758252263069153, "rewards/rejected": -1.3787622451782227, "step": 16570 }, { "epoch": 2.8566505858028943, "grad_norm": 14.877636909484863, "learning_rate": 6.939708394725907e-10, "logits/chosen": -1.966944932937622, "logits/rejected": -1.9517587423324585, "logps/chosen": -155.24636840820312, "logps/rejected": -186.4626922607422, "loss": 0.6154, "rewards/accuracies": 0.6875, "rewards/chosen": -1.043582797050476, "rewards/margins": 0.2935950458049774, "rewards/rejected": -1.3371779918670654, "step": 16580 }, { "epoch": 2.8583735354927637, "grad_norm": 47.11408996582031, "learning_rate": 6.774266466331946e-10, "logits/chosen": -1.9278028011322021, "logits/rejected": -1.8968565464019775, "logps/chosen": -162.50057983398438, "logps/rejected": -183.57374572753906, "loss": 0.6356, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.086142897605896, "rewards/margins": 0.24400201439857483, "rewards/rejected": -1.3301448822021484, "step": 16590 }, { "epoch": 2.8600964851826327, "grad_norm": 22.328306198120117, "learning_rate": 6.610807011180552e-10, "logits/chosen": -1.982736587524414, "logits/rejected": -1.9709069728851318, "logps/chosen": -164.12379455566406, "logps/rejected": -190.572998046875, "loss": 0.6289, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0694471597671509, "rewards/margins": 0.278270959854126, "rewards/rejected": -1.3477180004119873, "step": 16600 }, { "epoch": 2.8600964851826327, "eval_logits/chosen": -2.1076016426086426, "eval_logits/rejected": -2.0950677394866943, "eval_logps/chosen": -144.826171875, "eval_logps/rejected": -168.66868591308594, "eval_loss": 0.6382229924201965, "eval_rewards/accuracies": 0.6338289976119995, "eval_rewards/chosen": -0.8611428141593933, "eval_rewards/margins": 0.1937427520751953, "eval_rewards/rejected": -1.0548856258392334, "eval_runtime": 385.1278, "eval_samples_per_second": 11.176, "eval_steps_per_second": 1.397, "step": 16600 }, { "epoch": 2.8618194348725017, "grad_norm": 16.8627872467041, "learning_rate": 6.449330686281285e-10, "logits/chosen": -1.9572347402572632, "logits/rejected": -1.9240957498550415, "logps/chosen": -161.1675262451172, "logps/rejected": -194.1912384033203, "loss": 0.5876, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0665868520736694, "rewards/margins": 0.3441230356693268, "rewards/rejected": -1.4107099771499634, "step": 16610 }, { "epoch": 2.8635423845623706, "grad_norm": 17.6329345703125, "learning_rate": 6.289838140672521e-10, "logits/chosen": -1.9377361536026, "logits/rejected": -1.909722089767456, "logps/chosen": -157.73252868652344, "logps/rejected": -183.4493408203125, "loss": 0.6341, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.0056098699569702, "rewards/margins": 0.2806296646595001, "rewards/rejected": -1.2862396240234375, "step": 16620 }, { "epoch": 2.86526533425224, "grad_norm": 21.062896728515625, "learning_rate": 6.132330015419296e-10, "logits/chosen": -1.923837661743164, "logits/rejected": -1.8817894458770752, "logps/chosen": -166.37649536132812, "logps/rejected": -192.03622436523438, "loss": 0.6025, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0372685194015503, "rewards/margins": 0.36506786942481995, "rewards/rejected": -1.4023363590240479, "step": 16630 }, { "epoch": 2.866988283942109, "grad_norm": 24.25795555114746, "learning_rate": 5.97680694361019e-10, "logits/chosen": -1.9930473566055298, "logits/rejected": -1.9573158025741577, "logps/chosen": -167.95040893554688, "logps/rejected": -197.5785675048828, "loss": 0.605, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1174688339233398, "rewards/margins": 0.3424651324748993, "rewards/rejected": -1.4599339962005615, "step": 16640 }, { "epoch": 2.868711233631978, "grad_norm": 18.075307846069336, "learning_rate": 5.823269550355281e-10, "logits/chosen": -1.9361330270767212, "logits/rejected": -1.9201313257217407, "logps/chosen": -146.63934326171875, "logps/rejected": -182.35215759277344, "loss": 0.5718, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9200364351272583, "rewards/margins": 0.3836231231689453, "rewards/rejected": -1.3036595582962036, "step": 16650 }, { "epoch": 2.870434183321847, "grad_norm": 24.269166946411133, "learning_rate": 5.671718452783247e-10, "logits/chosen": -1.9986740350723267, "logits/rejected": -1.9809000492095947, "logps/chosen": -156.65380859375, "logps/rejected": -196.37850952148438, "loss": 0.5837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0288615226745605, "rewards/margins": 0.39067524671554565, "rewards/rejected": -1.4195367097854614, "step": 16660 }, { "epoch": 2.872157133011716, "grad_norm": 18.636051177978516, "learning_rate": 5.522154260039158e-10, "logits/chosen": -1.968778371810913, "logits/rejected": -1.9501502513885498, "logps/chosen": -148.11972045898438, "logps/rejected": -193.38015747070312, "loss": 0.5655, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9666731953620911, "rewards/margins": 0.43354344367980957, "rewards/rejected": -1.4002165794372559, "step": 16670 }, { "epoch": 2.873880082701585, "grad_norm": 20.88432502746582, "learning_rate": 5.374577573281746e-10, "logits/chosen": -1.898890495300293, "logits/rejected": -1.8853000402450562, "logps/chosen": -153.3147735595703, "logps/rejected": -180.6986846923828, "loss": 0.6232, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0441803932189941, "rewards/margins": 0.2746034562587738, "rewards/rejected": -1.3187835216522217, "step": 16680 }, { "epoch": 2.8756030323914543, "grad_norm": 16.627391815185547, "learning_rate": 5.228988985681416e-10, "logits/chosen": -2.0309925079345703, "logits/rejected": -2.0090973377227783, "logps/chosen": -154.4249725341797, "logps/rejected": -183.7175750732422, "loss": 0.6094, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0034997463226318, "rewards/margins": 0.29723650217056274, "rewards/rejected": -1.3007361888885498, "step": 16690 }, { "epoch": 2.8773259820813233, "grad_norm": 23.302265167236328, "learning_rate": 5.085389082417291e-10, "logits/chosen": -2.094181537628174, "logits/rejected": -2.070589065551758, "logps/chosen": -158.85147094726562, "logps/rejected": -185.85000610351562, "loss": 0.6271, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0497138500213623, "rewards/margins": 0.2754163444042206, "rewards/rejected": -1.3251302242279053, "step": 16700 }, { "epoch": 2.8773259820813233, "eval_logits/chosen": -2.107978582382202, "eval_logits/rejected": -2.0954272747039795, "eval_logps/chosen": -144.73019409179688, "eval_logps/rejected": -168.4876251220703, "eval_loss": 0.6385171413421631, "eval_rewards/accuracies": 0.633596658706665, "eval_rewards/chosen": -0.8601828813552856, "eval_rewards/margins": 0.1928921937942505, "eval_rewards/rejected": -1.0530751943588257, "eval_runtime": 384.4447, "eval_samples_per_second": 11.195, "eval_steps_per_second": 1.399, "step": 16700 }, { "epoch": 2.8790489317711923, "grad_norm": 23.203088760375977, "learning_rate": 4.943778440675451e-10, "logits/chosen": -2.0103933811187744, "logits/rejected": -1.9836218357086182, "logps/chosen": -155.67843627929688, "logps/rejected": -192.5723876953125, "loss": 0.5958, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0480306148529053, "rewards/margins": 0.367692232131958, "rewards/rejected": -1.4157229661941528, "step": 16710 }, { "epoch": 2.8807718814610612, "grad_norm": 19.339689254760742, "learning_rate": 4.804157629646144e-10, "logits/chosen": -1.9711185693740845, "logits/rejected": -1.9316680431365967, "logps/chosen": -155.4070281982422, "logps/rejected": -186.94088745117188, "loss": 0.5797, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0109169483184814, "rewards/margins": 0.36868447065353394, "rewards/rejected": -1.3796013593673706, "step": 16720 }, { "epoch": 2.8824948311509306, "grad_norm": 17.771820068359375, "learning_rate": 4.666527210521742e-10, "logits/chosen": -1.9407455921173096, "logits/rejected": -1.9092836380004883, "logps/chosen": -151.50054931640625, "logps/rejected": -188.8268585205078, "loss": 0.5904, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.995007336139679, "rewards/margins": 0.3717944025993347, "rewards/rejected": -1.3668017387390137, "step": 16730 }, { "epoch": 2.8842177808407996, "grad_norm": 19.596471786499023, "learning_rate": 4.53088773649446e-10, "logits/chosen": -1.9432376623153687, "logits/rejected": -1.9184081554412842, "logps/chosen": -158.84629821777344, "logps/rejected": -187.92034912109375, "loss": 0.6184, "rewards/accuracies": 0.625, "rewards/chosen": -1.0340230464935303, "rewards/margins": 0.3159448504447937, "rewards/rejected": -1.3499678373336792, "step": 16740 }, { "epoch": 2.8859407305306686, "grad_norm": 25.13460922241211, "learning_rate": 4.397239752754134e-10, "logits/chosen": -2.035068988800049, "logits/rejected": -2.0174400806427, "logps/chosen": -148.57412719726562, "logps/rejected": -193.9629669189453, "loss": 0.5571, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9474018216133118, "rewards/margins": 0.41335439682006836, "rewards/rejected": -1.360756278038025, "step": 16750 }, { "epoch": 2.8876636802205375, "grad_norm": 14.291999816894531, "learning_rate": 4.265583796485783e-10, "logits/chosen": -1.9213664531707764, "logits/rejected": -1.895361304283142, "logps/chosen": -164.57528686523438, "logps/rejected": -198.3388214111328, "loss": 0.5988, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0931850671768188, "rewards/margins": 0.36222729086875916, "rewards/rejected": -1.4554122686386108, "step": 16760 }, { "epoch": 2.8893866299104065, "grad_norm": 15.239450454711914, "learning_rate": 4.135920396867942e-10, "logits/chosen": -2.060210704803467, "logits/rejected": -2.01619291305542, "logps/chosen": -159.90028381347656, "logps/rejected": -189.2607879638672, "loss": 0.6015, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0336933135986328, "rewards/margins": 0.358843058347702, "rewards/rejected": -1.3925364017486572, "step": 16770 }, { "epoch": 2.8911095796002755, "grad_norm": 19.09183120727539, "learning_rate": 4.0082500750701076e-10, "logits/chosen": -1.9828227758407593, "logits/rejected": -1.9660279750823975, "logps/chosen": -151.65789794921875, "logps/rejected": -192.1558837890625, "loss": 0.5832, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9735589027404785, "rewards/margins": 0.37066230177879333, "rewards/rejected": -1.3442213535308838, "step": 16780 }, { "epoch": 2.892832529290145, "grad_norm": 19.560321807861328, "learning_rate": 3.8825733442507947e-10, "logits/chosen": -1.9794814586639404, "logits/rejected": -1.930318832397461, "logps/chosen": -160.43759155273438, "logps/rejected": -182.524658203125, "loss": 0.6186, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0288574695587158, "rewards/margins": 0.29153260588645935, "rewards/rejected": -1.3203901052474976, "step": 16790 }, { "epoch": 2.894555478980014, "grad_norm": 23.47073745727539, "learning_rate": 3.75889070955554e-10, "logits/chosen": -2.020232915878296, "logits/rejected": -2.004154682159424, "logps/chosen": -150.72743225097656, "logps/rejected": -190.80331420898438, "loss": 0.5918, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.016732096672058, "rewards/margins": 0.3744247853755951, "rewards/rejected": -1.391156792640686, "step": 16800 }, { "epoch": 2.894555478980014, "eval_logits/chosen": -2.107781171798706, "eval_logits/rejected": -2.095280170440674, "eval_logps/chosen": -144.85812377929688, "eval_logps/rejected": -168.63711547851562, "eval_loss": 0.6384297609329224, "eval_rewards/accuracies": 0.6331319808959961, "eval_rewards/chosen": -0.8614622950553894, "eval_rewards/margins": 0.19310744106769562, "eval_rewards/rejected": -1.0545697212219238, "eval_runtime": 384.7492, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 16800 }, { "epoch": 2.896278428669883, "grad_norm": 18.635826110839844, "learning_rate": 3.6372026681146806e-10, "logits/chosen": -1.9736328125, "logits/rejected": -1.9360759258270264, "logps/chosen": -146.03355407714844, "logps/rejected": -179.94650268554688, "loss": 0.5888, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9263818860054016, "rewards/margins": 0.37809523940086365, "rewards/rejected": -1.3044772148132324, "step": 16810 }, { "epoch": 2.898001378359752, "grad_norm": 15.33028507232666, "learning_rate": 3.517509709041688e-10, "logits/chosen": -2.0261218547821045, "logits/rejected": -1.9984285831451416, "logps/chosen": -157.83596801757812, "logps/rejected": -180.45556640625, "loss": 0.6177, "rewards/accuracies": 0.65625, "rewards/chosen": -1.026811122894287, "rewards/margins": 0.25640958547592163, "rewards/rejected": -1.2832207679748535, "step": 16820 }, { "epoch": 2.899724328049621, "grad_norm": 23.343008041381836, "learning_rate": 3.399812313430728e-10, "logits/chosen": -2.0888819694519043, "logits/rejected": -2.0597145557403564, "logps/chosen": -167.93612670898438, "logps/rejected": -210.1898651123047, "loss": 0.5766, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1374194622039795, "rewards/margins": 0.41082635521888733, "rewards/rejected": -1.5482457876205444, "step": 16830 }, { "epoch": 2.90144727773949, "grad_norm": 18.011272430419922, "learning_rate": 3.284110954355157e-10, "logits/chosen": -1.925396203994751, "logits/rejected": -1.8931325674057007, "logps/chosen": -150.8836212158203, "logps/rejected": -186.10324096679688, "loss": 0.5843, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9794955253601074, "rewards/margins": 0.37007173895835876, "rewards/rejected": -1.349567174911499, "step": 16840 }, { "epoch": 2.903170227429359, "grad_norm": 16.46031951904297, "learning_rate": 3.1704060968654746e-10, "logits/chosen": -1.99543035030365, "logits/rejected": -1.9652141332626343, "logps/chosen": -169.80191040039062, "logps/rejected": -188.7667999267578, "loss": 0.6379, "rewards/accuracies": 0.625, "rewards/chosen": -1.1454854011535645, "rewards/margins": 0.25200867652893066, "rewards/rejected": -1.3974940776824951, "step": 16850 }, { "epoch": 2.904893177119228, "grad_norm": 16.11234474182129, "learning_rate": 3.0586981979873747e-10, "logits/chosen": -1.9995830059051514, "logits/rejected": -1.9582990407943726, "logps/chosen": -160.1403045654297, "logps/rejected": -185.46664428710938, "loss": 0.6221, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0485565662384033, "rewards/margins": 0.29739388823509216, "rewards/rejected": -1.3459504842758179, "step": 16860 }, { "epoch": 2.906616126809097, "grad_norm": 19.668710708618164, "learning_rate": 2.9489877067199185e-10, "logits/chosen": -2.047384738922119, "logits/rejected": -2.0174801349639893, "logps/chosen": -155.99368286132812, "logps/rejected": -184.9731903076172, "loss": 0.6139, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0293080806732178, "rewards/margins": 0.31219157576560974, "rewards/rejected": -1.34149968624115, "step": 16870 }, { "epoch": 2.908339076498966, "grad_norm": 17.561004638671875, "learning_rate": 2.8412750640338654e-10, "logits/chosen": -2.070305347442627, "logits/rejected": -2.0443599224090576, "logps/chosen": -147.8803253173828, "logps/rejected": -195.07247924804688, "loss": 0.5673, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9889408946037292, "rewards/margins": 0.4403529167175293, "rewards/rejected": -1.4292938709259033, "step": 16880 }, { "epoch": 2.910062026188835, "grad_norm": 23.26585578918457, "learning_rate": 2.7355607028698437e-10, "logits/chosen": -1.977966070175171, "logits/rejected": -1.959275245666504, "logps/chosen": -157.8607177734375, "logps/rejected": -184.59408569335938, "loss": 0.6132, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.029338002204895, "rewards/margins": 0.2757313549518585, "rewards/rejected": -1.3050692081451416, "step": 16890 }, { "epoch": 2.9117849758787044, "grad_norm": 23.29181480407715, "learning_rate": 2.6318450481365164e-10, "logits/chosen": -2.037574052810669, "logits/rejected": -2.0149717330932617, "logps/chosen": -157.65554809570312, "logps/rejected": -189.41383361816406, "loss": 0.5885, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0444326400756836, "rewards/margins": 0.3486484885215759, "rewards/rejected": -1.3930811882019043, "step": 16900 }, { "epoch": 2.9117849758787044, "eval_logits/chosen": -2.1079790592193604, "eval_logits/rejected": -2.0954182147979736, "eval_logps/chosen": -144.69412231445312, "eval_logps/rejected": -168.51097106933594, "eval_loss": 0.6383413672447205, "eval_rewards/accuracies": 0.6331319808959961, "eval_rewards/chosen": -0.8598222732543945, "eval_rewards/margins": 0.19348619878292084, "eval_rewards/rejected": -1.053308367729187, "eval_runtime": 384.9157, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 16900 }, { "epoch": 2.9135079255685734, "grad_norm": 14.932369232177734, "learning_rate": 2.5301285167088624e-10, "logits/chosen": -2.061954975128174, "logits/rejected": -2.039252758026123, "logps/chosen": -155.24456787109375, "logps/rejected": -186.53692626953125, "loss": 0.6138, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0364755392074585, "rewards/margins": 0.29281336069107056, "rewards/rejected": -1.3292888402938843, "step": 16910 }, { "epoch": 2.9152308752584424, "grad_norm": 15.281264305114746, "learning_rate": 2.430411517426734e-10, "logits/chosen": -1.9630804061889648, "logits/rejected": -1.937941312789917, "logps/chosen": -156.692626953125, "logps/rejected": -184.55279541015625, "loss": 0.6198, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0195591449737549, "rewards/margins": 0.2863544821739197, "rewards/rejected": -1.3059136867523193, "step": 16920 }, { "epoch": 2.9169538249483113, "grad_norm": 13.958650588989258, "learning_rate": 2.332694451092965e-10, "logits/chosen": -2.0544562339782715, "logits/rejected": -2.030432939529419, "logps/chosen": -150.43984985351562, "logps/rejected": -179.63677978515625, "loss": 0.5988, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9417110681533813, "rewards/margins": 0.33717039227485657, "rewards/rejected": -1.2788814306259155, "step": 16930 }, { "epoch": 2.9186767746381808, "grad_norm": 15.688158988952637, "learning_rate": 2.2369777104718768e-10, "logits/chosen": -2.0281169414520264, "logits/rejected": -2.005841016769409, "logps/chosen": -155.18228149414062, "logps/rejected": -190.96182250976562, "loss": 0.5706, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9720762372016907, "rewards/margins": 0.38785520195961, "rewards/rejected": -1.3599315881729126, "step": 16940 }, { "epoch": 2.9203997243280497, "grad_norm": 17.963821411132812, "learning_rate": 2.143261680287667e-10, "logits/chosen": -1.9695152044296265, "logits/rejected": -1.9542038440704346, "logps/chosen": -149.31637573242188, "logps/rejected": -192.3895263671875, "loss": 0.5899, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9927176237106323, "rewards/margins": 0.3815767168998718, "rewards/rejected": -1.3742942810058594, "step": 16950 }, { "epoch": 2.9221226740179187, "grad_norm": 19.658464431762695, "learning_rate": 2.051546737222909e-10, "logits/chosen": -1.9931623935699463, "logits/rejected": -1.9664968252182007, "logps/chosen": -157.6920928955078, "logps/rejected": -203.2550811767578, "loss": 0.5611, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0260777473449707, "rewards/margins": 0.4539663791656494, "rewards/rejected": -1.4800441265106201, "step": 16960 }, { "epoch": 2.9238456237077877, "grad_norm": 20.83079719543457, "learning_rate": 1.9618332499169442e-10, "logits/chosen": -2.070127010345459, "logits/rejected": -2.0350441932678223, "logps/chosen": -161.13937377929688, "logps/rejected": -195.9274444580078, "loss": 0.5862, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0537161827087402, "rewards/margins": 0.3671458065509796, "rewards/rejected": -1.420862078666687, "step": 16970 }, { "epoch": 2.9255685733976566, "grad_norm": 16.276081085205078, "learning_rate": 1.8741215789644936e-10, "logits/chosen": -2.0622806549072266, "logits/rejected": -2.034605026245117, "logps/chosen": -151.7607421875, "logps/rejected": -183.93618774414062, "loss": 0.5904, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9817357063293457, "rewards/margins": 0.33096829056739807, "rewards/rejected": -1.312704086303711, "step": 16980 }, { "epoch": 2.9272915230875256, "grad_norm": 13.918219566345215, "learning_rate": 1.7884120769141032e-10, "logits/chosen": -2.098088502883911, "logits/rejected": -2.064545154571533, "logps/chosen": -142.64236450195312, "logps/rejected": -184.80264282226562, "loss": 0.5617, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8964022397994995, "rewards/margins": 0.4264097809791565, "rewards/rejected": -1.3228119611740112, "step": 16990 }, { "epoch": 2.929014472777395, "grad_norm": 23.878826141357422, "learning_rate": 1.7047050882669223e-10, "logits/chosen": -2.075836181640625, "logits/rejected": -2.0509674549102783, "logps/chosen": -155.60653686523438, "logps/rejected": -184.17105102539062, "loss": 0.6058, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.999384880065918, "rewards/margins": 0.3138217031955719, "rewards/rejected": -1.3132065534591675, "step": 17000 }, { "epoch": 2.929014472777395, "eval_logits/chosen": -2.107468605041504, "eval_logits/rejected": -2.0948989391326904, "eval_logps/chosen": -144.85874938964844, "eval_logps/rejected": -168.65318298339844, "eval_loss": 0.6383748650550842, "eval_rewards/accuracies": 0.6331319808959961, "eval_rewards/chosen": -0.8614687323570251, "eval_rewards/margins": 0.19326195120811462, "eval_rewards/rejected": -1.054730772972107, "eval_runtime": 385.2458, "eval_samples_per_second": 11.172, "eval_steps_per_second": 1.397, "step": 17000 }, { "epoch": 2.930737422467264, "grad_norm": 16.489273071289062, "learning_rate": 1.623000949475095e-10, "logits/chosen": -1.976637840270996, "logits/rejected": -1.9516456127166748, "logps/chosen": -159.58668518066406, "logps/rejected": -200.84808349609375, "loss": 0.5799, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0665761232376099, "rewards/margins": 0.3951249420642853, "rewards/rejected": -1.4617010354995728, "step": 17010 }, { "epoch": 2.932460372157133, "grad_norm": 23.817426681518555, "learning_rate": 1.5432999889404274e-10, "logits/chosen": -1.9588006734848022, "logits/rejected": -1.9636266231536865, "logps/chosen": -163.51686096191406, "logps/rejected": -182.13247680664062, "loss": 0.6776, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1013703346252441, "rewards/margins": 0.1832554042339325, "rewards/rejected": -1.284625768661499, "step": 17020 }, { "epoch": 2.934183321847002, "grad_norm": 21.267778396606445, "learning_rate": 1.4656025270133876e-10, "logits/chosen": -2.0164945125579834, "logits/rejected": -1.9917453527450562, "logps/chosen": -162.5743865966797, "logps/rejected": -185.87351989746094, "loss": 0.6272, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0608112812042236, "rewards/margins": 0.2683561444282532, "rewards/rejected": -1.329167366027832, "step": 17030 }, { "epoch": 2.9359062715368713, "grad_norm": 16.53522300720215, "learning_rate": 1.3899088759913302e-10, "logits/chosen": -2.061279058456421, "logits/rejected": -2.031122922897339, "logps/chosen": -163.31857299804688, "logps/rejected": -192.8009033203125, "loss": 0.5969, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0746561288833618, "rewards/margins": 0.3415234684944153, "rewards/rejected": -1.4161796569824219, "step": 17040 }, { "epoch": 2.9376292212267403, "grad_norm": 19.416257858276367, "learning_rate": 1.316219340117608e-10, "logits/chosen": -2.0140838623046875, "logits/rejected": -1.9901149272918701, "logps/chosen": -161.71713256835938, "logps/rejected": -186.2438507080078, "loss": 0.6274, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0627894401550293, "rewards/margins": 0.26462146639823914, "rewards/rejected": -1.3274109363555908, "step": 17050 }, { "epoch": 2.9393521709166093, "grad_norm": 20.334020614624023, "learning_rate": 1.2445342155801842e-10, "logits/chosen": -2.0584394931793213, "logits/rejected": -2.0244102478027344, "logps/chosen": -161.05813598632812, "logps/rejected": -191.13429260253906, "loss": 0.617, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.066583514213562, "rewards/margins": 0.2954621911048889, "rewards/rejected": -1.3620457649230957, "step": 17060 }, { "epoch": 2.9410751206064782, "grad_norm": 18.956140518188477, "learning_rate": 1.1748537905105217e-10, "logits/chosen": -1.9815282821655273, "logits/rejected": -1.955674171447754, "logps/chosen": -162.58248901367188, "logps/rejected": -182.99600219726562, "loss": 0.6428, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0696430206298828, "rewards/margins": 0.24429316818714142, "rewards/rejected": -1.3139361143112183, "step": 17070 }, { "epoch": 2.942798070296347, "grad_norm": 15.89017391204834, "learning_rate": 1.1071783449823624e-10, "logits/chosen": -1.9061216115951538, "logits/rejected": -1.8757600784301758, "logps/chosen": -154.84930419921875, "logps/rejected": -202.68748474121094, "loss": 0.55, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0163583755493164, "rewards/margins": 0.4665466845035553, "rewards/rejected": -1.4829050302505493, "step": 17080 }, { "epoch": 2.944521019986216, "grad_norm": 17.138473510742188, "learning_rate": 1.0415081510106172e-10, "logits/chosen": -1.9880319833755493, "logits/rejected": -1.9613933563232422, "logps/chosen": -156.0173797607422, "logps/rejected": -196.33090209960938, "loss": 0.5735, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0303047895431519, "rewards/margins": 0.4031950831413269, "rewards/rejected": -1.4334999322891235, "step": 17090 }, { "epoch": 2.9462439696760856, "grad_norm": 16.138994216918945, "learning_rate": 9.778434725503105e-11, "logits/chosen": -1.924480676651001, "logits/rejected": -1.8959500789642334, "logps/chosen": -163.9532470703125, "logps/rejected": -196.26712036132812, "loss": 0.5841, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.076259970664978, "rewards/margins": 0.3590271770954132, "rewards/rejected": -1.4352871179580688, "step": 17100 }, { "epoch": 2.9462439696760856, "eval_logits/chosen": -2.1081507205963135, "eval_logits/rejected": -2.0956053733825684, "eval_logps/chosen": -144.7006072998047, "eval_logps/rejected": -168.4869842529297, "eval_loss": 0.6384033560752869, "eval_rewards/accuracies": 0.6322026252746582, "eval_rewards/chosen": -0.8598871231079102, "eval_rewards/margins": 0.19318149983882904, "eval_rewards/rejected": -1.05306875705719, "eval_runtime": 384.7293, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 17100 }, { "epoch": 2.9479669193659546, "grad_norm": 20.479286193847656, "learning_rate": 9.161845654954703e-11, "logits/chosen": -1.9812736511230469, "logits/rejected": -1.9665441513061523, "logps/chosen": -167.9789276123047, "logps/rejected": -200.39198303222656, "loss": 0.602, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1308027505874634, "rewards/margins": 0.33174437284469604, "rewards/rejected": -1.4625470638275146, "step": 17110 }, { "epoch": 2.9496898690558235, "grad_norm": 16.917028427124023, "learning_rate": 8.565316776780739e-11, "logits/chosen": -2.0517094135284424, "logits/rejected": -2.0181403160095215, "logps/chosen": -152.10348510742188, "logps/rejected": -189.8842315673828, "loss": 0.5571, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0014479160308838, "rewards/margins": 0.40886181592941284, "rewards/rejected": -1.410309910774231, "step": 17120 }, { "epoch": 2.9514128187456925, "grad_norm": 21.397668838500977, "learning_rate": 7.988850488672705e-11, "logits/chosen": -1.9405648708343506, "logits/rejected": -1.9045183658599854, "logps/chosen": -159.02334594726562, "logps/rejected": -192.76695251464844, "loss": 0.585, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0441601276397705, "rewards/margins": 0.34977632761001587, "rewards/rejected": -1.3939363956451416, "step": 17130 }, { "epoch": 2.953135768435562, "grad_norm": 17.573646545410156, "learning_rate": 7.432449107679928e-11, "logits/chosen": -1.9517736434936523, "logits/rejected": -1.9247581958770752, "logps/chosen": -147.04617309570312, "logps/rejected": -187.03465270996094, "loss": 0.5578, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9340187907218933, "rewards/margins": 0.4135506749153137, "rewards/rejected": -1.347569465637207, "step": 17140 }, { "epoch": 2.954858718125431, "grad_norm": 18.89002799987793, "learning_rate": 6.896114870204583e-11, "logits/chosen": -2.070183277130127, "logits/rejected": -2.036158800125122, "logps/chosen": -160.6278533935547, "logps/rejected": -194.49307250976562, "loss": 0.6017, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0642752647399902, "rewards/margins": 0.35668569803237915, "rewards/rejected": -1.420960783958435, "step": 17150 }, { "epoch": 2.9565816678153, "grad_norm": 16.096742630004883, "learning_rate": 6.379849931990034e-11, "logits/chosen": -2.0860087871551514, "logits/rejected": -2.0577282905578613, "logps/chosen": -145.84942626953125, "logps/rejected": -207.4510040283203, "loss": 0.5093, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9580782651901245, "rewards/margins": 0.5999714136123657, "rewards/rejected": -1.5580496788024902, "step": 17160 }, { "epoch": 2.958304617505169, "grad_norm": 15.07766342163086, "learning_rate": 5.883656368114164e-11, "logits/chosen": -2.1240220069885254, "logits/rejected": -2.0937812328338623, "logps/chosen": -174.01303100585938, "logps/rejected": -200.17124938964844, "loss": 0.6415, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1489039659500122, "rewards/margins": 0.27626457810401917, "rewards/rejected": -1.425168514251709, "step": 17170 }, { "epoch": 2.960027567195038, "grad_norm": 17.897275924682617, "learning_rate": 5.407536172978844e-11, "logits/chosen": -2.0208230018615723, "logits/rejected": -1.9964879751205444, "logps/chosen": -159.231689453125, "logps/rejected": -192.90452575683594, "loss": 0.6045, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0813618898391724, "rewards/margins": 0.3416813015937805, "rewards/rejected": -1.4230432510375977, "step": 17180 }, { "epoch": 2.9617505168849068, "grad_norm": 19.49738883972168, "learning_rate": 4.951491260302698e-11, "logits/chosen": -1.9870989322662354, "logits/rejected": -1.9674804210662842, "logps/chosen": -148.41281127929688, "logps/rejected": -181.7845916748047, "loss": 0.5944, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9708064794540405, "rewards/margins": 0.3224068284034729, "rewards/rejected": -1.2932134866714478, "step": 17190 }, { "epoch": 2.963473466574776, "grad_norm": 19.9754638671875, "learning_rate": 4.515523463115012e-11, "logits/chosen": -2.0037624835968018, "logits/rejected": -1.9997937679290771, "logps/chosen": -154.29080200195312, "logps/rejected": -183.92770385742188, "loss": 0.6214, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9983419179916382, "rewards/margins": 0.2957645058631897, "rewards/rejected": -1.2941064834594727, "step": 17200 }, { "epoch": 2.963473466574776, "eval_logits/chosen": -2.1081020832061768, "eval_logits/rejected": -2.0955216884613037, "eval_logps/chosen": -144.7975616455078, "eval_logps/rejected": -168.5644989013672, "eval_loss": 0.6384665369987488, "eval_rewards/accuracies": 0.634061336517334, "eval_rewards/chosen": -0.8608567714691162, "eval_rewards/margins": 0.19298723340034485, "eval_rewards/rejected": -1.0538440942764282, "eval_runtime": 384.9355, "eval_samples_per_second": 11.181, "eval_steps_per_second": 1.398, "step": 17200 }, { "epoch": 2.965196416264645, "grad_norm": 24.927173614501953, "learning_rate": 4.099634533745733e-11, "logits/chosen": -1.9700489044189453, "logits/rejected": -1.9526208639144897, "logps/chosen": -159.6385040283203, "logps/rejected": -200.23995971679688, "loss": 0.5852, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0739831924438477, "rewards/margins": 0.3859753906726837, "rewards/rejected": -1.4599586725234985, "step": 17210 }, { "epoch": 2.966919365954514, "grad_norm": 15.874503135681152, "learning_rate": 3.7038261438204765e-11, "logits/chosen": -1.9668464660644531, "logits/rejected": -1.9534298181533813, "logps/chosen": -161.64364624023438, "logps/rejected": -211.6537628173828, "loss": 0.5667, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0618740320205688, "rewards/margins": 0.4875417649745941, "rewards/rejected": -1.5494158267974854, "step": 17220 }, { "epoch": 2.968642315644383, "grad_norm": 16.003583908081055, "learning_rate": 3.3280998842527554e-11, "logits/chosen": -2.0272841453552246, "logits/rejected": -2.0240020751953125, "logps/chosen": -157.9180908203125, "logps/rejected": -192.9403839111328, "loss": 0.6067, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0556585788726807, "rewards/margins": 0.3211142420768738, "rewards/rejected": -1.3767727613449097, "step": 17230 }, { "epoch": 2.9703652653342525, "grad_norm": 17.43435287475586, "learning_rate": 2.972457265237871e-11, "logits/chosen": -1.9952207803726196, "logits/rejected": -1.98117196559906, "logps/chosen": -150.1788787841797, "logps/rejected": -182.0748291015625, "loss": 0.6044, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9629311561584473, "rewards/margins": 0.3097112774848938, "rewards/rejected": -1.2726423740386963, "step": 17240 }, { "epoch": 2.9720882150241215, "grad_norm": 19.4127254486084, "learning_rate": 2.6368997162479202e-11, "logits/chosen": -1.924861192703247, "logits/rejected": -1.9038633108139038, "logps/chosen": -154.8731231689453, "logps/rejected": -194.99069213867188, "loss": 0.5656, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0260493755340576, "rewards/margins": 0.3980260491371155, "rewards/rejected": -1.4240756034851074, "step": 17250 }, { "epoch": 2.9738111647139904, "grad_norm": 18.594249725341797, "learning_rate": 2.321428586022911e-11, "logits/chosen": -1.9672596454620361, "logits/rejected": -1.9421987533569336, "logps/chosen": -155.21603393554688, "logps/rejected": -192.7557373046875, "loss": 0.5733, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9993821978569031, "rewards/margins": 0.4022952616214752, "rewards/rejected": -1.4016773700714111, "step": 17260 }, { "epoch": 2.9755341144038594, "grad_norm": 16.317119598388672, "learning_rate": 2.0260451425690994e-11, "logits/chosen": -1.9887855052947998, "logits/rejected": -1.9616997241973877, "logps/chosen": -164.8186492919922, "logps/rejected": -189.7857666015625, "loss": 0.6478, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0915124416351318, "rewards/margins": 0.26181191205978394, "rewards/rejected": -1.3533244132995605, "step": 17270 }, { "epoch": 2.9772570640937284, "grad_norm": 17.891714096069336, "learning_rate": 1.7507505731523266e-11, "logits/chosen": -1.9692672491073608, "logits/rejected": -1.9360361099243164, "logps/chosen": -153.45986938476562, "logps/rejected": -190.44488525390625, "loss": 0.5758, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9893794059753418, "rewards/margins": 0.3972127437591553, "rewards/rejected": -1.386592149734497, "step": 17280 }, { "epoch": 2.9789800137835973, "grad_norm": 19.996606826782227, "learning_rate": 1.4955459842913576e-11, "logits/chosen": -2.0136358737945557, "logits/rejected": -1.988731026649475, "logps/chosen": -163.03054809570312, "logps/rejected": -199.68728637695312, "loss": 0.5723, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.03960382938385, "rewards/margins": 0.3829970359802246, "rewards/rejected": -1.4226009845733643, "step": 17290 }, { "epoch": 2.9807029634734663, "grad_norm": 21.40375518798828, "learning_rate": 1.2604324017573276e-11, "logits/chosen": -2.0269322395324707, "logits/rejected": -2.0076136589050293, "logps/chosen": -157.98045349121094, "logps/rejected": -191.09596252441406, "loss": 0.5905, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.027301549911499, "rewards/margins": 0.3616834878921509, "rewards/rejected": -1.3889849185943604, "step": 17300 }, { "epoch": 2.9807029634734663, "eval_logits/chosen": -2.107619524002075, "eval_logits/rejected": -2.095050096511841, "eval_logps/chosen": -144.81857299804688, "eval_logps/rejected": -168.5945281982422, "eval_loss": 0.6384586691856384, "eval_rewards/accuracies": 0.6326673030853271, "eval_rewards/chosen": -0.8610668778419495, "eval_rewards/margins": 0.1930774748325348, "eval_rewards/rejected": -1.054144263267517, "eval_runtime": 384.7906, "eval_samples_per_second": 11.185, "eval_steps_per_second": 1.398, "step": 17300 }, { "epoch": 2.9824259131633357, "grad_norm": 21.375167846679688, "learning_rate": 1.0454107705665238e-11, "logits/chosen": -2.006045341491699, "logits/rejected": -1.9824981689453125, "logps/chosen": -157.21923828125, "logps/rejected": -193.86965942382812, "loss": 0.5713, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0215353965759277, "rewards/margins": 0.3951588571071625, "rewards/rejected": -1.4166942834854126, "step": 17310 }, { "epoch": 2.9841488628532047, "grad_norm": 15.415755271911621, "learning_rate": 8.504819549770559e-12, "logits/chosen": -2.0083327293395996, "logits/rejected": -1.9831466674804688, "logps/chosen": -160.80548095703125, "logps/rejected": -189.94680786132812, "loss": 0.6067, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0678905248641968, "rewards/margins": 0.31099414825439453, "rewards/rejected": -1.3788847923278809, "step": 17320 }, { "epoch": 2.9858718125430737, "grad_norm": 17.275476455688477, "learning_rate": 6.7564673848719e-12, "logits/chosen": -1.9689480066299438, "logits/rejected": -1.9437649250030518, "logps/chosen": -156.7157745361328, "logps/rejected": -186.25534057617188, "loss": 0.5982, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0146048069000244, "rewards/margins": 0.33314546942710876, "rewards/rejected": -1.3477500677108765, "step": 17330 }, { "epoch": 2.987594762232943, "grad_norm": 14.532931327819824, "learning_rate": 5.2090582382924295e-12, "logits/chosen": -2.0598437786102295, "logits/rejected": -2.024522542953491, "logps/chosen": -144.4849090576172, "logps/rejected": -186.29039001464844, "loss": 0.5577, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9200469851493835, "rewards/margins": 0.42522692680358887, "rewards/rejected": -1.3452739715576172, "step": 17340 }, { "epoch": 2.989317711922812, "grad_norm": 24.645767211914062, "learning_rate": 3.8625983297069234e-12, "logits/chosen": -2.028348922729492, "logits/rejected": -1.991878867149353, "logps/chosen": -148.47042846679688, "logps/rejected": -182.27371215820312, "loss": 0.5889, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9489768147468567, "rewards/margins": 0.3502461016178131, "rewards/rejected": -1.2992229461669922, "step": 17350 }, { "epoch": 2.991040661612681, "grad_norm": 18.422821044921875, "learning_rate": 2.7170930710695983e-12, "logits/chosen": -2.075244426727295, "logits/rejected": -2.0619418621063232, "logps/chosen": -153.32546997070312, "logps/rejected": -195.39002990722656, "loss": 0.5642, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9891951680183411, "rewards/margins": 0.4152054786682129, "rewards/rejected": -1.4044005870819092, "step": 17360 }, { "epoch": 2.99276361130255, "grad_norm": 18.606843948364258, "learning_rate": 1.7725470666363208e-12, "logits/chosen": -2.009625196456909, "logits/rejected": -1.9910959005355835, "logps/chosen": -153.252197265625, "logps/rejected": -186.7181854248047, "loss": 0.6095, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0108263492584229, "rewards/margins": 0.3082529604434967, "rewards/rejected": -1.3190791606903076, "step": 17370 }, { "epoch": 2.994486560992419, "grad_norm": 25.723548889160156, "learning_rate": 1.0289641129146431e-12, "logits/chosen": -1.9877243041992188, "logits/rejected": -1.9729607105255127, "logps/chosen": -159.61920166015625, "logps/rejected": -191.86083984375, "loss": 0.6148, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0662343502044678, "rewards/margins": 0.30018362402915955, "rewards/rejected": -1.3664178848266602, "step": 17380 }, { "epoch": 2.996209510682288, "grad_norm": 20.49443244934082, "learning_rate": 4.863471986693568e-13, "logits/chosen": -2.036599636077881, "logits/rejected": -2.0125949382781982, "logps/chosen": -153.33013916015625, "logps/rejected": -185.20269775390625, "loss": 0.6027, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9938996434211731, "rewards/margins": 0.32384979724884033, "rewards/rejected": -1.3177493810653687, "step": 17390 }, { "epoch": 2.997932460372157, "grad_norm": 15.323660850524902, "learning_rate": 1.4469850488918467e-13, "logits/chosen": -1.994148850440979, "logits/rejected": -1.9644842147827148, "logps/chosen": -161.00686645507812, "logps/rejected": -194.1676483154297, "loss": 0.5878, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0578213930130005, "rewards/margins": 0.38855668902397156, "rewards/rejected": -1.4463779926300049, "step": 17400 }, { "epoch": 2.997932460372157, "eval_logits/chosen": -2.107677698135376, "eval_logits/rejected": -2.095085382461548, "eval_logps/chosen": -144.84811401367188, "eval_logps/rejected": -168.68983459472656, "eval_loss": 0.6382296681404114, "eval_rewards/accuracies": 0.634061336517334, "eval_rewards/chosen": -0.8613619208335876, "eval_rewards/margins": 0.1937350332736969, "eval_rewards/rejected": -1.0550971031188965, "eval_runtime": 384.9065, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 17400 }, { "epoch": 2.9996554100620263, "grad_norm": 24.605453491210938, "learning_rate": 4.019404797883652e-15, "logits/chosen": -1.9687122106552124, "logits/rejected": -1.9464643001556396, "logps/chosen": -154.16351318359375, "logps/rejected": -200.9946746826172, "loss": 0.5456, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0015041828155518, "rewards/margins": 0.45887383818626404, "rewards/rejected": -1.460377812385559, "step": 17410 }, { "epoch": 3.0, "step": 17412, "total_flos": 0.0, "train_loss": 0.6290887609490465, "train_runtime": 142032.5586, "train_samples_per_second": 1.961, "train_steps_per_second": 0.123 } ], "logging_steps": 10, "max_steps": 17412, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }