{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 5733, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 8.710801393728223e-10, "logits/chosen": 0.3170108497142792, "logits/rejected": 0.35767874121665955, "logps/chosen": -271.9781494140625, "logps/rejected": -165.8260955810547, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 8.710801393728223e-09, "logits/chosen": 0.34384429454803467, "logits/rejected": 0.2845779359340668, "logps/chosen": -362.0292053222656, "logps/rejected": -280.8023376464844, "loss": 0.693, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.0007958578644320369, "rewards/margins": 0.0013568435097113252, "rewards/rejected": -0.0005609856452792883, "step": 10 }, { "epoch": 0.01, "learning_rate": 1.7421602787456446e-08, "logits/chosen": 0.41451185941696167, "logits/rejected": 0.33514469861984253, "logps/chosen": -354.496337890625, "logps/rejected": -268.55401611328125, "loss": 0.6932, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0009157696040347219, "rewards/margins": 0.0009497471037320793, "rewards/rejected": -3.397750697331503e-05, "step": 20 }, { "epoch": 0.02, "learning_rate": 2.6132404181184667e-08, "logits/chosen": 0.3903641700744629, "logits/rejected": 0.32494959235191345, "logps/chosen": -281.6260986328125, "logps/rejected": -235.763427734375, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.0007924304227344692, "rewards/margins": 9.540463361190632e-05, "rewards/rejected": 0.0006970257963985205, "step": 30 }, { "epoch": 0.02, "learning_rate": 3.484320557491289e-08, "logits/chosen": 0.36575648188591003, "logits/rejected": 0.4160943925380707, "logps/chosen": -225.0307159423828, "logps/rejected": -243.137451171875, "loss": 0.6933, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0002304415247635916, "rewards/margins": -0.000740631076041609, "rewards/rejected": 0.0005101895658299327, "step": 40 }, { "epoch": 0.03, "learning_rate": 4.355400696864111e-08, "logits/chosen": 0.37944719195365906, "logits/rejected": 0.37457937002182007, "logps/chosen": -354.24530029296875, "logps/rejected": -278.2007141113281, "loss": 0.693, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0003198208869434893, "rewards/margins": -0.0002704802027437836, "rewards/rejected": -4.93407242174726e-05, "step": 50 }, { "epoch": 0.03, "learning_rate": 5.2264808362369334e-08, "logits/chosen": 0.2979043424129486, "logits/rejected": 0.2968657910823822, "logps/chosen": -308.49493408203125, "logps/rejected": -271.253662109375, "loss": 0.6933, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0003536358126439154, "rewards/margins": -0.0003051784005947411, "rewards/rejected": -4.8457364755449817e-05, "step": 60 }, { "epoch": 0.04, "learning_rate": 6.097560975609756e-08, "logits/chosen": 0.3355613946914673, "logits/rejected": 0.3047105073928833, "logps/chosen": -308.9933166503906, "logps/rejected": -264.9416198730469, "loss": 0.6929, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0006061201565898955, "rewards/margins": -0.0004816774744540453, "rewards/rejected": -0.00012444249296095222, "step": 70 }, { "epoch": 0.04, "learning_rate": 6.968641114982578e-08, "logits/chosen": 0.4044817090034485, "logits/rejected": 0.3871970772743225, "logps/chosen": -311.4313049316406, "logps/rejected": -269.52813720703125, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.000310670118778944, "rewards/margins": 0.00034131400752812624, "rewards/rejected": -3.064396514673717e-05, "step": 80 }, { "epoch": 0.05, "learning_rate": 7.8397212543554e-08, "logits/chosen": 0.33987581729888916, "logits/rejected": 0.37897253036499023, "logps/chosen": -255.73159790039062, "logps/rejected": -286.35845947265625, "loss": 0.6929, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -9.91291381069459e-05, "rewards/margins": 3.401337380637415e-05, "rewards/rejected": -0.00013314261741470546, "step": 90 }, { "epoch": 0.05, "learning_rate": 8.710801393728223e-08, "logits/chosen": 0.32025259733200073, "logits/rejected": 0.31265729665756226, "logps/chosen": -308.8668518066406, "logps/rejected": -258.37359619140625, "loss": 0.6932, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0013080101925879717, "rewards/margins": 0.0005450951284728944, "rewards/rejected": 0.0007629150059074163, "step": 100 }, { "epoch": 0.06, "learning_rate": 9.581881533101045e-08, "logits/chosen": 0.3391476273536682, "logits/rejected": 0.3699817955493927, "logps/chosen": -294.4396057128906, "logps/rejected": -265.50604248046875, "loss": 0.6932, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0004913022858090699, "rewards/margins": -0.0006068542716093361, "rewards/rejected": 0.00011555196397239342, "step": 110 }, { "epoch": 0.06, "learning_rate": 1.0452961672473867e-07, "logits/chosen": 0.2914244830608368, "logits/rejected": 0.2608393430709839, "logps/chosen": -322.7389831542969, "logps/rejected": -296.3192138671875, "loss": 0.6931, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0007188282324932516, "rewards/margins": -0.000560021901037544, "rewards/rejected": -0.00015880633145570755, "step": 120 }, { "epoch": 0.07, "learning_rate": 1.132404181184669e-07, "logits/chosen": 0.2627529799938202, "logits/rejected": 0.21750828623771667, "logps/chosen": -391.84979248046875, "logps/rejected": -304.40533447265625, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 5.764817979070358e-05, "rewards/margins": 0.0006713207112625241, "rewards/rejected": -0.0006136724841780961, "step": 130 }, { "epoch": 0.07, "learning_rate": 1.219512195121951e-07, "logits/chosen": 0.3835189938545227, "logits/rejected": 0.2935212552547455, "logps/chosen": -296.1155700683594, "logps/rejected": -293.04443359375, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0009465150651521981, "rewards/margins": 0.0007651118794456124, "rewards/rejected": 0.0001814032148104161, "step": 140 }, { "epoch": 0.08, "learning_rate": 1.3066202090592334e-07, "logits/chosen": 0.3177749514579773, "logits/rejected": 0.2864537835121155, "logps/chosen": -346.4631652832031, "logps/rejected": -260.0641784667969, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005447001312859356, "rewards/margins": -0.00026119090034626424, "rewards/rejected": -0.00028350926004350185, "step": 150 }, { "epoch": 0.08, "learning_rate": 1.3937282229965157e-07, "logits/chosen": 0.32583457231521606, "logits/rejected": 0.27274638414382935, "logps/chosen": -319.1077575683594, "logps/rejected": -266.54486083984375, "loss": 0.693, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.0006812402862124145, "rewards/margins": -0.0006552303093485534, "rewards/rejected": -2.6010035071521997e-05, "step": 160 }, { "epoch": 0.09, "learning_rate": 1.480836236933798e-07, "logits/chosen": 0.2908182144165039, "logits/rejected": 0.27819719910621643, "logps/chosen": -338.5760498046875, "logps/rejected": -307.96539306640625, "loss": 0.6927, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -1.5355797586380504e-05, "rewards/margins": -0.0009833310032263398, "rewards/rejected": 0.0009679750655777752, "step": 170 }, { "epoch": 0.09, "learning_rate": 1.56794425087108e-07, "logits/chosen": 0.44243597984313965, "logits/rejected": 0.3705524206161499, "logps/chosen": -322.1753234863281, "logps/rejected": -295.217529296875, "loss": 0.6928, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0010572379687801003, "rewards/margins": 0.0020315528381615877, "rewards/rejected": -0.000974315102212131, "step": 180 }, { "epoch": 0.1, "learning_rate": 1.6550522648083622e-07, "logits/chosen": 0.47350215911865234, "logits/rejected": 0.40756964683532715, "logps/chosen": -276.59906005859375, "logps/rejected": -268.8476867675781, "loss": 0.6927, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0002765903191175312, "rewards/margins": 0.00016240756667684764, "rewards/rejected": 0.00011418270878493786, "step": 190 }, { "epoch": 0.1, "learning_rate": 1.7421602787456445e-07, "logits/chosen": 0.3433852195739746, "logits/rejected": 0.2721015214920044, "logps/chosen": -349.0690002441406, "logps/rejected": -322.2076416015625, "loss": 0.6927, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.002144585596397519, "rewards/margins": 0.0025809439830482006, "rewards/rejected": -0.0004363584448583424, "step": 200 }, { "epoch": 0.11, "learning_rate": 1.8292682926829268e-07, "logits/chosen": 0.3642790913581848, "logits/rejected": 0.4311489164829254, "logps/chosen": -271.95513916015625, "logps/rejected": -248.6610870361328, "loss": 0.6929, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00040192221058532596, "rewards/margins": -0.000289176736259833, "rewards/rejected": -0.00011274554708506912, "step": 210 }, { "epoch": 0.12, "learning_rate": 1.916376306620209e-07, "logits/chosen": 0.3174039125442505, "logits/rejected": 0.36144882440567017, "logps/chosen": -295.30340576171875, "logps/rejected": -248.4581756591797, "loss": 0.6928, "rewards/accuracies": 0.625, "rewards/chosen": 0.00016610624152235687, "rewards/margins": 0.0011444597039371729, "rewards/rejected": -0.0009783534333109856, "step": 220 }, { "epoch": 0.12, "learning_rate": 2.003484320557491e-07, "logits/chosen": 0.33461135625839233, "logits/rejected": 0.2857970595359802, "logps/chosen": -317.2145690917969, "logps/rejected": -250.4669952392578, "loss": 0.6928, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0003413598460610956, "rewards/margins": 0.0002269690448883921, "rewards/rejected": 0.00011439078662078828, "step": 230 }, { "epoch": 0.13, "learning_rate": 2.0905923344947734e-07, "logits/chosen": 0.3559264540672302, "logits/rejected": 0.3971884846687317, "logps/chosen": -249.724365234375, "logps/rejected": -211.0904541015625, "loss": 0.6926, "rewards/accuracies": 0.625, "rewards/chosen": 0.0007045454694889486, "rewards/margins": 0.0008303613285534084, "rewards/rejected": -0.00012581582996062934, "step": 240 }, { "epoch": 0.13, "learning_rate": 2.1777003484320556e-07, "logits/chosen": 0.2798416018486023, "logits/rejected": 0.30008456110954285, "logps/chosen": -309.72613525390625, "logps/rejected": -267.5157470703125, "loss": 0.693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0006688380381092429, "rewards/margins": 0.0011650488013401628, "rewards/rejected": -0.0004962105304002762, "step": 250 }, { "epoch": 0.14, "learning_rate": 2.264808362369338e-07, "logits/chosen": 0.32978278398513794, "logits/rejected": 0.3426227867603302, "logps/chosen": -377.54766845703125, "logps/rejected": -277.21502685546875, "loss": 0.6925, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.001659739762544632, "rewards/margins": 0.0014490484027191997, "rewards/rejected": 0.00021069117065053433, "step": 260 }, { "epoch": 0.14, "learning_rate": 2.3519163763066202e-07, "logits/chosen": 0.2989436984062195, "logits/rejected": 0.3548172414302826, "logps/chosen": -225.9089813232422, "logps/rejected": -210.8122100830078, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": 0.00030976219568401575, "rewards/margins": 0.00035605434095487, "rewards/rejected": -4.629206523532048e-05, "step": 270 }, { "epoch": 0.15, "learning_rate": 2.439024390243902e-07, "logits/chosen": 0.30432480573654175, "logits/rejected": 0.3280238211154938, "logps/chosen": -294.1086730957031, "logps/rejected": -242.9200897216797, "loss": 0.6928, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0018314539920538664, "rewards/margins": 0.0012845260789617896, "rewards/rejected": 0.0005469276802614331, "step": 280 }, { "epoch": 0.15, "learning_rate": 2.526132404181184e-07, "logits/chosen": 0.3513438105583191, "logits/rejected": 0.3013264238834381, "logps/chosen": -258.6016540527344, "logps/rejected": -270.3151550292969, "loss": 0.6927, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0009280891390517354, "rewards/margins": 0.0018027920741587877, "rewards/rejected": -0.0008747029933147132, "step": 290 }, { "epoch": 0.16, "learning_rate": 2.613240418118467e-07, "logits/chosen": 0.361990362405777, "logits/rejected": 0.36707669496536255, "logps/chosen": -347.64947509765625, "logps/rejected": -320.00689697265625, "loss": 0.6924, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0015204499941319227, "rewards/margins": 0.0014906801516190171, "rewards/rejected": 2.9769911634502932e-05, "step": 300 }, { "epoch": 0.16, "learning_rate": 2.700348432055749e-07, "logits/chosen": 0.3724101483821869, "logits/rejected": 0.3676129877567291, "logps/chosen": -317.96209716796875, "logps/rejected": -273.7264709472656, "loss": 0.6922, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0012596590677276254, "rewards/margins": 0.0006816141540184617, "rewards/rejected": 0.0005780447972938418, "step": 310 }, { "epoch": 0.17, "learning_rate": 2.7874564459930313e-07, "logits/chosen": 0.34939244389533997, "logits/rejected": 0.35409045219421387, "logps/chosen": -308.7851867675781, "logps/rejected": -339.3348083496094, "loss": 0.6926, "rewards/accuracies": 0.625, "rewards/chosen": 0.002038495149463415, "rewards/margins": 0.0028217299841344357, "rewards/rejected": -0.0007832351257093251, "step": 320 }, { "epoch": 0.17, "learning_rate": 2.874564459930314e-07, "logits/chosen": 0.4071926176548004, "logits/rejected": 0.3380196690559387, "logps/chosen": -287.54901123046875, "logps/rejected": -277.5677795410156, "loss": 0.6924, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0005766181275248528, "rewards/margins": -0.00022260425612330437, "rewards/rejected": 0.0007992222672328353, "step": 330 }, { "epoch": 0.18, "learning_rate": 2.961672473867596e-07, "logits/chosen": 0.43199872970581055, "logits/rejected": 0.33364588022232056, "logps/chosen": -308.38134765625, "logps/rejected": -210.3172149658203, "loss": 0.6922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0019256987143307924, "rewards/margins": 0.0027559720911085606, "rewards/rejected": -0.00083027349319309, "step": 340 }, { "epoch": 0.18, "learning_rate": 3.048780487804878e-07, "logits/chosen": 0.3490511476993561, "logits/rejected": 0.39305615425109863, "logps/chosen": -244.5614013671875, "logps/rejected": -290.2716369628906, "loss": 0.6921, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005816185148432851, "rewards/margins": 0.00035765691427513957, "rewards/rejected": 0.00022396161512006074, "step": 350 }, { "epoch": 0.19, "learning_rate": 3.13588850174216e-07, "logits/chosen": 0.4072600305080414, "logits/rejected": 0.45239806175231934, "logps/chosen": -251.4489288330078, "logps/rejected": -262.99395751953125, "loss": 0.6924, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00011031948088202626, "rewards/margins": 0.0024239744525402784, "rewards/rejected": -0.0023136548697948456, "step": 360 }, { "epoch": 0.19, "learning_rate": 3.2229965156794425e-07, "logits/chosen": 0.33437368273735046, "logits/rejected": 0.2984531819820404, "logps/chosen": -348.13482666015625, "logps/rejected": -295.0039978027344, "loss": 0.6921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0037342619616538286, "rewards/margins": 0.004230900201946497, "rewards/rejected": -0.000496637774631381, "step": 370 }, { "epoch": 0.2, "learning_rate": 3.3101045296167245e-07, "logits/chosen": 0.4131089746952057, "logits/rejected": 0.3828733563423157, "logps/chosen": -255.59707641601562, "logps/rejected": -194.83799743652344, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": 0.0008168669301085174, "rewards/margins": 0.002202791627496481, "rewards/rejected": -0.0013859247555956244, "step": 380 }, { "epoch": 0.2, "learning_rate": 3.3972125435540065e-07, "logits/chosen": 0.31555676460266113, "logits/rejected": 0.36338990926742554, "logps/chosen": -309.2490234375, "logps/rejected": -355.1193542480469, "loss": 0.692, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0020399349741637707, "rewards/margins": 0.0011781295761466026, "rewards/rejected": 0.0008618049323558807, "step": 390 }, { "epoch": 0.21, "learning_rate": 3.484320557491289e-07, "logits/chosen": 0.3376355767250061, "logits/rejected": 0.35191774368286133, "logps/chosen": -291.76214599609375, "logps/rejected": -286.9822692871094, "loss": 0.692, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0032814189326018095, "rewards/margins": 0.001435564481653273, "rewards/rejected": 0.0018458545673638582, "step": 400 }, { "epoch": 0.21, "learning_rate": 3.5714285714285716e-07, "logits/chosen": 0.2917638421058655, "logits/rejected": 0.22457854449748993, "logps/chosen": -359.77838134765625, "logps/rejected": -265.8029479980469, "loss": 0.6917, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0006081667961552739, "rewards/margins": 0.0009180738707073033, "rewards/rejected": -0.0003099074529018253, "step": 410 }, { "epoch": 0.22, "learning_rate": 3.6585365853658536e-07, "logits/chosen": 0.35560792684555054, "logits/rejected": 0.28562021255493164, "logps/chosen": -377.1800842285156, "logps/rejected": -215.2724609375, "loss": 0.6914, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0032722880132496357, "rewards/margins": 0.005252937786281109, "rewards/rejected": -0.0019806502386927605, "step": 420 }, { "epoch": 0.23, "learning_rate": 3.7456445993031356e-07, "logits/chosen": 0.3571794629096985, "logits/rejected": 0.2926293611526489, "logps/chosen": -344.73358154296875, "logps/rejected": -269.90374755859375, "loss": 0.6916, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003971777856349945, "rewards/margins": 0.0038075477350503206, "rewards/rejected": 0.0001642298884689808, "step": 430 }, { "epoch": 0.23, "learning_rate": 3.832752613240418e-07, "logits/chosen": 0.33091968297958374, "logits/rejected": 0.33137187361717224, "logps/chosen": -269.0303955078125, "logps/rejected": -273.2547302246094, "loss": 0.6916, "rewards/accuracies": 0.625, "rewards/chosen": 0.002539347391575575, "rewards/margins": 0.0036257575266063213, "rewards/rejected": -0.001086410484276712, "step": 440 }, { "epoch": 0.24, "learning_rate": 3.9198606271777e-07, "logits/chosen": 0.34520870447158813, "logits/rejected": 0.3429456353187561, "logps/chosen": -292.42303466796875, "logps/rejected": -263.06427001953125, "loss": 0.6914, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0004831695114262402, "rewards/margins": 0.002628723159432411, "rewards/rejected": -0.0021455539390444756, "step": 450 }, { "epoch": 0.24, "learning_rate": 4.006968641114982e-07, "logits/chosen": 0.41168397665023804, "logits/rejected": 0.33299416303634644, "logps/chosen": -328.90643310546875, "logps/rejected": -249.24636840820312, "loss": 0.6909, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0023158483672887087, "rewards/margins": 0.0044514830224215984, "rewards/rejected": -0.0021356348879635334, "step": 460 }, { "epoch": 0.25, "learning_rate": 4.0940766550522647e-07, "logits/chosen": 0.3450584411621094, "logits/rejected": 0.3537040650844574, "logps/chosen": -277.84783935546875, "logps/rejected": -229.71878051757812, "loss": 0.6912, "rewards/accuracies": 0.625, "rewards/chosen": 0.0038313809782266617, "rewards/margins": 0.005424472503364086, "rewards/rejected": -0.001593091175891459, "step": 470 }, { "epoch": 0.25, "learning_rate": 4.1811846689895467e-07, "logits/chosen": 0.41893109679222107, "logits/rejected": 0.3723496198654175, "logps/chosen": -303.4540100097656, "logps/rejected": -216.4768524169922, "loss": 0.6914, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004440109245479107, "rewards/margins": 0.0051439059898257256, "rewards/rejected": -0.0007037969189696014, "step": 480 }, { "epoch": 0.26, "learning_rate": 4.268292682926829e-07, "logits/chosen": 0.24236135184764862, "logits/rejected": 0.28322139382362366, "logps/chosen": -281.379638671875, "logps/rejected": -330.65777587890625, "loss": 0.6915, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0020877670031040907, "rewards/margins": 0.004719098098576069, "rewards/rejected": -0.002631331095471978, "step": 490 }, { "epoch": 0.26, "learning_rate": 4.3554006968641113e-07, "logits/chosen": 0.4597233235836029, "logits/rejected": 0.35320740938186646, "logps/chosen": -331.86407470703125, "logps/rejected": -262.0279541015625, "loss": 0.6916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.006418133620172739, "rewards/margins": 0.005038060247898102, "rewards/rejected": 0.0013800726737827063, "step": 500 }, { "epoch": 0.27, "learning_rate": 4.442508710801394e-07, "logits/chosen": 0.2914946973323822, "logits/rejected": 0.24324540793895721, "logps/chosen": -312.9110107421875, "logps/rejected": -339.0050048828125, "loss": 0.6911, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.003171132178977132, "rewards/margins": 0.0027220987249165773, "rewards/rejected": 0.0004490331339184195, "step": 510 }, { "epoch": 0.27, "learning_rate": 4.529616724738676e-07, "logits/chosen": 0.3802244961261749, "logits/rejected": 0.3113190531730652, "logps/chosen": -324.1048889160156, "logps/rejected": -262.59661865234375, "loss": 0.6912, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004197067115455866, "rewards/margins": 0.006333489902317524, "rewards/rejected": -0.002136422088369727, "step": 520 }, { "epoch": 0.28, "learning_rate": 4.616724738675958e-07, "logits/chosen": 0.336688756942749, "logits/rejected": 0.32058995962142944, "logps/chosen": -253.60986328125, "logps/rejected": -264.2114562988281, "loss": 0.6907, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.002829137723892927, "rewards/margins": 0.005678371526300907, "rewards/rejected": -0.0028492335695773363, "step": 530 }, { "epoch": 0.28, "learning_rate": 4.7038327526132404e-07, "logits/chosen": 0.4176596701145172, "logits/rejected": 0.3263845443725586, "logps/chosen": -376.275146484375, "logps/rejected": -259.18896484375, "loss": 0.6911, "rewards/accuracies": 0.625, "rewards/chosen": 0.0038488004356622696, "rewards/margins": 0.0056008342653512955, "rewards/rejected": -0.0017520335968583822, "step": 540 }, { "epoch": 0.29, "learning_rate": 4.790940766550523e-07, "logits/chosen": 0.3719063103199005, "logits/rejected": 0.37896281480789185, "logps/chosen": -302.3629455566406, "logps/rejected": -264.2347106933594, "loss": 0.6911, "rewards/accuracies": 0.625, "rewards/chosen": 0.0020447494462132454, "rewards/margins": 0.004400859586894512, "rewards/rejected": -0.0023561103735119104, "step": 550 }, { "epoch": 0.29, "learning_rate": 4.878048780487804e-07, "logits/chosen": 0.36305055022239685, "logits/rejected": 0.24923209846019745, "logps/chosen": -411.765869140625, "logps/rejected": -349.3866271972656, "loss": 0.6903, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.009467719122767448, "rewards/margins": 0.007618948817253113, "rewards/rejected": 0.001848770072683692, "step": 560 }, { "epoch": 0.3, "learning_rate": 4.965156794425087e-07, "logits/chosen": 0.3964191675186157, "logits/rejected": 0.35975709557533264, "logps/chosen": -298.0080261230469, "logps/rejected": -267.7530517578125, "loss": 0.6906, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0032093741465359926, "rewards/margins": 0.0026827100664377213, "rewards/rejected": 0.0005266641965135932, "step": 570 }, { "epoch": 0.3, "learning_rate": 4.999983312905696e-07, "logits/chosen": 0.36271387338638306, "logits/rejected": 0.27819180488586426, "logps/chosen": -289.54852294921875, "logps/rejected": -221.93734741210938, "loss": 0.6898, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003947210498154163, "rewards/margins": 0.005013973917812109, "rewards/rejected": -0.0010667633032426238, "step": 580 }, { "epoch": 0.31, "learning_rate": 4.999881337025014e-07, "logits/chosen": 0.26377126574516296, "logits/rejected": 0.2488066852092743, "logps/chosen": -268.46112060546875, "logps/rejected": -226.2292022705078, "loss": 0.6908, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0002535443636588752, "rewards/margins": 0.001007495797239244, "rewards/rejected": -0.0007539513753727078, "step": 590 }, { "epoch": 0.31, "learning_rate": 4.999686659648518e-07, "logits/chosen": 0.3953297436237335, "logits/rejected": 0.35871651768684387, "logps/chosen": -231.27572631835938, "logps/rejected": -255.6056671142578, "loss": 0.6894, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.003411300480365753, "rewards/margins": 0.004523693583905697, "rewards/rejected": -0.0011123933363705873, "step": 600 }, { "epoch": 0.32, "learning_rate": 4.999399287995302e-07, "logits/chosen": 0.37459492683410645, "logits/rejected": 0.46103644371032715, "logps/chosen": -246.6116180419922, "logps/rejected": -293.78448486328125, "loss": 0.6894, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00318206031806767, "rewards/margins": 0.006666205823421478, "rewards/rejected": -0.003484147135168314, "step": 610 }, { "epoch": 0.32, "learning_rate": 4.999019232721791e-07, "logits/chosen": 0.33487755060195923, "logits/rejected": 0.3410380482673645, "logps/chosen": -358.2688903808594, "logps/rejected": -283.22314453125, "loss": 0.6895, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.006958150304853916, "rewards/margins": 0.009823032654821873, "rewards/rejected": -0.002864880720153451, "step": 620 }, { "epoch": 0.33, "learning_rate": 4.998546507921325e-07, "logits/chosen": 0.33910471200942993, "logits/rejected": 0.3406114876270294, "logps/chosen": -256.4313049316406, "logps/rejected": -276.10064697265625, "loss": 0.6891, "rewards/accuracies": 0.5, "rewards/chosen": 0.000572996330447495, "rewards/margins": 0.002004786394536495, "rewards/rejected": -0.0014317900640890002, "step": 630 }, { "epoch": 0.33, "learning_rate": 4.997981131123656e-07, "logits/chosen": 0.30870598554611206, "logits/rejected": 0.27435797452926636, "logps/chosen": -383.6075439453125, "logps/rejected": -331.48614501953125, "loss": 0.6886, "rewards/accuracies": 0.75, "rewards/chosen": 0.009466699324548244, "rewards/margins": 0.011064816266298294, "rewards/rejected": -0.0015981157775968313, "step": 640 }, { "epoch": 0.34, "learning_rate": 4.997323123294291e-07, "logits/chosen": 0.35772138833999634, "logits/rejected": 0.34295064210891724, "logps/chosen": -316.08355712890625, "logps/rejected": -233.4802703857422, "loss": 0.6904, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.005766328424215317, "rewards/margins": 0.008546034805476665, "rewards/rejected": -0.0027797059156000614, "step": 650 }, { "epoch": 0.35, "learning_rate": 4.99657250883371e-07, "logits/chosen": 0.3746064305305481, "logits/rejected": 0.3391752243041992, "logps/chosen": -264.4750061035156, "logps/rejected": -204.13467407226562, "loss": 0.6897, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.003877130104228854, "rewards/margins": 0.005806891247630119, "rewards/rejected": -0.0019297614926472306, "step": 660 }, { "epoch": 0.35, "learning_rate": 4.995729315576468e-07, "logits/chosen": 0.40420597791671753, "logits/rejected": 0.33833855390548706, "logps/chosen": -276.43310546875, "logps/rejected": -219.8043212890625, "loss": 0.6894, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.006816673092544079, "rewards/margins": 0.008805202320218086, "rewards/rejected": -0.0019885296933352947, "step": 670 }, { "epoch": 0.36, "learning_rate": 4.99479357479016e-07, "logits/chosen": 0.22999343276023865, "logits/rejected": 0.3004499673843384, "logps/chosen": -210.9781951904297, "logps/rejected": -218.9191131591797, "loss": 0.6902, "rewards/accuracies": 0.625, "rewards/chosen": -0.0006085085915401578, "rewards/margins": 0.002932955976575613, "rewards/rejected": -0.0035414646845310926, "step": 680 }, { "epoch": 0.36, "learning_rate": 4.993765321174261e-07, "logits/chosen": 0.36885711550712585, "logits/rejected": 0.33634617924690247, "logps/chosen": -306.89495849609375, "logps/rejected": -315.4165954589844, "loss": 0.6891, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0054230643436312675, "rewards/margins": 0.010317849926650524, "rewards/rejected": -0.004894785117357969, "step": 690 }, { "epoch": 0.37, "learning_rate": 4.992644592858842e-07, "logits/chosen": 0.4698862135410309, "logits/rejected": 0.46879833936691284, "logps/chosen": -224.7325439453125, "logps/rejected": -254.5215606689453, "loss": 0.6889, "rewards/accuracies": 0.625, "rewards/chosen": 0.004678064025938511, "rewards/margins": 0.008384588174521923, "rewards/rejected": -0.003706523682922125, "step": 700 }, { "epoch": 0.37, "learning_rate": 4.991431431403148e-07, "logits/chosen": 0.16423548758029938, "logits/rejected": 0.24411818385124207, "logps/chosen": -275.36859130859375, "logps/rejected": -309.1749267578125, "loss": 0.6886, "rewards/accuracies": 0.5, "rewards/chosen": 0.00013561574451159686, "rewards/margins": -0.0016555249458178878, "rewards/rejected": 0.001791140646673739, "step": 710 }, { "epoch": 0.38, "learning_rate": 4.99012588179407e-07, "logits/chosen": 0.26659077405929565, "logits/rejected": 0.2297864407300949, "logps/chosen": -337.4370422363281, "logps/rejected": -240.46920776367188, "loss": 0.6887, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0036331997253000736, "rewards/margins": 0.010251880623400211, "rewards/rejected": -0.006618679966777563, "step": 720 }, { "epoch": 0.38, "learning_rate": 4.988727992444467e-07, "logits/chosen": 0.4517492651939392, "logits/rejected": 0.4217531085014343, "logps/chosen": -292.5274353027344, "logps/rejected": -297.12554931640625, "loss": 0.6883, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.007517491467297077, "rewards/margins": 0.007671818137168884, "rewards/rejected": -0.0001543281105114147, "step": 730 }, { "epoch": 0.39, "learning_rate": 4.98723781519137e-07, "logits/chosen": 0.26132479310035706, "logits/rejected": 0.28247007727622986, "logps/chosen": -238.7399444580078, "logps/rejected": -270.9232482910156, "loss": 0.6891, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0010348012438043952, "rewards/margins": 0.005723314359784126, "rewards/rejected": -0.0046885134652256966, "step": 740 }, { "epoch": 0.39, "learning_rate": 4.98565540529407e-07, "logits/chosen": 0.36184945702552795, "logits/rejected": 0.33014923334121704, "logps/chosen": -332.16546630859375, "logps/rejected": -268.51068115234375, "loss": 0.6885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.002668363507837057, "rewards/margins": 0.003941279370337725, "rewards/rejected": -0.0012729157460853457, "step": 750 }, { "epoch": 0.4, "learning_rate": 4.983980821432054e-07, "logits/chosen": 0.3811706602573395, "logits/rejected": 0.4038323760032654, "logps/chosen": -318.3126220703125, "logps/rejected": -300.29571533203125, "loss": 0.6881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.008891480974853039, "rewards/margins": 0.014289113692939281, "rewards/rejected": -0.005397631786763668, "step": 760 }, { "epoch": 0.4, "learning_rate": 4.982214125702845e-07, "logits/chosen": 0.37229424715042114, "logits/rejected": 0.38184842467308044, "logps/chosen": -255.6134796142578, "logps/rejected": -256.50103759765625, "loss": 0.6882, "rewards/accuracies": 0.75, "rewards/chosen": 0.004782395903021097, "rewards/margins": 0.011424211785197258, "rewards/rejected": -0.0066418154165148735, "step": 770 }, { "epoch": 0.41, "learning_rate": 4.980355383619684e-07, "logits/chosen": 0.36553826928138733, "logits/rejected": 0.33468884229660034, "logps/chosen": -290.0172424316406, "logps/rejected": -183.47616577148438, "loss": 0.6886, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.004404271487146616, "rewards/margins": 0.009764621965587139, "rewards/rejected": -0.0053603509441018105, "step": 780 }, { "epoch": 0.41, "learning_rate": 4.978404664109113e-07, "logits/chosen": 0.3125520348548889, "logits/rejected": 0.2413829267024994, "logps/chosen": -377.26361083984375, "logps/rejected": -280.342041015625, "loss": 0.6873, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.006130435969680548, "rewards/margins": 0.011444391682744026, "rewards/rejected": -0.0053139557130634785, "step": 790 }, { "epoch": 0.42, "learning_rate": 4.97636203950841e-07, "logits/chosen": 0.3133540749549866, "logits/rejected": 0.2622864246368408, "logps/chosen": -335.50579833984375, "logps/rejected": -279.72174072265625, "loss": 0.688, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.011037923395633698, "rewards/margins": 0.015714969485998154, "rewards/rejected": -0.004677046090364456, "step": 800 }, { "epoch": 0.42, "learning_rate": 4.974227585562916e-07, "logits/chosen": 0.2856473922729492, "logits/rejected": 0.21030691266059875, "logps/chosen": -351.89117431640625, "logps/rejected": -308.099365234375, "loss": 0.6871, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.013937944546341896, "rewards/margins": 0.019101398065686226, "rewards/rejected": -0.005163452588021755, "step": 810 }, { "epoch": 0.43, "learning_rate": 4.972001381423214e-07, "logits/chosen": 0.38940927386283875, "logits/rejected": 0.38752490282058716, "logps/chosen": -377.74261474609375, "logps/rejected": -341.78033447265625, "loss": 0.6886, "rewards/accuracies": 0.75, "rewards/chosen": 0.012508844025433064, "rewards/margins": 0.011580301448702812, "rewards/rejected": 0.0009285411797463894, "step": 820 }, { "epoch": 0.43, "learning_rate": 4.969683509642206e-07, "logits/chosen": 0.30638226866722107, "logits/rejected": 0.33688345551490784, "logps/chosen": -257.49383544921875, "logps/rejected": -234.21389770507812, "loss": 0.6864, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0007948580896481872, "rewards/margins": 0.0011654215632006526, "rewards/rejected": -0.001960279420018196, "step": 830 }, { "epoch": 0.44, "learning_rate": 4.967274056172044e-07, "logits/chosen": 0.436321496963501, "logits/rejected": 0.3269069790840149, "logps/chosen": -264.94000244140625, "logps/rejected": -235.1780242919922, "loss": 0.6884, "rewards/accuracies": 0.75, "rewards/chosen": 0.004068738780915737, "rewards/margins": 0.012811449356377125, "rewards/rejected": -0.008742708712816238, "step": 840 }, { "epoch": 0.44, "learning_rate": 4.964773110360944e-07, "logits/chosen": 0.3955819606781006, "logits/rejected": 0.4442331790924072, "logps/chosen": -211.5360107421875, "logps/rejected": -221.88296508789062, "loss": 0.6868, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0063387490808963776, "rewards/margins": 0.009760014712810516, "rewards/rejected": -0.003421264933422208, "step": 850 }, { "epoch": 0.45, "learning_rate": 4.962180764949876e-07, "logits/chosen": 0.3990401327610016, "logits/rejected": 0.3605530261993408, "logps/chosen": -293.21966552734375, "logps/rejected": -233.4999237060547, "loss": 0.6885, "rewards/accuracies": 0.625, "rewards/chosen": 0.008997827768325806, "rewards/margins": 0.01379807572811842, "rewards/rejected": -0.004800247959792614, "step": 860 }, { "epoch": 0.46, "learning_rate": 4.959497116069122e-07, "logits/chosen": 0.4126970171928406, "logits/rejected": 0.306619793176651, "logps/chosen": -291.76324462890625, "logps/rejected": -283.7310791015625, "loss": 0.6873, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.005910122767090797, "rewards/margins": 0.011911979876458645, "rewards/rejected": -0.0060018557123839855, "step": 870 }, { "epoch": 0.46, "learning_rate": 4.956722263234711e-07, "logits/chosen": 0.39139634370803833, "logits/rejected": 0.36863797903060913, "logps/chosen": -271.8345947265625, "logps/rejected": -251.8036651611328, "loss": 0.6869, "rewards/accuracies": 0.5, "rewards/chosen": 0.0030030894558876753, "rewards/margins": 0.009311128407716751, "rewards/rejected": -0.006308038718998432, "step": 880 }, { "epoch": 0.47, "learning_rate": 4.95385630934473e-07, "logits/chosen": 0.37733954191207886, "logits/rejected": 0.3702600598335266, "logps/chosen": -239.25918579101562, "logps/rejected": -221.35079956054688, "loss": 0.6881, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0026471063029021025, "rewards/margins": 0.005196579732000828, "rewards/rejected": -0.002549473661929369, "step": 890 }, { "epoch": 0.47, "learning_rate": 4.950899360675511e-07, "logits/chosen": 0.42558950185775757, "logits/rejected": 0.44111356139183044, "logps/chosen": -251.04159545898438, "logps/rejected": -266.29266357421875, "loss": 0.6875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.004267251584678888, "rewards/margins": 0.00734188174828887, "rewards/rejected": -0.003074630629271269, "step": 900 }, { "epoch": 0.48, "learning_rate": 4.947851526877681e-07, "logits/chosen": 0.3045174181461334, "logits/rejected": 0.2997366189956665, "logps/chosen": -311.9551696777344, "logps/rejected": -285.98370361328125, "loss": 0.6847, "rewards/accuracies": 0.625, "rewards/chosen": 0.003712692065164447, "rewards/margins": 0.0008770062704570591, "rewards/rejected": 0.0028356860857456923, "step": 910 }, { "epoch": 0.48, "learning_rate": 4.944712920972108e-07, "logits/chosen": 0.34604376554489136, "logits/rejected": 0.3185519278049469, "logps/chosen": -317.5331115722656, "logps/rejected": -317.90008544921875, "loss": 0.6867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.010279752314090729, "rewards/margins": 0.013352531008422375, "rewards/rejected": -0.0030727770645171404, "step": 920 }, { "epoch": 0.49, "learning_rate": 4.9414836593457e-07, "logits/chosen": 0.24341456592082977, "logits/rejected": 0.33237552642822266, "logps/chosen": -277.3923034667969, "logps/rejected": -256.6651611328125, "loss": 0.6847, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0015904136234894395, "rewards/margins": 0.01141836866736412, "rewards/rejected": -0.013008782640099525, "step": 930 }, { "epoch": 0.49, "learning_rate": 4.938163861747094e-07, "logits/chosen": 0.37131667137145996, "logits/rejected": 0.30167001485824585, "logps/chosen": -365.1321716308594, "logps/rejected": -231.3142547607422, "loss": 0.6855, "rewards/accuracies": 0.75, "rewards/chosen": 0.016867507249116898, "rewards/margins": 0.02898784913122654, "rewards/rejected": -0.012120342813432217, "step": 940 }, { "epoch": 0.5, "learning_rate": 4.934753651282215e-07, "logits/chosen": 0.34957343339920044, "logits/rejected": 0.3650115132331848, "logps/chosen": -249.6451873779297, "logps/rejected": -238.812255859375, "loss": 0.6888, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0006063526379875839, "rewards/margins": 0.00888439454138279, "rewards/rejected": -0.008278042078018188, "step": 950 }, { "epoch": 0.5, "learning_rate": 4.93125315440971e-07, "logits/chosen": 0.28446871042251587, "logits/rejected": 0.33393317461013794, "logps/chosen": -221.7951202392578, "logps/rejected": -222.3719940185547, "loss": 0.6872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00030084262834861875, "rewards/margins": 0.0024019861593842506, "rewards/rejected": -0.002101143589243293, "step": 960 }, { "epoch": 0.51, "learning_rate": 4.92766250093626e-07, "logits/chosen": 0.3894736170768738, "logits/rejected": 0.42692357301712036, "logps/chosen": -278.0799560546875, "logps/rejected": -277.92578125, "loss": 0.6857, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.005217918194830418, "rewards/margins": 0.01182524859905243, "rewards/rejected": -0.006607329938560724, "step": 970 }, { "epoch": 0.51, "learning_rate": 4.92398182401176e-07, "logits/chosen": 0.36386704444885254, "logits/rejected": 0.34401121735572815, "logps/chosen": -272.7413330078125, "logps/rejected": -258.5047607421875, "loss": 0.6855, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.007803539279848337, "rewards/margins": 0.012347839772701263, "rewards/rejected": -0.004544299561530352, "step": 980 }, { "epoch": 0.52, "learning_rate": 4.920211260124395e-07, "logits/chosen": 0.31229716539382935, "logits/rejected": 0.33428624272346497, "logps/chosen": -271.045166015625, "logps/rejected": -287.309326171875, "loss": 0.6865, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.011570685543119907, "rewards/margins": 0.013975699432194233, "rewards/rejected": -0.002405013656243682, "step": 990 }, { "epoch": 0.52, "learning_rate": 4.916350949095566e-07, "logits/chosen": 0.35035276412963867, "logits/rejected": 0.3598233163356781, "logps/chosen": -321.33612060546875, "logps/rejected": -259.4188537597656, "loss": 0.6868, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.012103838846087456, "rewards/margins": 0.023051228374242783, "rewards/rejected": -0.010947388596832752, "step": 1000 }, { "epoch": 0.53, "learning_rate": 4.912401034074708e-07, "logits/chosen": 0.3462555408477783, "logits/rejected": 0.35642725229263306, "logps/chosen": -245.738525390625, "logps/rejected": -185.628173828125, "loss": 0.6847, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0006697209319099784, "rewards/margins": 0.014949485659599304, "rewards/rejected": -0.014279766008257866, "step": 1010 }, { "epoch": 0.53, "learning_rate": 4.908361661533989e-07, "logits/chosen": 0.2858627438545227, "logits/rejected": 0.2974536120891571, "logps/chosen": -314.45263671875, "logps/rejected": -301.51995849609375, "loss": 0.6851, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.004904334433376789, "rewards/margins": 0.014790430665016174, "rewards/rejected": -0.009886096231639385, "step": 1020 }, { "epoch": 0.54, "learning_rate": 4.904232981262866e-07, "logits/chosen": 0.3912216126918793, "logits/rejected": 0.3437424898147583, "logps/chosen": -318.65496826171875, "logps/rejected": -290.3804931640625, "loss": 0.6842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.014556080102920532, "rewards/margins": 0.027198363095521927, "rewards/rejected": -0.012642279267311096, "step": 1030 }, { "epoch": 0.54, "learning_rate": 4.900015146362544e-07, "logits/chosen": 0.32560548186302185, "logits/rejected": 0.32246512174606323, "logps/chosen": -285.6897277832031, "logps/rejected": -261.7762145996094, "loss": 0.6845, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.013799789361655712, "rewards/margins": 0.030853647738695145, "rewards/rejected": -0.017053861171007156, "step": 1040 }, { "epoch": 0.55, "learning_rate": 4.895708313240285e-07, "logits/chosen": 0.34826239943504333, "logits/rejected": 0.36765581369400024, "logps/chosen": -280.7046813964844, "logps/rejected": -252.820556640625, "loss": 0.6842, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0033273245207965374, "rewards/margins": 0.010536163114011288, "rewards/rejected": -0.007208839058876038, "step": 1050 }, { "epoch": 0.55, "learning_rate": 4.891312641603623e-07, "logits/chosen": 0.3264276087284088, "logits/rejected": 0.3249642252922058, "logps/chosen": -367.77685546875, "logps/rejected": -318.3257751464844, "loss": 0.6841, "rewards/accuracies": 0.75, "rewards/chosen": 0.017363909631967545, "rewards/margins": 0.02406318113207817, "rewards/rejected": -0.00669927429407835, "step": 1060 }, { "epoch": 0.56, "learning_rate": 4.886828294454426e-07, "logits/chosen": 0.3919423222541809, "logits/rejected": 0.3063061833381653, "logps/chosen": -351.09088134765625, "logps/rejected": -332.01043701171875, "loss": 0.6845, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.009391558356583118, "rewards/margins": 0.020744940266013145, "rewards/rejected": -0.011353382840752602, "step": 1070 }, { "epoch": 0.57, "learning_rate": 4.882255438082863e-07, "logits/chosen": 0.45179158449172974, "logits/rejected": 0.41871339082717896, "logps/chosen": -283.4506530761719, "logps/rejected": -272.11578369140625, "loss": 0.6848, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.012173324823379517, "rewards/margins": 0.019558671861886978, "rewards/rejected": -0.007385346107184887, "step": 1080 }, { "epoch": 0.57, "learning_rate": 4.877594242061233e-07, "logits/chosen": 0.32700738310813904, "logits/rejected": 0.3105686604976654, "logps/chosen": -324.06292724609375, "logps/rejected": -242.7261199951172, "loss": 0.6857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0017802860820665956, "rewards/margins": 0.01658160611987114, "rewards/rejected": -0.014801318757236004, "step": 1090 }, { "epoch": 0.58, "learning_rate": 4.87284487923768e-07, "logits/chosen": 0.3384407162666321, "logits/rejected": 0.2847151458263397, "logps/chosen": -283.078857421875, "logps/rejected": -243.68801879882812, "loss": 0.6847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01340520940721035, "rewards/margins": 0.025466054677963257, "rewards/rejected": -0.012060845270752907, "step": 1100 }, { "epoch": 0.58, "learning_rate": 4.868007525729775e-07, "logits/chosen": 0.39033299684524536, "logits/rejected": 0.3908047676086426, "logps/chosen": -270.23675537109375, "logps/rejected": -233.9051971435547, "loss": 0.6853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00840029213577509, "rewards/margins": 0.017927926033735275, "rewards/rejected": -0.00952763482928276, "step": 1110 }, { "epoch": 0.59, "learning_rate": 4.863082360917998e-07, "logits/chosen": 0.4334394335746765, "logits/rejected": 0.37194570899009705, "logps/chosen": -301.4029846191406, "logps/rejected": -237.0880126953125, "loss": 0.6815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.008443480357527733, "rewards/margins": 0.023213211447000504, "rewards/rejected": -0.014769727364182472, "step": 1120 }, { "epoch": 0.59, "learning_rate": 4.858069567439072e-07, "logits/chosen": 0.33737045526504517, "logits/rejected": 0.335934579372406, "logps/chosen": -253.21676635742188, "logps/rejected": -264.36468505859375, "loss": 0.6867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.003204932902008295, "rewards/margins": 0.007723759859800339, "rewards/rejected": -0.004518826492130756, "step": 1130 }, { "epoch": 0.6, "learning_rate": 4.852969331179206e-07, "logits/chosen": 0.31442832946777344, "logits/rejected": 0.2894567549228668, "logps/chosen": -292.9564208984375, "logps/rejected": -292.08966064453125, "loss": 0.6845, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.007902948185801506, "rewards/margins": 0.018108747899532318, "rewards/rejected": -0.010205797851085663, "step": 1140 }, { "epoch": 0.6, "learning_rate": 4.847781841267185e-07, "logits/chosen": 0.3472171425819397, "logits/rejected": 0.30964428186416626, "logps/chosen": -306.6994934082031, "logps/rejected": -229.86032104492188, "loss": 0.6832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.011110781691968441, "rewards/margins": 0.023439984768629074, "rewards/rejected": -0.012329204939305782, "step": 1150 }, { "epoch": 0.61, "learning_rate": 4.842507290067374e-07, "logits/chosen": 0.3105614483356476, "logits/rejected": 0.2698826789855957, "logps/chosen": -256.3908386230469, "logps/rejected": -250.67025756835938, "loss": 0.6865, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.004344923421740532, "rewards/margins": 0.013838335871696472, "rewards/rejected": -0.018183257430791855, "step": 1160 }, { "epoch": 0.61, "learning_rate": 4.837145873172567e-07, "logits/chosen": 0.3267471194267273, "logits/rejected": 0.3460347354412079, "logps/chosen": -251.1807861328125, "logps/rejected": -216.62417602539062, "loss": 0.6838, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0053130751475691795, "rewards/margins": 0.009674773551523685, "rewards/rejected": -0.014987850561738014, "step": 1170 }, { "epoch": 0.62, "learning_rate": 4.83169778939675e-07, "logits/chosen": 0.3920978903770447, "logits/rejected": 0.4109751284122467, "logps/chosen": -290.7029724121094, "logps/rejected": -280.5768127441406, "loss": 0.6814, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.006850590463727713, "rewards/margins": 0.013157842680811882, "rewards/rejected": -0.006307253148406744, "step": 1180 }, { "epoch": 0.62, "learning_rate": 4.826163240767716e-07, "logits/chosen": 0.41425761580467224, "logits/rejected": 0.3763046860694885, "logps/chosen": -382.89776611328125, "logps/rejected": -345.54742431640625, "loss": 0.6838, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02900487184524536, "rewards/margins": 0.03568483516573906, "rewards/rejected": -0.006679965648800135, "step": 1190 }, { "epoch": 0.63, "learning_rate": 4.820542432519584e-07, "logits/chosen": 0.28830844163894653, "logits/rejected": 0.341747522354126, "logps/chosen": -325.1733703613281, "logps/rejected": -350.9265441894531, "loss": 0.686, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00515395263209939, "rewards/margins": 0.01350557804107666, "rewards/rejected": -0.008351625874638557, "step": 1200 }, { "epoch": 0.63, "learning_rate": 4.814835573085176e-07, "logits/chosen": 0.4174725115299225, "logits/rejected": 0.4214004874229431, "logps/chosen": -245.7711944580078, "logps/rejected": -214.4864959716797, "loss": 0.6826, "rewards/accuracies": 0.625, "rewards/chosen": 0.003640042617917061, "rewards/margins": 0.016179818660020828, "rewards/rejected": -0.012539774179458618, "step": 1210 }, { "epoch": 0.64, "learning_rate": 4.809042874088304e-07, "logits/chosen": 0.353881299495697, "logits/rejected": 0.2757090926170349, "logps/chosen": -321.21514892578125, "logps/rejected": -249.38925170898438, "loss": 0.683, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.009076332673430443, "rewards/margins": 0.027345478534698486, "rewards/rejected": -0.018269145861268044, "step": 1220 }, { "epoch": 0.64, "learning_rate": 4.803164550335905e-07, "logits/chosen": 0.4418944716453552, "logits/rejected": 0.39826610684394836, "logps/chosen": -257.59466552734375, "logps/rejected": -207.9666290283203, "loss": 0.6843, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.007926455698907375, "rewards/margins": 0.021452203392982483, "rewards/rejected": -0.013525748625397682, "step": 1230 }, { "epoch": 0.65, "learning_rate": 4.797200819810089e-07, "logits/chosen": 0.28744739294052124, "logits/rejected": 0.2932237386703491, "logps/chosen": -343.52764892578125, "logps/rejected": -286.6883850097656, "loss": 0.6801, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.024472234770655632, "rewards/margins": 0.036373551934957504, "rewards/rejected": -0.011901318095624447, "step": 1240 }, { "epoch": 0.65, "learning_rate": 4.79115190366005e-07, "logits/chosen": 0.428488165140152, "logits/rejected": 0.39333200454711914, "logps/chosen": -311.669921875, "logps/rejected": -284.22674560546875, "loss": 0.6825, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.014714357443153858, "rewards/margins": 0.025674622505903244, "rewards/rejected": -0.010960264131426811, "step": 1250 }, { "epoch": 0.66, "learning_rate": 4.785018026193862e-07, "logits/chosen": 0.28775107860565186, "logits/rejected": 0.279806911945343, "logps/chosen": -261.11627197265625, "logps/rejected": -251.590087890625, "loss": 0.6835, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0010452417191118002, "rewards/margins": 0.009592377580702305, "rewards/rejected": -0.010637620463967323, "step": 1260 }, { "epoch": 0.66, "learning_rate": 4.77879941487017e-07, "logits/chosen": 0.30096435546875, "logits/rejected": 0.30046865344047546, "logps/chosen": -263.289306640625, "logps/rejected": -222.4429473876953, "loss": 0.6828, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.012327324599027634, "rewards/margins": 0.0253997091203928, "rewards/rejected": -0.013072386384010315, "step": 1270 }, { "epoch": 0.67, "learning_rate": 4.772496300289748e-07, "logits/chosen": 0.4783431887626648, "logits/rejected": 0.4230971336364746, "logps/chosen": -283.7425842285156, "logps/rejected": -240.39730834960938, "loss": 0.6814, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.013720624148845673, "rewards/margins": 0.03198980540037155, "rewards/rejected": -0.018269184976816177, "step": 1280 }, { "epoch": 0.68, "learning_rate": 4.766108916186949e-07, "logits/chosen": 0.21571576595306396, "logits/rejected": 0.2778613567352295, "logps/chosen": -341.9070129394531, "logps/rejected": -331.6284484863281, "loss": 0.6829, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0014383181696757674, "rewards/margins": 0.017966976389288902, "rewards/rejected": -0.0165286585688591, "step": 1290 }, { "epoch": 0.68, "learning_rate": 4.759637499421042e-07, "logits/chosen": 0.303109347820282, "logits/rejected": 0.30239337682724, "logps/chosen": -323.6648864746094, "logps/rejected": -240.1118927001953, "loss": 0.6771, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.026541758328676224, "rewards/margins": 0.03878582641482353, "rewards/rejected": -0.012244068086147308, "step": 1300 }, { "epoch": 0.69, "learning_rate": 4.7530822899674207e-07, "logits/chosen": 0.3453063666820526, "logits/rejected": 0.2573213577270508, "logps/chosen": -351.45123291015625, "logps/rejected": -256.66986083984375, "loss": 0.6843, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0206809863448143, "rewards/margins": 0.021090159192681313, "rewards/rejected": -0.00040917136357165873, "step": 1310 }, { "epoch": 0.69, "learning_rate": 4.7464435309087137e-07, "logits/chosen": 0.42679348587989807, "logits/rejected": 0.3228249251842499, "logps/chosen": -314.97119140625, "logps/rejected": -237.63192749023438, "loss": 0.6836, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.004733038134872913, "rewards/margins": 0.035116709768772125, "rewards/rejected": -0.030383672565221786, "step": 1320 }, { "epoch": 0.7, "learning_rate": 4.739721468425763e-07, "logits/chosen": 0.31200721859931946, "logits/rejected": 0.30873575806617737, "logps/chosen": -308.97442626953125, "logps/rejected": -295.93328857421875, "loss": 0.6823, "rewards/accuracies": 0.625, "rewards/chosen": 0.008293518796563148, "rewards/margins": 0.021679170429706573, "rewards/rejected": -0.013385653495788574, "step": 1330 }, { "epoch": 0.7, "learning_rate": 4.7329163517885e-07, "logits/chosen": 0.34352797269821167, "logits/rejected": 0.34060853719711304, "logps/chosen": -312.0477600097656, "logps/rejected": -310.62890625, "loss": 0.6805, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.020096898078918457, "rewards/margins": 0.03874671831727028, "rewards/rejected": -0.018649814650416374, "step": 1340 }, { "epoch": 0.71, "learning_rate": 4.7260284333466973e-07, "logits/chosen": 0.370521605014801, "logits/rejected": 0.36052078008651733, "logps/chosen": -276.4061279296875, "logps/rejected": -250.52090454101562, "loss": 0.6838, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.002161466982215643, "rewards/margins": 0.0014650661032646894, "rewards/rejected": 0.0006964011117815971, "step": 1350 }, { "epoch": 0.71, "learning_rate": 4.719057968520617e-07, "logits/chosen": 0.34359192848205566, "logits/rejected": 0.3505293130874634, "logps/chosen": -275.33087158203125, "logps/rejected": -249.4975128173828, "loss": 0.681, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.006505683064460754, "rewards/margins": 0.008261348120868206, "rewards/rejected": -0.001755664823576808, "step": 1360 }, { "epoch": 0.72, "learning_rate": 4.7120052157915345e-07, "logits/chosen": 0.2927667796611786, "logits/rejected": 0.33267074823379517, "logps/chosen": -290.85272216796875, "logps/rejected": -285.813720703125, "loss": 0.6777, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.006295072380453348, "rewards/margins": 0.025114480406045914, "rewards/rejected": -0.018819406628608704, "step": 1370 }, { "epoch": 0.72, "learning_rate": 4.7048704366921537e-07, "logits/chosen": 0.4149307310581207, "logits/rejected": 0.4059115946292877, "logps/chosen": -220.0292205810547, "logps/rejected": -171.5416717529297, "loss": 0.6849, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.000204659256269224, "rewards/margins": 0.025315681472420692, "rewards/rejected": -0.025111019611358643, "step": 1380 }, { "epoch": 0.73, "learning_rate": 4.6976538957969114e-07, "logits/chosen": 0.496499240398407, "logits/rejected": 0.4356865882873535, "logps/chosen": -280.1456298828125, "logps/rejected": -193.3099365234375, "loss": 0.68, "rewards/accuracies": 0.75, "rewards/chosen": 0.017452292144298553, "rewards/margins": 0.04280317574739456, "rewards/rejected": -0.02535087987780571, "step": 1390 }, { "epoch": 0.73, "learning_rate": 4.690355860712163e-07, "logits/chosen": 0.46831613779067993, "logits/rejected": 0.49073824286460876, "logps/chosen": -228.4839324951172, "logps/rejected": -255.85720825195312, "loss": 0.6817, "rewards/accuracies": 0.625, "rewards/chosen": -0.0025743378791958094, "rewards/margins": 0.01455251406878233, "rewards/rejected": -0.01712685264647007, "step": 1400 }, { "epoch": 0.74, "learning_rate": 4.682976602066262e-07, "logits/chosen": 0.3690539002418518, "logits/rejected": 0.3545624613761902, "logps/chosen": -252.1142578125, "logps/rejected": -207.72763061523438, "loss": 0.6794, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0002451833279337734, "rewards/margins": 0.024905264377593994, "rewards/rejected": -0.025150448083877563, "step": 1410 }, { "epoch": 0.74, "learning_rate": 4.6755163934995224e-07, "logits/chosen": 0.3355734646320343, "logits/rejected": 0.33245348930358887, "logps/chosen": -323.6839599609375, "logps/rejected": -270.827392578125, "loss": 0.6787, "rewards/accuracies": 0.625, "rewards/chosen": 0.008448930457234383, "rewards/margins": 0.024179551750421524, "rewards/rejected": -0.01573062129318714, "step": 1420 }, { "epoch": 0.75, "learning_rate": 4.667975511654072e-07, "logits/chosen": 0.38984426856040955, "logits/rejected": 0.3442505896091461, "logps/chosen": -336.49505615234375, "logps/rejected": -267.2830810546875, "loss": 0.6803, "rewards/accuracies": 0.625, "rewards/chosen": 0.005654610693454742, "rewards/margins": 0.03083883225917816, "rewards/rejected": -0.02518421970307827, "step": 1430 }, { "epoch": 0.75, "learning_rate": 4.660354236163595e-07, "logits/chosen": 0.29653918743133545, "logits/rejected": 0.3196846544742584, "logps/chosen": -326.2195129394531, "logps/rejected": -311.04278564453125, "loss": 0.6814, "rewards/accuracies": 0.625, "rewards/chosen": 0.012937399558722973, "rewards/margins": 0.027429040521383286, "rewards/rejected": -0.014491640031337738, "step": 1440 }, { "epoch": 0.76, "learning_rate": 4.6526528496429606e-07, "logits/chosen": 0.3130524456501007, "logits/rejected": 0.2565365433692932, "logps/chosen": -358.9948425292969, "logps/rejected": -256.02764892578125, "loss": 0.6776, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.022862132638692856, "rewards/margins": 0.057194191962480545, "rewards/rejected": -0.03433205932378769, "step": 1450 }, { "epoch": 0.76, "learning_rate": 4.644871637677745e-07, "logits/chosen": 0.33667245507240295, "logits/rejected": 0.28259915113449097, "logps/chosen": -315.0734558105469, "logps/rejected": -243.84463500976562, "loss": 0.68, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.008300681598484516, "rewards/margins": 0.01949361525475979, "rewards/rejected": -0.011192934587597847, "step": 1460 }, { "epoch": 0.77, "learning_rate": 4.637010888813638e-07, "logits/chosen": 0.40810179710388184, "logits/rejected": 0.24761733412742615, "logps/chosen": -432.92425537109375, "logps/rejected": -285.3038330078125, "loss": 0.6774, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.04085806757211685, "rewards/margins": 0.07843003422021866, "rewards/rejected": -0.037571974098682404, "step": 1470 }, { "epoch": 0.77, "learning_rate": 4.6290708945457493e-07, "logits/chosen": 0.3285070061683655, "logits/rejected": 0.3192768096923828, "logps/chosen": -324.9331359863281, "logps/rejected": -330.42132568359375, "loss": 0.6777, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00941525213420391, "rewards/margins": 0.016397925093770027, "rewards/rejected": -0.00698267063125968, "step": 1480 }, { "epoch": 0.78, "learning_rate": 4.6210519493077887e-07, "logits/chosen": 0.41080838441848755, "logits/rejected": 0.396353155374527, "logps/chosen": -300.5375061035156, "logps/rejected": -276.6629943847656, "loss": 0.6791, "rewards/accuracies": 0.75, "rewards/chosen": 0.016803177073597908, "rewards/margins": 0.0380239263176918, "rewards/rejected": -0.021220751106739044, "step": 1490 }, { "epoch": 0.78, "learning_rate": 4.6129543504611607e-07, "logits/chosen": 0.31903237104415894, "logits/rejected": 0.35140854120254517, "logps/chosen": -290.30255126953125, "logps/rejected": -328.60443115234375, "loss": 0.6779, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.006040631793439388, "rewards/margins": 0.028903204947710037, "rewards/rejected": -0.022862572222948074, "step": 1500 }, { "epoch": 0.79, "learning_rate": 4.604778398283927e-07, "logits/chosen": 0.3273167014122009, "logits/rejected": 0.2575587034225464, "logps/chosen": -274.8236389160156, "logps/rejected": -232.8859405517578, "loss": 0.6789, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0010220941621810198, "rewards/margins": 0.028309160843491554, "rewards/rejected": -0.029331251978874207, "step": 1510 }, { "epoch": 0.8, "learning_rate": 4.596524395959678e-07, "logits/chosen": 0.4078959822654724, "logits/rejected": 0.3541221618652344, "logps/chosen": -306.6305847167969, "logps/rejected": -306.5215148925781, "loss": 0.6773, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.018020112067461014, "rewards/margins": 0.02500280737876892, "rewards/rejected": -0.006982696708291769, "step": 1520 }, { "epoch": 0.8, "learning_rate": 4.588192649566285e-07, "logits/chosen": 0.26278024911880493, "logits/rejected": 0.31725552678108215, "logps/chosen": -382.01214599609375, "logps/rejected": -314.90850830078125, "loss": 0.6788, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0021470137871801853, "rewards/margins": 0.039449065923690796, "rewards/rejected": -0.04159608483314514, "step": 1530 }, { "epoch": 0.81, "learning_rate": 4.5797834680645553e-07, "logits/chosen": 0.2967289984226227, "logits/rejected": 0.32962074875831604, "logps/chosen": -374.12322998046875, "logps/rejected": -369.6998291015625, "loss": 0.6792, "rewards/accuracies": 0.625, "rewards/chosen": 0.03092324174940586, "rewards/margins": 0.03554985299706459, "rewards/rejected": -0.004626607988029718, "step": 1540 }, { "epoch": 0.81, "learning_rate": 4.5712971632867715e-07, "logits/chosen": 0.4084998071193695, "logits/rejected": 0.417635977268219, "logps/chosen": -211.31640625, "logps/rejected": -183.09400939941406, "loss": 0.6775, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00762952771037817, "rewards/margins": 0.016260989010334015, "rewards/rejected": -0.02389051578938961, "step": 1550 }, { "epoch": 0.82, "learning_rate": 4.562734049925129e-07, "logits/chosen": 0.3083654046058655, "logits/rejected": 0.33990222215652466, "logps/chosen": -249.00332641601562, "logps/rejected": -260.27191162109375, "loss": 0.6801, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.017335115000605583, "rewards/margins": -0.0029615708626806736, "rewards/rejected": -0.014373543672263622, "step": 1560 }, { "epoch": 0.82, "learning_rate": 4.5540944455200663e-07, "logits/chosen": 0.29243311285972595, "logits/rejected": 0.38317304849624634, "logps/chosen": -291.7667541503906, "logps/rejected": -305.8726806640625, "loss": 0.6806, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0014521919656544924, "rewards/margins": 0.01298120804131031, "rewards/rejected": -0.014433401636779308, "step": 1570 }, { "epoch": 0.83, "learning_rate": 4.545378670448492e-07, "logits/chosen": 0.3252048194408417, "logits/rejected": 0.3160475194454193, "logps/chosen": -308.96282958984375, "logps/rejected": -296.4386291503906, "loss": 0.6814, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.011841426603496075, "rewards/margins": 0.03404033184051514, "rewards/rejected": -0.022198904305696487, "step": 1580 }, { "epoch": 0.83, "learning_rate": 4.5365870479119014e-07, "logits/chosen": 0.3502393662929535, "logits/rejected": 0.3742118775844574, "logps/chosen": -343.4779357910156, "logps/rejected": -285.8212890625, "loss": 0.6773, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00474547827616334, "rewards/margins": 0.039683885872364044, "rewards/rejected": -0.034938402473926544, "step": 1590 }, { "epoch": 0.84, "learning_rate": 4.5277199039243917e-07, "logits/chosen": 0.40031924843788147, "logits/rejected": 0.3826026916503906, "logps/chosen": -381.2270812988281, "logps/rejected": -255.69287109375, "loss": 0.6751, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.010327344760298729, "rewards/margins": 0.037659358233213425, "rewards/rejected": -0.027332013472914696, "step": 1600 }, { "epoch": 0.84, "learning_rate": 4.5187775673005744e-07, "logits/chosen": 0.3817320764064789, "logits/rejected": 0.32918184995651245, "logps/chosen": -294.9057922363281, "logps/rejected": -206.7053985595703, "loss": 0.6745, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0099642314016819, "rewards/margins": 0.04299734905362129, "rewards/rejected": -0.03303311765193939, "step": 1610 }, { "epoch": 0.85, "learning_rate": 4.509760369643384e-07, "logits/chosen": 0.3825025260448456, "logits/rejected": 0.42483648657798767, "logps/chosen": -308.7125549316406, "logps/rejected": -309.3365173339844, "loss": 0.6774, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.003090689657256007, "rewards/margins": 0.006784576922655106, "rewards/rejected": -0.009875266812741756, "step": 1620 }, { "epoch": 0.85, "learning_rate": 4.5006686453317734e-07, "logits/chosen": 0.3839607536792755, "logits/rejected": 0.3535715937614441, "logps/chosen": -339.5386047363281, "logps/rejected": -244.91543579101562, "loss": 0.6797, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0017743610078468919, "rewards/margins": 0.03925933688879013, "rewards/rejected": -0.037484973669052124, "step": 1630 }, { "epoch": 0.86, "learning_rate": 4.4915027315083243e-07, "logits/chosen": 0.2991335988044739, "logits/rejected": 0.2891019582748413, "logps/chosen": -304.2152099609375, "logps/rejected": -264.15185546875, "loss": 0.6802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.014160381630063057, "rewards/margins": 0.05216909572482109, "rewards/rejected": -0.038008708506822586, "step": 1640 }, { "epoch": 0.86, "learning_rate": 4.482262968066737e-07, "logits/chosen": 0.32558315992355347, "logits/rejected": 0.33039188385009766, "logps/chosen": -271.199462890625, "logps/rejected": -319.0653381347656, "loss": 0.679, "rewards/accuracies": 0.75, "rewards/chosen": 0.000756078225094825, "rewards/margins": 0.031729746609926224, "rewards/rejected": -0.03097366914153099, "step": 1650 }, { "epoch": 0.87, "learning_rate": 4.4729496976392324e-07, "logits/chosen": 0.34333691000938416, "logits/rejected": 0.4006190299987793, "logps/chosen": -281.2134094238281, "logps/rejected": -294.54595947265625, "loss": 0.6751, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.00261726719327271, "rewards/margins": 0.031192597001791, "rewards/rejected": -0.02857532724738121, "step": 1660 }, { "epoch": 0.87, "learning_rate": 4.463563265583843e-07, "logits/chosen": 0.29203808307647705, "logits/rejected": 0.31420475244522095, "logps/chosen": -298.7978515625, "logps/rejected": -272.68255615234375, "loss": 0.6754, "rewards/accuracies": 0.5, "rewards/chosen": 0.0030163261108100414, "rewards/margins": 0.013660850934684277, "rewards/rejected": -0.010644523426890373, "step": 1670 }, { "epoch": 0.88, "learning_rate": 4.4541040199716063e-07, "logits/chosen": 0.30822497606277466, "logits/rejected": 0.2724303603172302, "logps/chosen": -269.5755920410156, "logps/rejected": -256.20172119140625, "loss": 0.6808, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009109275415539742, "rewards/margins": 0.013653385452926159, "rewards/rejected": -0.022762665525078773, "step": 1680 }, { "epoch": 0.88, "learning_rate": 4.4445723115736587e-07, "logits/chosen": 0.33946898579597473, "logits/rejected": 0.2928759455680847, "logps/chosen": -351.9305419921875, "logps/rejected": -251.12399291992188, "loss": 0.6756, "rewards/accuracies": 0.75, "rewards/chosen": 0.007805541157722473, "rewards/margins": 0.055908508598804474, "rewards/rejected": -0.0481029637157917, "step": 1690 }, { "epoch": 0.89, "learning_rate": 4.434968493848228e-07, "logits/chosen": 0.4504426419734955, "logits/rejected": 0.37384623289108276, "logps/chosen": -277.2381286621094, "logps/rejected": -238.30520629882812, "loss": 0.6752, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.002321247011423111, "rewards/margins": 0.023793473839759827, "rewards/rejected": -0.021472224965691566, "step": 1700 }, { "epoch": 0.89, "learning_rate": 4.425292922927525e-07, "logits/chosen": 0.383585125207901, "logits/rejected": 0.28925397992134094, "logps/chosen": -257.552734375, "logps/rejected": -234.51431274414062, "loss": 0.6798, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.002572057768702507, "rewards/margins": 0.04384605213999748, "rewards/rejected": -0.041273992508649826, "step": 1710 }, { "epoch": 0.9, "learning_rate": 4.41554595760454e-07, "logits/chosen": 0.284583181142807, "logits/rejected": 0.25669267773628235, "logps/chosen": -370.1746826171875, "logps/rejected": -323.588623046875, "loss": 0.6772, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.015700947493314743, "rewards/margins": 0.042133111506700516, "rewards/rejected": -0.026432165876030922, "step": 1720 }, { "epoch": 0.91, "learning_rate": 4.4057279593197326e-07, "logits/chosen": 0.3870350122451782, "logits/rejected": 0.39999377727508545, "logps/chosen": -249.8700714111328, "logps/rejected": -261.95843505859375, "loss": 0.6768, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.001268685213290155, "rewards/margins": 0.021950436756014824, "rewards/rejected": -0.023219123482704163, "step": 1730 }, { "epoch": 0.91, "learning_rate": 4.395839292147637e-07, "logits/chosen": 0.3090742230415344, "logits/rejected": 0.30543801188468933, "logps/chosen": -321.3290100097656, "logps/rejected": -286.99774169921875, "loss": 0.676, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.024974200874567032, "rewards/margins": 0.06467586010694504, "rewards/rejected": -0.039701662957668304, "step": 1740 }, { "epoch": 0.92, "learning_rate": 4.3858803227833526e-07, "logits/chosen": 0.24626950919628143, "logits/rejected": 0.24712149798870087, "logps/chosen": -372.52020263671875, "logps/rejected": -345.7261657714844, "loss": 0.6744, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.021267516538500786, "rewards/margins": 0.05029396340250969, "rewards/rejected": -0.029026448726654053, "step": 1750 }, { "epoch": 0.92, "learning_rate": 4.375851420528951e-07, "logits/chosen": 0.32322531938552856, "logits/rejected": 0.34263914823532104, "logps/chosen": -288.7018737792969, "logps/rejected": -257.0304260253906, "loss": 0.6752, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0024817243684083223, "rewards/margins": 0.02616637572646141, "rewards/rejected": -0.028648102656006813, "step": 1760 }, { "epoch": 0.93, "learning_rate": 4.36575295727978e-07, "logits/chosen": 0.38421711325645447, "logits/rejected": 0.41225776076316833, "logps/chosen": -309.842529296875, "logps/rejected": -314.6719665527344, "loss": 0.6796, "rewards/accuracies": 0.75, "rewards/chosen": 0.009599078446626663, "rewards/margins": 0.052566200494766235, "rewards/rejected": -0.042967118322849274, "step": 1770 }, { "epoch": 0.93, "learning_rate": 4.355585307510675e-07, "logits/chosen": 0.391152948141098, "logits/rejected": 0.35352757573127747, "logps/chosen": -262.3291320800781, "logps/rejected": -261.0271301269531, "loss": 0.6768, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.017994780093431473, "rewards/margins": 0.007160873152315617, "rewards/rejected": -0.025155652314424515, "step": 1780 }, { "epoch": 0.94, "learning_rate": 4.345348848262068e-07, "logits/chosen": 0.36389559507369995, "logits/rejected": 0.39531487226486206, "logps/chosen": -333.47760009765625, "logps/rejected": -261.5639953613281, "loss": 0.6794, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.007003250531852245, "rewards/margins": 0.015884755179286003, "rewards/rejected": -0.022888006642460823, "step": 1790 }, { "epoch": 0.94, "learning_rate": 4.33504395912601e-07, "logits/chosen": 0.28872233629226685, "logits/rejected": 0.25540515780448914, "logps/chosen": -337.00360107421875, "logps/rejected": -328.2117004394531, "loss": 0.6764, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0031368457712233067, "rewards/margins": 0.034987322986125946, "rewards/rejected": -0.03185047581791878, "step": 1800 }, { "epoch": 0.95, "learning_rate": 4.324671022232095e-07, "logits/chosen": 0.2700476050376892, "logits/rejected": 0.34975817799568176, "logps/chosen": -226.2433319091797, "logps/rejected": -245.605712890625, "loss": 0.6731, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.019788941368460655, "rewards/margins": 0.02122773975133896, "rewards/rejected": -0.04101668298244476, "step": 1810 }, { "epoch": 0.95, "learning_rate": 4.314230422233286e-07, "logits/chosen": 0.367465615272522, "logits/rejected": 0.34759631752967834, "logps/chosen": -327.2635498046875, "logps/rejected": -291.80902099609375, "loss": 0.671, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.021458884701132774, "rewards/margins": 0.05451428145170212, "rewards/rejected": -0.033055394887924194, "step": 1820 }, { "epoch": 0.96, "learning_rate": 4.303722546291655e-07, "logits/chosen": 0.2959325611591339, "logits/rejected": 0.25788697600364685, "logps/chosen": -288.8806457519531, "logps/rejected": -267.59320068359375, "loss": 0.6719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.005892341025173664, "rewards/margins": 0.03235545754432678, "rewards/rejected": -0.03824779763817787, "step": 1830 }, { "epoch": 0.96, "learning_rate": 4.2931477840640243e-07, "logits/chosen": 0.30159324407577515, "logits/rejected": 0.324674516916275, "logps/chosen": -312.8402404785156, "logps/rejected": -270.1824645996094, "loss": 0.6732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.010435075499117374, "rewards/margins": 0.05785765126347542, "rewards/rejected": -0.04742256924510002, "step": 1840 }, { "epoch": 0.97, "learning_rate": 4.282506527687517e-07, "logits/chosen": 0.5027902126312256, "logits/rejected": 0.5105705261230469, "logps/chosen": -254.4239959716797, "logps/rejected": -224.8533477783203, "loss": 0.6752, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.01124303974211216, "rewards/margins": 0.059338875114917755, "rewards/rejected": -0.048095833510160446, "step": 1850 }, { "epoch": 0.97, "learning_rate": 4.271799171765016e-07, "logits/chosen": 0.37145090103149414, "logits/rejected": 0.4021398425102234, "logps/chosen": -241.52194213867188, "logps/rejected": -235.2782440185547, "loss": 0.6694, "rewards/accuracies": 0.625, "rewards/chosen": 0.0031918413005769253, "rewards/margins": 0.042500268667936325, "rewards/rejected": -0.03930842876434326, "step": 1860 }, { "epoch": 0.98, "learning_rate": 4.2610261133505323e-07, "logits/chosen": 0.36971864104270935, "logits/rejected": 0.303048312664032, "logps/chosen": -361.45941162109375, "logps/rejected": -338.87738037109375, "loss": 0.6722, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.017358357086777687, "rewards/margins": 0.05369344353675842, "rewards/rejected": -0.036335088312625885, "step": 1870 }, { "epoch": 0.98, "learning_rate": 4.250187751934479e-07, "logits/chosen": 0.22648346424102783, "logits/rejected": 0.2450123280286789, "logps/chosen": -312.0504150390625, "logps/rejected": -309.24127197265625, "loss": 0.6723, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006947515066713095, "rewards/margins": 0.025908803567290306, "rewards/rejected": -0.03285632282495499, "step": 1880 }, { "epoch": 0.99, "learning_rate": 4.2392844894288605e-07, "logits/chosen": 0.36955493688583374, "logits/rejected": 0.37511900067329407, "logps/chosen": -281.3575134277344, "logps/rejected": -293.9774169921875, "loss": 0.6748, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.008413514122366905, "rewards/margins": 0.05240880325436592, "rewards/rejected": -0.060822319239377975, "step": 1890 }, { "epoch": 0.99, "learning_rate": 4.2283167301523634e-07, "logits/chosen": 0.380458801984787, "logits/rejected": 0.36902934312820435, "logps/chosen": -254.7010498046875, "logps/rejected": -220.5850372314453, "loss": 0.677, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01337057538330555, "rewards/margins": 0.040344201028347015, "rewards/rejected": -0.05371478199958801, "step": 1900 }, { "epoch": 1.0, "learning_rate": 4.217284880815369e-07, "logits/chosen": 0.2915937304496765, "logits/rejected": 0.28769439458847046, "logps/chosen": -311.2845458984375, "logps/rejected": -284.9055480957031, "loss": 0.6727, "rewards/accuracies": 0.625, "rewards/chosen": 0.008056707680225372, "rewards/margins": 0.026231324300169945, "rewards/rejected": -0.01817461848258972, "step": 1910 }, { "epoch": 1.0, "learning_rate": 4.2061893505048694e-07, "logits/chosen": 0.4433872103691101, "logits/rejected": 0.5107973217964172, "logps/chosen": -273.775390625, "logps/rejected": -284.90509033203125, "loss": 0.6723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0016942888032644987, "rewards/margins": 0.04829670116305351, "rewards/rejected": -0.04999098926782608, "step": 1920 }, { "epoch": 1.01, "learning_rate": 4.1950305506692967e-07, "logits/chosen": 0.38846421241760254, "logits/rejected": 0.38444000482559204, "logps/chosen": -299.87896728515625, "logps/rejected": -298.85369873046875, "loss": 0.6756, "rewards/accuracies": 0.75, "rewards/chosen": 0.0038868593983352184, "rewards/margins": 0.0499236099421978, "rewards/rejected": -0.046036750078201294, "step": 1930 }, { "epoch": 1.02, "learning_rate": 4.1838088951032656e-07, "logits/chosen": 0.25646716356277466, "logits/rejected": 0.2298089563846588, "logps/chosen": -345.03997802734375, "logps/rejected": -319.909912109375, "loss": 0.6709, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.00014434941112995148, "rewards/margins": 0.06486411392688751, "rewards/rejected": -0.06500846892595291, "step": 1940 }, { "epoch": 1.02, "learning_rate": 4.172524799932231e-07, "logits/chosen": 0.4529836177825928, "logits/rejected": 0.4203832745552063, "logps/chosen": -221.33285522460938, "logps/rejected": -218.61892700195312, "loss": 0.673, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.016625165939331055, "rewards/margins": 0.043132197111845016, "rewards/rejected": -0.05975737050175667, "step": 1950 }, { "epoch": 1.03, "learning_rate": 4.161178683597054e-07, "logits/chosen": 0.3584556579589844, "logits/rejected": 0.36893972754478455, "logps/chosen": -299.40118408203125, "logps/rejected": -290.5531921386719, "loss": 0.6748, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01853383705019951, "rewards/margins": 0.02997218444943428, "rewards/rejected": -0.04850602149963379, "step": 1960 }, { "epoch": 1.03, "learning_rate": 4.1497709668384885e-07, "logits/chosen": 0.26903319358825684, "logits/rejected": 0.2895038425922394, "logps/chosen": -355.22833251953125, "logps/rejected": -334.9042663574219, "loss": 0.6753, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0025704544968903065, "rewards/margins": 0.03809639438986778, "rewards/rejected": -0.03552594035863876, "step": 1970 }, { "epoch": 1.04, "learning_rate": 4.1383020726815745e-07, "logits/chosen": 0.30599287152290344, "logits/rejected": 0.19721153378486633, "logps/chosen": -380.2153015136719, "logps/rejected": -280.277587890625, "loss": 0.6702, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.004432808607816696, "rewards/margins": 0.04755949229001999, "rewards/rejected": -0.043126679956912994, "step": 1980 }, { "epoch": 1.04, "learning_rate": 4.126772426419959e-07, "logits/chosen": 0.40132278203964233, "logits/rejected": 0.3763146698474884, "logps/chosen": -268.63140869140625, "logps/rejected": -236.5342254638672, "loss": 0.6784, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01728612184524536, "rewards/margins": 0.02777310088276863, "rewards/rejected": -0.045059219002723694, "step": 1990 }, { "epoch": 1.05, "learning_rate": 4.1151824556001145e-07, "logits/chosen": 0.3457508683204651, "logits/rejected": 0.30099207162857056, "logps/chosen": -280.82720947265625, "logps/rejected": -264.7738342285156, "loss": 0.6737, "rewards/accuracies": 0.5, "rewards/chosen": -0.02697814628481865, "rewards/margins": 0.02771422266960144, "rewards/rejected": -0.05469236522912979, "step": 2000 }, { "epoch": 1.05, "learning_rate": 4.103532590005495e-07, "logits/chosen": 0.40293654799461365, "logits/rejected": 0.33411556482315063, "logps/chosen": -299.47198486328125, "logps/rejected": -236.7769012451172, "loss": 0.6698, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02038879506289959, "rewards/margins": 0.06998451054096222, "rewards/rejected": -0.049595706164836884, "step": 2010 }, { "epoch": 1.06, "learning_rate": 4.091823261640592e-07, "logits/chosen": 0.39449697732925415, "logits/rejected": 0.2622838616371155, "logps/chosen": -361.0088806152344, "logps/rejected": -254.7809295654297, "loss": 0.671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0016580044757574797, "rewards/margins": 0.05266914889216423, "rewards/rejected": -0.05432716757059097, "step": 2020 }, { "epoch": 1.06, "learning_rate": 4.080054904714917e-07, "logits/chosen": 0.45272356271743774, "logits/rejected": 0.3646053969860077, "logps/chosen": -290.106689453125, "logps/rejected": -281.1361083984375, "loss": 0.6713, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.013397174887359142, "rewards/margins": 0.034534044563770294, "rewards/rejected": -0.04793121665716171, "step": 2030 }, { "epoch": 1.07, "learning_rate": 4.0682279556268993e-07, "logits/chosen": 0.3366960883140564, "logits/rejected": 0.29935771226882935, "logps/chosen": -343.77142333984375, "logps/rejected": -292.15960693359375, "loss": 0.6702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0014214410912245512, "rewards/margins": 0.039344437420368195, "rewards/rejected": -0.037922997027635574, "step": 2040 }, { "epoch": 1.07, "learning_rate": 4.056342852947706e-07, "logits/chosen": 0.4113841950893402, "logits/rejected": 0.36100929975509644, "logps/chosen": -304.214599609375, "logps/rejected": -267.27294921875, "loss": 0.6753, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0162814911454916, "rewards/margins": 0.03878726810216904, "rewards/rejected": -0.05506875365972519, "step": 2050 }, { "epoch": 1.08, "learning_rate": 4.044400037404973e-07, "logits/chosen": 0.2680433392524719, "logits/rejected": 0.23881573975086212, "logps/chosen": -247.9770050048828, "logps/rejected": -265.25341796875, "loss": 0.6702, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01128520630300045, "rewards/margins": 0.043352216482162476, "rewards/rejected": -0.054637424647808075, "step": 2060 }, { "epoch": 1.08, "learning_rate": 4.032399951866468e-07, "logits/chosen": 0.3386828303337097, "logits/rejected": 0.37686488032341003, "logps/chosen": -260.5417175292969, "logps/rejected": -324.646484375, "loss": 0.6783, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.025614675134420395, "rewards/margins": 0.016095593571662903, "rewards/rejected": -0.041710264980793, "step": 2070 }, { "epoch": 1.09, "learning_rate": 4.0203430413236637e-07, "logits/chosen": 0.3427388072013855, "logits/rejected": 0.2969673275947571, "logps/chosen": -331.38677978515625, "logps/rejected": -326.67315673828125, "loss": 0.6724, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.006624752189964056, "rewards/margins": 0.06880460679531097, "rewards/rejected": -0.07542935013771057, "step": 2080 }, { "epoch": 1.09, "learning_rate": 4.0082297528752407e-07, "logits/chosen": 0.3527575135231018, "logits/rejected": 0.3978745639324188, "logps/chosen": -256.7491760253906, "logps/rejected": -277.7139587402344, "loss": 0.6678, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.04201974719762802, "rewards/margins": 0.015251986682415009, "rewards/rejected": -0.05727173015475273, "step": 2090 }, { "epoch": 1.1, "learning_rate": 3.9960605357105e-07, "logits/chosen": 0.29884204268455505, "logits/rejected": 0.21345266699790955, "logps/chosen": -328.6796875, "logps/rejected": -261.64208984375, "loss": 0.6663, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0030146692879498005, "rewards/margins": 0.06451131403446198, "rewards/rejected": -0.06149665638804436, "step": 2100 }, { "epoch": 1.1, "learning_rate": 3.983835841092716e-07, "logits/chosen": 0.23722746968269348, "logits/rejected": 0.23029477894306183, "logps/chosen": -390.21002197265625, "logps/rejected": -309.8589782714844, "loss": 0.6654, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.006493761204183102, "rewards/margins": 0.09093605726957321, "rewards/rejected": -0.08444229513406754, "step": 2110 }, { "epoch": 1.11, "learning_rate": 3.971556122342398e-07, "logits/chosen": 0.4192166328430176, "logits/rejected": 0.4076710343360901, "logps/chosen": -287.0326232910156, "logps/rejected": -246.2929229736328, "loss": 0.67, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.022333037108182907, "rewards/margins": 0.04057054594159126, "rewards/rejected": -0.06290359050035477, "step": 2120 }, { "epoch": 1.11, "learning_rate": 3.9592218348204766e-07, "logits/chosen": 0.3088974356651306, "logits/rejected": 0.2831978499889374, "logps/chosen": -273.89495849609375, "logps/rejected": -264.4192810058594, "loss": 0.6762, "rewards/accuracies": 0.625, "rewards/chosen": -0.020427577197551727, "rewards/margins": 0.03233342617750168, "rewards/rejected": -0.05276099964976311, "step": 2130 }, { "epoch": 1.12, "learning_rate": 3.946833435911423e-07, "logits/chosen": 0.3843507766723633, "logits/rejected": 0.39491331577301025, "logps/chosen": -282.49163818359375, "logps/rejected": -212.6112060546875, "loss": 0.6677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.013249019160866737, "rewards/margins": 0.09190802276134491, "rewards/rejected": -0.07865899056196213, "step": 2140 }, { "epoch": 1.13, "learning_rate": 3.9343913850062856e-07, "logits/chosen": 0.2880811095237732, "logits/rejected": 0.2723557949066162, "logps/chosen": -298.74163818359375, "logps/rejected": -255.66567993164062, "loss": 0.6739, "rewards/accuracies": 0.625, "rewards/chosen": -0.027912423014640808, "rewards/margins": 0.040033094584941864, "rewards/rejected": -0.06794553250074387, "step": 2150 }, { "epoch": 1.13, "learning_rate": 3.921896143485657e-07, "logits/chosen": 0.2881740927696228, "logits/rejected": 0.3070821762084961, "logps/chosen": -277.80462646484375, "logps/rejected": -267.8509521484375, "loss": 0.6694, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03029860183596611, "rewards/margins": 0.03809525817632675, "rewards/rejected": -0.06839386373758316, "step": 2160 }, { "epoch": 1.14, "learning_rate": 3.9093481747025615e-07, "logits/chosen": 0.36197030544281006, "logits/rejected": 0.33939141035079956, "logps/chosen": -365.284423828125, "logps/rejected": -327.8709716796875, "loss": 0.6669, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02871832624077797, "rewards/margins": 0.041896142065525055, "rewards/rejected": -0.07061446458101273, "step": 2170 }, { "epoch": 1.14, "learning_rate": 3.896747943965275e-07, "logits/chosen": 0.28229203820228577, "logits/rejected": 0.3258668780326843, "logps/chosen": -254.9258270263672, "logps/rejected": -258.52142333984375, "loss": 0.6717, "rewards/accuracies": 0.75, "rewards/chosen": -0.023102175444364548, "rewards/margins": 0.06240806728601456, "rewards/rejected": -0.0855102464556694, "step": 2180 }, { "epoch": 1.15, "learning_rate": 3.8840959185200717e-07, "logits/chosen": 0.2814302146434784, "logits/rejected": 0.30456703901290894, "logps/chosen": -275.612060546875, "logps/rejected": -282.63909912109375, "loss": 0.6693, "rewards/accuracies": 0.5, "rewards/chosen": -0.01812458410859108, "rewards/margins": 0.020614150911569595, "rewards/rejected": -0.038738735020160675, "step": 2190 }, { "epoch": 1.15, "learning_rate": 3.871392567533893e-07, "logits/chosen": 0.37268248200416565, "logits/rejected": 0.3728785514831543, "logps/chosen": -278.0909423828125, "logps/rejected": -248.9449920654297, "loss": 0.6741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03167051449418068, "rewards/margins": 0.046841494739055634, "rewards/rejected": -0.07851200550794601, "step": 2200 }, { "epoch": 1.16, "learning_rate": 3.858638362076953e-07, "logits/chosen": 0.3451462984085083, "logits/rejected": 0.3002368211746216, "logps/chosen": -288.4288330078125, "logps/rejected": -265.7797546386719, "loss": 0.6735, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03316992148756981, "rewards/margins": 0.03865870088338852, "rewards/rejected": -0.07182861864566803, "step": 2210 }, { "epoch": 1.16, "learning_rate": 3.845833775105272e-07, "logits/chosen": 0.36414092779159546, "logits/rejected": 0.356741726398468, "logps/chosen": -326.46490478515625, "logps/rejected": -294.9678039550781, "loss": 0.6683, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.020617762580513954, "rewards/margins": 0.0373426154255867, "rewards/rejected": -0.05796036869287491, "step": 2220 }, { "epoch": 1.17, "learning_rate": 3.832979281443133e-07, "logits/chosen": 0.31907418370246887, "logits/rejected": 0.29716235399246216, "logps/chosen": -337.14874267578125, "logps/rejected": -303.4570007324219, "loss": 0.6727, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.014699602499604225, "rewards/margins": 0.035601574927568436, "rewards/rejected": -0.05030117556452751, "step": 2230 }, { "epoch": 1.17, "learning_rate": 3.8200753577654765e-07, "logits/chosen": 0.33600661158561707, "logits/rejected": 0.3653258681297302, "logps/chosen": -278.1409912109375, "logps/rejected": -219.810791015625, "loss": 0.6669, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.049881428480148315, "rewards/margins": 0.025163423269987106, "rewards/rejected": -0.07504484802484512, "step": 2240 }, { "epoch": 1.18, "learning_rate": 3.8071224825802273e-07, "logits/chosen": 0.30984312295913696, "logits/rejected": 0.32006576657295227, "logps/chosen": -266.3006591796875, "logps/rejected": -258.7674560546875, "loss": 0.6643, "rewards/accuracies": 0.625, "rewards/chosen": -0.022308386862277985, "rewards/margins": 0.03970428183674812, "rewards/rejected": -0.062012672424316406, "step": 2250 }, { "epoch": 1.18, "learning_rate": 3.7941211362105453e-07, "logits/chosen": 0.3602008819580078, "logits/rejected": 0.36997145414352417, "logps/chosen": -377.5152893066406, "logps/rejected": -342.37744140625, "loss": 0.6676, "rewards/accuracies": 0.625, "rewards/chosen": 0.013790899887681007, "rewards/margins": 0.059584714472293854, "rewards/rejected": -0.0457938127219677, "step": 2260 }, { "epoch": 1.19, "learning_rate": 3.781071800777017e-07, "logits/chosen": 0.42406994104385376, "logits/rejected": 0.406341552734375, "logps/chosen": -289.2626037597656, "logps/rejected": -255.7004852294922, "loss": 0.6661, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04291009157896042, "rewards/margins": 0.031034070998430252, "rewards/rejected": -0.07394416630268097, "step": 2270 }, { "epoch": 1.19, "learning_rate": 3.767974960179776e-07, "logits/chosen": 0.38467639684677124, "logits/rejected": 0.36330491304397583, "logps/chosen": -376.39801025390625, "logps/rejected": -299.58599853515625, "loss": 0.6696, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02126261219382286, "rewards/margins": 0.0888260155916214, "rewards/rejected": -0.06756339967250824, "step": 2280 }, { "epoch": 1.2, "learning_rate": 3.7548311000805605e-07, "logits/chosen": 0.383809894323349, "logits/rejected": 0.3646061420440674, "logps/chosen": -299.8836975097656, "logps/rejected": -254.42153930664062, "loss": 0.6699, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.025484036654233932, "rewards/margins": 0.05899130553007126, "rewards/rejected": -0.08447533845901489, "step": 2290 }, { "epoch": 1.2, "learning_rate": 3.7416407078847015e-07, "logits/chosen": 0.39214324951171875, "logits/rejected": 0.39261943101882935, "logps/chosen": -319.5709533691406, "logps/rejected": -287.7979736328125, "loss": 0.6672, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.016738515347242355, "rewards/margins": 0.06187937781214714, "rewards/rejected": -0.0786178931593895, "step": 2300 }, { "epoch": 1.21, "learning_rate": 3.7284042727230506e-07, "logits/chosen": 0.3887889087200165, "logits/rejected": 0.3705871105194092, "logps/chosen": -288.55438232421875, "logps/rejected": -297.3534240722656, "loss": 0.6688, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.011513590812683105, "rewards/margins": 0.06126584857702255, "rewards/rejected": -0.07277944684028625, "step": 2310 }, { "epoch": 1.21, "learning_rate": 3.7151222854338413e-07, "logits/chosen": 0.36249834299087524, "logits/rejected": 0.44951605796813965, "logps/chosen": -264.47503662109375, "logps/rejected": -288.58367919921875, "loss": 0.6675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.019437994807958603, "rewards/margins": 0.05599921941757202, "rewards/rejected": -0.07543721050024033, "step": 2320 }, { "epoch": 1.22, "learning_rate": 3.701795238544488e-07, "logits/chosen": 0.3534383773803711, "logits/rejected": 0.35369449853897095, "logps/chosen": -345.45233154296875, "logps/rejected": -266.8211364746094, "loss": 0.6697, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.011401348747313023, "rewards/margins": 0.04326106235384941, "rewards/rejected": -0.05466241389513016, "step": 2330 }, { "epoch": 1.22, "learning_rate": 3.688423626253318e-07, "logits/chosen": 0.4797401428222656, "logits/rejected": 0.3779059648513794, "logps/chosen": -360.5686950683594, "logps/rejected": -247.0854034423828, "loss": 0.6667, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.023860588669776917, "rewards/margins": 0.0794132724404335, "rewards/rejected": -0.10327385365962982, "step": 2340 }, { "epoch": 1.23, "learning_rate": 3.675007944411253e-07, "logits/chosen": 0.33330869674682617, "logits/rejected": 0.19414468109607697, "logps/chosen": -334.6083068847656, "logps/rejected": -248.15170288085938, "loss": 0.6592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.01777069643139839, "rewards/margins": 0.08200386166572571, "rewards/rejected": -0.0997745618224144, "step": 2350 }, { "epoch": 1.23, "learning_rate": 3.6615486905034167e-07, "logits/chosen": 0.29605159163475037, "logits/rejected": 0.2758367359638214, "logps/chosen": -312.92071533203125, "logps/rejected": -266.828369140625, "loss": 0.6672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.034411195665597916, "rewards/margins": 0.03460158035159111, "rewards/rejected": -0.06901277601718903, "step": 2360 }, { "epoch": 1.24, "learning_rate": 3.6480463636306846e-07, "logits/chosen": 0.31986507773399353, "logits/rejected": 0.27656489610671997, "logps/chosen": -377.6355895996094, "logps/rejected": -301.74896240234375, "loss": 0.6634, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01851150207221508, "rewards/margins": 0.0832141637802124, "rewards/rejected": -0.10172567516565323, "step": 2370 }, { "epoch": 1.25, "learning_rate": 3.634501464491183e-07, "logits/chosen": 0.37213101983070374, "logits/rejected": 0.33602914214134216, "logps/chosen": -316.6075439453125, "logps/rejected": -262.6651611328125, "loss": 0.6642, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01945626363158226, "rewards/margins": 0.058711398392915726, "rewards/rejected": -0.07816765457391739, "step": 2380 }, { "epoch": 1.25, "learning_rate": 3.6209144953617175e-07, "logits/chosen": 0.3521496653556824, "logits/rejected": 0.3292858898639679, "logps/chosen": -352.98388671875, "logps/rejected": -309.53057861328125, "loss": 0.6659, "rewards/accuracies": 0.75, "rewards/chosen": 0.02008138597011566, "rewards/margins": 0.06914113461971283, "rewards/rejected": -0.04905973747372627, "step": 2390 }, { "epoch": 1.26, "learning_rate": 3.607285960079146e-07, "logits/chosen": 0.3754183351993561, "logits/rejected": 0.30044883489608765, "logps/chosen": -285.4665832519531, "logps/rejected": -233.817138671875, "loss": 0.6676, "rewards/accuracies": 0.625, "rewards/chosen": -0.03596179559826851, "rewards/margins": 0.05098012834787369, "rewards/rejected": -0.08694193512201309, "step": 2400 }, { "epoch": 1.26, "learning_rate": 3.593616364021701e-07, "logits/chosen": 0.33115464448928833, "logits/rejected": 0.3573494255542755, "logps/chosen": -217.48855590820312, "logps/rejected": -243.66287231445312, "loss": 0.6665, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.042571354657411575, "rewards/margins": 0.040859851986169815, "rewards/rejected": -0.0834311991930008, "step": 2410 }, { "epoch": 1.27, "learning_rate": 3.5799062140902413e-07, "logits/chosen": 0.29329270124435425, "logits/rejected": 0.3528839945793152, "logps/chosen": -254.28964233398438, "logps/rejected": -271.6838684082031, "loss": 0.6671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.011812356300652027, "rewards/margins": 0.07317445427179337, "rewards/rejected": -0.08498681336641312, "step": 2420 }, { "epoch": 1.27, "learning_rate": 3.566156018689462e-07, "logits/chosen": 0.3424796760082245, "logits/rejected": 0.3287104666233063, "logps/chosen": -383.9771423339844, "logps/rejected": -304.5723876953125, "loss": 0.6749, "rewards/accuracies": 0.625, "rewards/chosen": -0.019487058743834496, "rewards/margins": 0.059456080198287964, "rewards/rejected": -0.07894313335418701, "step": 2430 }, { "epoch": 1.28, "learning_rate": 3.552366287709038e-07, "logits/chosen": 0.35111716389656067, "logits/rejected": 0.34353193640708923, "logps/chosen": -374.93115234375, "logps/rejected": -317.59686279296875, "loss": 0.6677, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.004710751585662365, "rewards/margins": 0.08451598882675171, "rewards/rejected": -0.08922673761844635, "step": 2440 }, { "epoch": 1.28, "learning_rate": 3.5385375325047163e-07, "logits/chosen": 0.38372933864593506, "logits/rejected": 0.38185805082321167, "logps/chosen": -289.7801818847656, "logps/rejected": -228.0769805908203, "loss": 0.6671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06441032141447067, "rewards/margins": 0.028500813990831375, "rewards/rejected": -0.09291113913059235, "step": 2450 }, { "epoch": 1.29, "learning_rate": 3.524670265879353e-07, "logits/chosen": 0.2816036343574524, "logits/rejected": 0.28083696961402893, "logps/chosen": -371.5879821777344, "logps/rejected": -354.3928527832031, "loss": 0.6673, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.014490177854895592, "rewards/margins": 0.08113773912191391, "rewards/rejected": -0.09562792629003525, "step": 2460 }, { "epoch": 1.29, "learning_rate": 3.510765002063901e-07, "logits/chosen": 0.3037402629852295, "logits/rejected": 0.32839471101760864, "logps/chosen": -307.32916259765625, "logps/rejected": -284.83636474609375, "loss": 0.6696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0401952862739563, "rewards/margins": 0.05480067804455757, "rewards/rejected": -0.09499596059322357, "step": 2470 }, { "epoch": 1.3, "learning_rate": 3.4968222566983367e-07, "logits/chosen": 0.28994929790496826, "logits/rejected": 0.3284696936607361, "logps/chosen": -280.9746398925781, "logps/rejected": -247.8575439453125, "loss": 0.668, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08172313123941422, "rewards/margins": 0.007884040474891663, "rewards/rejected": -0.08960716426372528, "step": 2480 }, { "epoch": 1.3, "learning_rate": 3.482842546812543e-07, "logits/chosen": 0.35081690549850464, "logits/rejected": 0.324882447719574, "logps/chosen": -347.78057861328125, "logps/rejected": -286.3685607910156, "loss": 0.6664, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.04719354957342148, "rewards/margins": 0.06654486060142517, "rewards/rejected": -0.11373841762542725, "step": 2490 }, { "epoch": 1.31, "learning_rate": 3.4688263908071307e-07, "logits/chosen": 0.2330433577299118, "logits/rejected": 0.26575979590415955, "logps/chosen": -333.82049560546875, "logps/rejected": -299.0555725097656, "loss": 0.666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03410058468580246, "rewards/margins": 0.054598551243543625, "rewards/rejected": -0.08869913965463638, "step": 2500 }, { "epoch": 1.31, "learning_rate": 3.454774308434222e-07, "logits/chosen": 0.39580804109573364, "logits/rejected": 0.4053748548030853, "logps/chosen": -269.8752746582031, "logps/rejected": -264.6448059082031, "loss": 0.6671, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.029624611139297485, "rewards/margins": 0.06283847242593765, "rewards/rejected": -0.09246308356523514, "step": 2510 }, { "epoch": 1.32, "learning_rate": 3.4406868207781725e-07, "logits/chosen": 0.30161893367767334, "logits/rejected": 0.325286328792572, "logps/chosen": -264.8555908203125, "logps/rejected": -299.9549255371094, "loss": 0.6686, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.06190790608525276, "rewards/margins": 0.022210311144590378, "rewards/rejected": -0.08411821722984314, "step": 2520 }, { "epoch": 1.32, "learning_rate": 3.426564450236249e-07, "logits/chosen": 0.3033444881439209, "logits/rejected": 0.338754266500473, "logps/chosen": -296.7602233886719, "logps/rejected": -301.8728942871094, "loss": 0.6655, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04776741936802864, "rewards/margins": 0.041456062346696854, "rewards/rejected": -0.08922348916530609, "step": 2530 }, { "epoch": 1.33, "learning_rate": 3.4124077204992576e-07, "logits/chosen": 0.32950612902641296, "logits/rejected": 0.3448053002357483, "logps/chosen": -285.41912841796875, "logps/rejected": -230.43820190429688, "loss": 0.6621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.006403304636478424, "rewards/margins": 0.08996753394603729, "rewards/rejected": -0.08356423676013947, "step": 2540 }, { "epoch": 1.33, "learning_rate": 3.398217156532125e-07, "logits/chosen": 0.3072153627872467, "logits/rejected": 0.3017449676990509, "logps/chosen": -314.2388916015625, "logps/rejected": -291.82501220703125, "loss": 0.6642, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.009415028616786003, "rewards/margins": 0.09692169725894928, "rewards/rejected": -0.08750666677951813, "step": 2550 }, { "epoch": 1.34, "learning_rate": 3.383993284554431e-07, "logits/chosen": 0.2225678414106369, "logits/rejected": 0.22605307400226593, "logps/chosen": -339.57342529296875, "logps/rejected": -302.07830810546875, "loss": 0.6687, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.015412566252052784, "rewards/margins": 0.05832207202911377, "rewards/rejected": -0.07373463362455368, "step": 2560 }, { "epoch": 1.34, "learning_rate": 3.3697366320208955e-07, "logits/chosen": 0.2920198142528534, "logits/rejected": 0.3342500627040863, "logps/chosen": -296.05169677734375, "logps/rejected": -315.5035705566406, "loss": 0.6687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03576388955116272, "rewards/margins": 0.07963220775127411, "rewards/rejected": -0.11539609730243683, "step": 2570 }, { "epoch": 1.35, "learning_rate": 3.355447727601816e-07, "logits/chosen": 0.3131474554538727, "logits/rejected": 0.35242384672164917, "logps/chosen": -318.4373474121094, "logps/rejected": -317.1743469238281, "loss": 0.6656, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0381944365799427, "rewards/margins": 0.0370551198720932, "rewards/rejected": -0.0752495601773262, "step": 2580 }, { "epoch": 1.36, "learning_rate": 3.3411271011634697e-07, "logits/chosen": 0.40101736783981323, "logits/rejected": 0.34883618354797363, "logps/chosen": -295.4554443359375, "logps/rejected": -307.9974365234375, "loss": 0.6742, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.020811382681131363, "rewards/margins": 0.06216179206967354, "rewards/rejected": -0.0829731673002243, "step": 2590 }, { "epoch": 1.36, "learning_rate": 3.3267752837484587e-07, "logits/chosen": 0.3139980435371399, "logits/rejected": 0.3570219576358795, "logps/chosen": -332.45684814453125, "logps/rejected": -294.8971862792969, "loss": 0.6644, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03061739169061184, "rewards/margins": 0.049742937088012695, "rewards/rejected": -0.08036032319068909, "step": 2600 }, { "epoch": 1.37, "learning_rate": 3.31239280755602e-07, "logits/chosen": 0.37406110763549805, "logits/rejected": 0.3708162307739258, "logps/chosen": -272.1312255859375, "logps/rejected": -225.3962860107422, "loss": 0.6657, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04108922928571701, "rewards/margins": 0.0460977777838707, "rewards/rejected": -0.0871870145201683, "step": 2610 }, { "epoch": 1.37, "learning_rate": 3.2979802059222936e-07, "logits/chosen": 0.30520889163017273, "logits/rejected": 0.3104460835456848, "logps/chosen": -288.23614501953125, "logps/rejected": -305.2883605957031, "loss": 0.6677, "rewards/accuracies": 0.625, "rewards/chosen": -0.060263652354478836, "rewards/margins": 0.018435927107930183, "rewards/rejected": -0.07869957387447357, "step": 2620 }, { "epoch": 1.38, "learning_rate": 3.283538013300537e-07, "logits/chosen": 0.2660229504108429, "logits/rejected": 0.25236591696739197, "logps/chosen": -316.4666442871094, "logps/rejected": -251.6602325439453, "loss": 0.6645, "rewards/accuracies": 0.625, "rewards/chosen": -0.05971134454011917, "rewards/margins": 0.06788526475429535, "rewards/rejected": -0.12759660184383392, "step": 2630 }, { "epoch": 1.38, "learning_rate": 3.269066765241314e-07, "logits/chosen": 0.353706419467926, "logits/rejected": 0.2846869230270386, "logps/chosen": -340.47100830078125, "logps/rejected": -274.4219970703125, "loss": 0.6665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04179629683494568, "rewards/margins": 0.08098026365041733, "rewards/rejected": -0.12277655303478241, "step": 2640 }, { "epoch": 1.39, "learning_rate": 3.254566998372634e-07, "logits/chosen": 0.3207184076309204, "logits/rejected": 0.3243858218193054, "logps/chosen": -291.7104797363281, "logps/rejected": -275.7958068847656, "loss": 0.6687, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.023017197847366333, "rewards/margins": 0.06596306711435318, "rewards/rejected": -0.08898027241230011, "step": 2650 }, { "epoch": 1.39, "learning_rate": 3.2400392503800477e-07, "logits/chosen": 0.2825477719306946, "logits/rejected": 0.30369895696640015, "logps/chosen": -270.70367431640625, "logps/rejected": -311.6441955566406, "loss": 0.6665, "rewards/accuracies": 0.625, "rewards/chosen": -0.054010480642318726, "rewards/margins": 0.06956785917282104, "rewards/rejected": -0.12357833236455917, "step": 2660 }, { "epoch": 1.4, "learning_rate": 3.225484059986715e-07, "logits/chosen": 0.37458476424217224, "logits/rejected": 0.3407798409461975, "logps/chosen": -310.98712158203125, "logps/rejected": -299.9549865722656, "loss": 0.6777, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.05233936384320259, "rewards/margins": 0.025646230205893517, "rewards/rejected": -0.07798559963703156, "step": 2670 }, { "epoch": 1.4, "learning_rate": 3.2109019669334215e-07, "logits/chosen": 0.32417067885398865, "logits/rejected": 0.317020982503891, "logps/chosen": -295.0063171386719, "logps/rejected": -251.1151123046875, "loss": 0.6652, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03081471286714077, "rewards/margins": 0.07520559430122375, "rewards/rejected": -0.10602030903100967, "step": 2680 }, { "epoch": 1.41, "learning_rate": 3.19629351195857e-07, "logits/chosen": 0.3401109576225281, "logits/rejected": 0.33117538690567017, "logps/chosen": -269.0340270996094, "logps/rejected": -289.77703857421875, "loss": 0.663, "rewards/accuracies": 0.625, "rewards/chosen": -0.07183998078107834, "rewards/margins": 0.021650653332471848, "rewards/rejected": -0.09349063783884048, "step": 2690 }, { "epoch": 1.41, "learning_rate": 3.1816592367781236e-07, "logits/chosen": 0.3018365502357483, "logits/rejected": 0.2779509127140045, "logps/chosen": -290.2937927246094, "logps/rejected": -261.4476623535156, "loss": 0.6619, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.046037644147872925, "rewards/margins": 0.06918086111545563, "rewards/rejected": -0.11521850526332855, "step": 2700 }, { "epoch": 1.42, "learning_rate": 3.166999684065521e-07, "logits/chosen": 0.2595480680465698, "logits/rejected": 0.25535058975219727, "logps/chosen": -278.13775634765625, "logps/rejected": -259.5060729980469, "loss": 0.6631, "rewards/accuracies": 0.625, "rewards/chosen": -0.05215588957071304, "rewards/margins": 0.06091712787747383, "rewards/rejected": -0.11307301372289658, "step": 2710 }, { "epoch": 1.42, "learning_rate": 3.1523153974315497e-07, "logits/chosen": 0.3285236954689026, "logits/rejected": 0.25150084495544434, "logps/chosen": -398.76983642578125, "logps/rejected": -276.6143493652344, "loss": 0.658, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.037296637892723083, "rewards/margins": 0.07015573978424072, "rewards/rejected": -0.10745237022638321, "step": 2720 }, { "epoch": 1.43, "learning_rate": 3.137606921404191e-07, "logits/chosen": 0.30357763171195984, "logits/rejected": 0.2153654545545578, "logps/chosen": -385.17486572265625, "logps/rejected": -260.77166748046875, "loss": 0.6598, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.009311881847679615, "rewards/margins": 0.13724280893802643, "rewards/rejected": -0.1279309242963791, "step": 2730 }, { "epoch": 1.43, "learning_rate": 3.1228748014084243e-07, "logits/chosen": 0.4386512339115143, "logits/rejected": 0.42425936460494995, "logps/chosen": -256.3681640625, "logps/rejected": -229.419677734375, "loss": 0.6658, "rewards/accuracies": 0.75, "rewards/chosen": -0.048789579421281815, "rewards/margins": 0.07216767966747284, "rewards/rejected": -0.12095727026462555, "step": 2740 }, { "epoch": 1.44, "learning_rate": 3.108119583746005e-07, "logits/chosen": 0.28032955527305603, "logits/rejected": 0.3129872679710388, "logps/chosen": -233.5014190673828, "logps/rejected": -281.8854064941406, "loss": 0.6639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04254484921693802, "rewards/margins": 0.07571487128734589, "rewards/rejected": -0.11825971305370331, "step": 2750 }, { "epoch": 1.44, "learning_rate": 3.093341815575202e-07, "logits/chosen": 0.37104588747024536, "logits/rejected": 0.3991813063621521, "logps/chosen": -306.2823791503906, "logps/rejected": -334.3895568847656, "loss": 0.6625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03866010531783104, "rewards/margins": 0.0574616901576519, "rewards/rejected": -0.09612180292606354, "step": 2760 }, { "epoch": 1.45, "learning_rate": 3.078542044890513e-07, "logits/chosen": 0.369444876909256, "logits/rejected": 0.41406869888305664, "logps/chosen": -263.76654052734375, "logps/rejected": -248.6498260498047, "loss": 0.6668, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.066460520029068, "rewards/margins": 0.08304537087678909, "rewards/rejected": -0.1495058834552765, "step": 2770 }, { "epoch": 1.45, "learning_rate": 3.0637208205023386e-07, "logits/chosen": 0.3381851315498352, "logits/rejected": 0.36313921213150024, "logps/chosen": -293.7001953125, "logps/rejected": -306.66766357421875, "loss": 0.6625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04342319816350937, "rewards/margins": 0.06256397068500519, "rewards/rejected": -0.10598716884851456, "step": 2780 }, { "epoch": 1.46, "learning_rate": 3.0488786920166343e-07, "logits/chosen": 0.33373400568962097, "logits/rejected": 0.3136371076107025, "logps/chosen": -313.1701354980469, "logps/rejected": -301.2196960449219, "loss": 0.6533, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.04934342950582504, "rewards/margins": 0.09779079258441925, "rewards/rejected": -0.1471342295408249, "step": 2790 }, { "epoch": 1.47, "learning_rate": 3.034016209814529e-07, "logits/chosen": 0.30923396348953247, "logits/rejected": 0.29981285333633423, "logps/chosen": -287.7221374511719, "logps/rejected": -223.39950561523438, "loss": 0.6682, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.04875212907791138, "rewards/margins": 0.08592663705348969, "rewards/rejected": -0.13467876613140106, "step": 2800 }, { "epoch": 1.47, "learning_rate": 3.0191339250319147e-07, "logits/chosen": 0.3353407382965088, "logits/rejected": 0.37832242250442505, "logps/chosen": -333.6568603515625, "logps/rejected": -342.98223876953125, "loss": 0.6533, "rewards/accuracies": 0.75, "rewards/chosen": -0.07310348749160767, "rewards/margins": 0.07849867641925812, "rewards/rejected": -0.1516021341085434, "step": 2810 }, { "epoch": 1.48, "learning_rate": 3.004232389539011e-07, "logits/chosen": 0.2526446282863617, "logits/rejected": 0.2754290699958801, "logps/chosen": -335.6014709472656, "logps/rejected": -305.3747253417969, "loss": 0.6684, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.09115692973136902, "rewards/margins": 0.0300412829965353, "rewards/rejected": -0.12119821459054947, "step": 2820 }, { "epoch": 1.48, "learning_rate": 2.989312155919898e-07, "logits/chosen": 0.27508842945098877, "logits/rejected": 0.22659547626972198, "logps/chosen": -311.8320007324219, "logps/rejected": -304.56805419921875, "loss": 0.6691, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.032169900834560394, "rewards/margins": 0.10568948835134506, "rewards/rejected": -0.13785937428474426, "step": 2830 }, { "epoch": 1.49, "learning_rate": 2.9743737774520266e-07, "logits/chosen": 0.33120667934417725, "logits/rejected": 0.30626198649406433, "logps/chosen": -314.50384521484375, "logps/rejected": -244.17703247070312, "loss": 0.6659, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.059983305633068085, "rewards/margins": 0.0858563557267189, "rewards/rejected": -0.145839661359787, "step": 2840 }, { "epoch": 1.49, "learning_rate": 2.959417808085702e-07, "logits/chosen": 0.25924453139305115, "logits/rejected": 0.25570935010910034, "logps/chosen": -332.52886962890625, "logps/rejected": -291.54901123046875, "loss": 0.6669, "rewards/accuracies": 0.75, "rewards/chosen": -0.04318776726722717, "rewards/margins": 0.07569596916437149, "rewards/rejected": -0.11888374388217926, "step": 2850 }, { "epoch": 1.5, "learning_rate": 2.944444802423542e-07, "logits/chosen": 0.292915940284729, "logits/rejected": 0.2799091339111328, "logps/chosen": -324.13507080078125, "logps/rejected": -310.09613037109375, "loss": 0.6586, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.05468999221920967, "rewards/margins": 0.0883164256811142, "rewards/rejected": -0.14300641417503357, "step": 2860 }, { "epoch": 1.5, "learning_rate": 2.929455315699908e-07, "logits/chosen": 0.34056323766708374, "logits/rejected": 0.31849169731140137, "logps/chosen": -360.6206359863281, "logps/rejected": -268.1399841308594, "loss": 0.6643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04418856278061867, "rewards/margins": 0.1260400414466858, "rewards/rejected": -0.17022861540317535, "step": 2870 }, { "epoch": 1.51, "learning_rate": 2.9144499037603204e-07, "logits/chosen": 0.40575551986694336, "logits/rejected": 0.42678695917129517, "logps/chosen": -291.07928466796875, "logps/rejected": -285.8336181640625, "loss": 0.659, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03479582443833351, "rewards/margins": 0.0920044481754303, "rewards/rejected": -0.1268002688884735, "step": 2880 }, { "epoch": 1.51, "learning_rate": 2.899429123040843e-07, "logits/chosen": 0.32538041472435, "logits/rejected": 0.30334755778312683, "logps/chosen": -294.2937927246094, "logps/rejected": -285.8299865722656, "loss": 0.6629, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07144945859909058, "rewards/margins": 0.07757963240146637, "rewards/rejected": -0.14902909100055695, "step": 2890 }, { "epoch": 1.52, "learning_rate": 2.884393530547452e-07, "logits/chosen": 0.378600537776947, "logits/rejected": 0.2921772301197052, "logps/chosen": -343.437255859375, "logps/rejected": -311.8553466796875, "loss": 0.6584, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.04117577522993088, "rewards/margins": 0.07172152400016785, "rewards/rejected": -0.11289729923009872, "step": 2900 }, { "epoch": 1.52, "learning_rate": 2.869343683835376e-07, "logits/chosen": 0.30608147382736206, "logits/rejected": 0.2674848735332489, "logps/chosen": -358.9638671875, "logps/rejected": -281.90582275390625, "loss": 0.6538, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.023800866678357124, "rewards/margins": 0.11379053443670273, "rewards/rejected": -0.1375913918018341, "step": 2910 }, { "epoch": 1.53, "learning_rate": 2.8542801409884253e-07, "logits/chosen": 0.3522571921348572, "logits/rejected": 0.3938680589199066, "logps/chosen": -267.18597412109375, "logps/rejected": -226.17062377929688, "loss": 0.6623, "rewards/accuracies": 0.625, "rewards/chosen": -0.05746225267648697, "rewards/margins": 0.06825403869152069, "rewards/rejected": -0.12571629881858826, "step": 2920 }, { "epoch": 1.53, "learning_rate": 2.839203460598297e-07, "logits/chosen": 0.25303295254707336, "logits/rejected": 0.308340847492218, "logps/chosen": -384.5491027832031, "logps/rejected": -375.91925048828125, "loss": 0.6652, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.049062587320804596, "rewards/margins": 0.08676549792289734, "rewards/rejected": -0.13582809269428253, "step": 2930 }, { "epoch": 1.54, "learning_rate": 2.8241142017438557e-07, "logits/chosen": 0.36798354983329773, "logits/rejected": 0.3655903935432434, "logps/chosen": -257.49383544921875, "logps/rejected": -294.4346923828125, "loss": 0.6627, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.06181669980287552, "rewards/margins": 0.08136054128408432, "rewards/rejected": -0.14317724108695984, "step": 2940 }, { "epoch": 1.54, "learning_rate": 2.8090129239704083e-07, "logits/chosen": 0.2773851156234741, "logits/rejected": 0.2811713218688965, "logps/chosen": -329.9738464355469, "logps/rejected": -298.5468444824219, "loss": 0.6573, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.06664275377988815, "rewards/margins": 0.059970151633024216, "rewards/rejected": -0.12661293148994446, "step": 2950 }, { "epoch": 1.55, "learning_rate": 2.7939001872689496e-07, "logits/chosen": 0.32175400853157043, "logits/rejected": 0.3031854033470154, "logps/chosen": -328.90771484375, "logps/rejected": -307.1551208496094, "loss": 0.6579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03462161868810654, "rewards/margins": 0.10548686981201172, "rewards/rejected": -0.14010848104953766, "step": 2960 }, { "epoch": 1.55, "learning_rate": 2.778776552055398e-07, "logits/chosen": 0.4104437828063965, "logits/rejected": 0.3513795733451843, "logps/chosen": -294.09381103515625, "logps/rejected": -247.57510375976562, "loss": 0.6591, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04517308250069618, "rewards/margins": 0.10987844318151474, "rewards/rejected": -0.15505151450634003, "step": 2970 }, { "epoch": 1.56, "learning_rate": 2.763642579149817e-07, "logits/chosen": 0.27305012941360474, "logits/rejected": 0.27377820014953613, "logps/chosen": -300.01483154296875, "logps/rejected": -337.39788818359375, "loss": 0.6597, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.047467950731515884, "rewards/margins": 0.05927792191505432, "rewards/rejected": -0.1067458763718605, "step": 2980 }, { "epoch": 1.56, "learning_rate": 2.748498829755615e-07, "logits/chosen": 0.32688483595848083, "logits/rejected": 0.3695555627346039, "logps/chosen": -307.6633605957031, "logps/rejected": -285.3849182128906, "loss": 0.6555, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.06436268240213394, "rewards/margins": 0.08075664192438126, "rewards/rejected": -0.1451193392276764, "step": 2990 }, { "epoch": 1.57, "learning_rate": 2.7333458654387344e-07, "logits/chosen": 0.26363521814346313, "logits/rejected": 0.26629766821861267, "logps/chosen": -343.8567810058594, "logps/rejected": -367.24700927734375, "loss": 0.6641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09303367137908936, "rewards/margins": 0.033451493829488754, "rewards/rejected": -0.12648515403270721, "step": 3000 }, { "epoch": 1.58, "learning_rate": 2.718184248106828e-07, "logits/chosen": 0.3711121380329132, "logits/rejected": 0.3596826195716858, "logps/chosen": -298.93804931640625, "logps/rejected": -271.2373046875, "loss": 0.6632, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05417951941490173, "rewards/margins": 0.0800536572933197, "rewards/rejected": -0.13423319160938263, "step": 3010 }, { "epoch": 1.58, "learning_rate": 2.7030145399884275e-07, "logits/chosen": 0.30993330478668213, "logits/rejected": 0.24670329689979553, "logps/chosen": -302.9925842285156, "logps/rejected": -254.4071807861328, "loss": 0.6577, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10473034530878067, "rewards/margins": 0.03766946122050285, "rewards/rejected": -0.14239981770515442, "step": 3020 }, { "epoch": 1.59, "learning_rate": 2.687837303612085e-07, "logits/chosen": 0.4012434482574463, "logits/rejected": 0.3031303286552429, "logps/chosen": -306.01971435546875, "logps/rejected": -336.9535827636719, "loss": 0.6605, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.06967012584209442, "rewards/margins": 0.10760972648859024, "rewards/rejected": -0.17727985978126526, "step": 3030 }, { "epoch": 1.59, "learning_rate": 2.672653101785519e-07, "logits/chosen": 0.3572728931903839, "logits/rejected": 0.3076297640800476, "logps/chosen": -343.535400390625, "logps/rejected": -291.32000732421875, "loss": 0.655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06969340145587921, "rewards/margins": 0.10455378144979477, "rewards/rejected": -0.17424717545509338, "step": 3040 }, { "epoch": 1.6, "learning_rate": 2.657462497574747e-07, "logits/chosen": 0.3425232172012329, "logits/rejected": 0.31714963912963867, "logps/chosen": -333.19268798828125, "logps/rejected": -256.7529296875, "loss": 0.6586, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.05247022956609726, "rewards/margins": 0.09439842402935028, "rewards/rejected": -0.14686864614486694, "step": 3050 }, { "epoch": 1.6, "learning_rate": 2.642266054283198e-07, "logits/chosen": 0.26972079277038574, "logits/rejected": 0.27910494804382324, "logps/chosen": -305.31048583984375, "logps/rejected": -318.89703369140625, "loss": 0.6614, "rewards/accuracies": 0.625, "rewards/chosen": -0.07155313342809677, "rewards/margins": 0.03983413800597191, "rewards/rejected": -0.11138726770877838, "step": 3060 }, { "epoch": 1.61, "learning_rate": 2.627064335430829e-07, "logits/chosen": 0.27563899755477905, "logits/rejected": 0.28687483072280884, "logps/chosen": -327.7835388183594, "logps/rejected": -291.73126220703125, "loss": 0.665, "rewards/accuracies": 0.75, "rewards/chosen": -0.05870665982365608, "rewards/margins": 0.11560998111963272, "rewards/rejected": -0.1743166148662567, "step": 3070 }, { "epoch": 1.61, "learning_rate": 2.611857904733227e-07, "logits/chosen": 0.3899874985218048, "logits/rejected": 0.35909098386764526, "logps/chosen": -366.1586608886719, "logps/rejected": -318.25494384765625, "loss": 0.664, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05585475638508797, "rewards/margins": 0.08511849492788315, "rewards/rejected": -0.14097324013710022, "step": 3080 }, { "epoch": 1.62, "learning_rate": 2.5966473260807076e-07, "logits/chosen": 0.28359925746917725, "logits/rejected": 0.26255688071250916, "logps/chosen": -339.81292724609375, "logps/rejected": -299.61956787109375, "loss": 0.6508, "rewards/accuracies": 0.625, "rewards/chosen": -0.06393036991357803, "rewards/margins": 0.08874894678592682, "rewards/rejected": -0.15267930924892426, "step": 3090 }, { "epoch": 1.62, "learning_rate": 2.5814331635173987e-07, "logits/chosen": 0.30540210008621216, "logits/rejected": 0.26466771960258484, "logps/chosen": -297.3798828125, "logps/rejected": -266.07757568359375, "loss": 0.6599, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.10192809998989105, "rewards/margins": 0.04608898609876633, "rewards/rejected": -0.14801709353923798, "step": 3100 }, { "epoch": 1.63, "learning_rate": 2.566215981220331e-07, "logits/chosen": 0.3526119589805603, "logits/rejected": 0.3886163532733917, "logps/chosen": -245.1267547607422, "logps/rejected": -252.8359832763672, "loss": 0.6681, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12257083505392075, "rewards/margins": 0.012356823310256004, "rewards/rejected": -0.1349276602268219, "step": 3110 }, { "epoch": 1.63, "learning_rate": 2.550996343478514e-07, "logits/chosen": 0.3826276659965515, "logits/rejected": 0.40776434540748596, "logps/chosen": -245.58157348632812, "logps/rejected": -256.46234130859375, "loss": 0.6612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06684736907482147, "rewards/margins": 0.04188776761293411, "rewards/rejected": -0.10873512923717499, "step": 3120 }, { "epoch": 1.64, "learning_rate": 2.5357748146720076e-07, "logits/chosen": 0.38492801785469055, "logits/rejected": 0.3946647047996521, "logps/chosen": -305.4632873535156, "logps/rejected": -279.2232666015625, "loss": 0.6527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07274258881807327, "rewards/margins": 0.05328000336885452, "rewards/rejected": -0.1260225772857666, "step": 3130 }, { "epoch": 1.64, "learning_rate": 2.5205519592509993e-07, "logits/chosen": 0.31763720512390137, "logits/rejected": 0.32317763566970825, "logps/chosen": -332.72076416015625, "logps/rejected": -300.34222412109375, "loss": 0.6565, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07132816314697266, "rewards/margins": 0.08335666358470917, "rewards/rejected": -0.15468482673168182, "step": 3140 }, { "epoch": 1.65, "learning_rate": 2.505328341714873e-07, "logits/chosen": 0.3209022283554077, "logits/rejected": 0.353001207113266, "logps/chosen": -296.9544372558594, "logps/rejected": -305.7718200683594, "loss": 0.6567, "rewards/accuracies": 0.625, "rewards/chosen": -0.08769343048334122, "rewards/margins": 0.06560392677783966, "rewards/rejected": -0.15329734981060028, "step": 3150 }, { "epoch": 1.65, "learning_rate": 2.4901045265912687e-07, "logits/chosen": 0.40290650725364685, "logits/rejected": 0.38739943504333496, "logps/chosen": -284.33612060546875, "logps/rejected": -271.75872802734375, "loss": 0.6655, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.052501898258924484, "rewards/margins": 0.11684814840555191, "rewards/rejected": -0.1693500280380249, "step": 3160 }, { "epoch": 1.66, "learning_rate": 2.4748810784151555e-07, "logits/chosen": 0.2552871108055115, "logits/rejected": 0.2728117108345032, "logps/chosen": -282.8292541503906, "logps/rejected": -245.09439086914062, "loss": 0.6584, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07773838937282562, "rewards/margins": 0.06995000690221786, "rewards/rejected": -0.14768841862678528, "step": 3170 }, { "epoch": 1.66, "learning_rate": 2.459658561707898e-07, "logits/chosen": 0.2766292095184326, "logits/rejected": 0.3535088002681732, "logps/chosen": -265.8050537109375, "logps/rejected": -232.62393188476562, "loss": 0.6518, "rewards/accuracies": 0.625, "rewards/chosen": -0.11231112480163574, "rewards/margins": 0.05461050942540169, "rewards/rejected": -0.16692163050174713, "step": 3180 }, { "epoch": 1.67, "learning_rate": 2.4444375409563145e-07, "logits/chosen": 0.28582626581192017, "logits/rejected": 0.21247181296348572, "logps/chosen": -326.25750732421875, "logps/rejected": -261.94793701171875, "loss": 0.6519, "rewards/accuracies": 0.75, "rewards/chosen": -0.08714159578084946, "rewards/margins": 0.09397827088832855, "rewards/rejected": -0.1811198890209198, "step": 3190 }, { "epoch": 1.67, "learning_rate": 2.429218580591753e-07, "logits/chosen": 0.36546292901039124, "logits/rejected": 0.31438449025154114, "logps/chosen": -293.0782775878906, "logps/rejected": -322.4738464355469, "loss": 0.6569, "rewards/accuracies": 0.75, "rewards/chosen": -0.06082568317651749, "rewards/margins": 0.1483532190322876, "rewards/rejected": -0.2091788798570633, "step": 3200 }, { "epoch": 1.68, "learning_rate": 2.414002244969158e-07, "logits/chosen": 0.26624953746795654, "logits/rejected": 0.23065993189811707, "logps/chosen": -330.13323974609375, "logps/rejected": -318.6907958984375, "loss": 0.6508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.07344881445169449, "rewards/margins": 0.11431051790714264, "rewards/rejected": -0.18775932490825653, "step": 3210 }, { "epoch": 1.68, "learning_rate": 2.3987890983461403e-07, "logits/chosen": 0.3105958104133606, "logits/rejected": 0.3086475729942322, "logps/chosen": -274.396484375, "logps/rejected": -301.52557373046875, "loss": 0.6658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10729696601629257, "rewards/margins": 0.04789247363805771, "rewards/rejected": -0.1551894247531891, "step": 3220 }, { "epoch": 1.69, "learning_rate": 2.3835797048620564e-07, "logits/chosen": 0.4288257956504822, "logits/rejected": 0.31377506256103516, "logps/chosen": -331.15008544921875, "logps/rejected": -216.7270050048828, "loss": 0.6496, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.066102035343647, "rewards/margins": 0.09226211160421371, "rewards/rejected": -0.15836414694786072, "step": 3230 }, { "epoch": 1.7, "learning_rate": 2.368374628517088e-07, "logits/chosen": 0.26842182874679565, "logits/rejected": 0.23748120665550232, "logps/chosen": -340.2176208496094, "logps/rejected": -247.35824584960938, "loss": 0.6574, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11225073039531708, "rewards/margins": 0.09872641414403915, "rewards/rejected": -0.21097715198993683, "step": 3240 }, { "epoch": 1.7, "learning_rate": 2.3531744331513247e-07, "logits/chosen": 0.3615303635597229, "logits/rejected": 0.3841082453727722, "logps/chosen": -357.51458740234375, "logps/rejected": -380.087890625, "loss": 0.6524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0837298184633255, "rewards/margins": 0.06683110445737839, "rewards/rejected": -0.15056093037128448, "step": 3250 }, { "epoch": 1.71, "learning_rate": 2.3379796824238608e-07, "logits/chosen": 0.37287023663520813, "logits/rejected": 0.330514132976532, "logps/chosen": -304.1961975097656, "logps/rejected": -266.56610107421875, "loss": 0.6518, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0935707688331604, "rewards/margins": 0.05958444997668266, "rewards/rejected": -0.15315520763397217, "step": 3260 }, { "epoch": 1.71, "learning_rate": 2.3227909397918894e-07, "logits/chosen": 0.33427393436431885, "logits/rejected": 0.24959492683410645, "logps/chosen": -320.3334045410156, "logps/rejected": -285.8636169433594, "loss": 0.6577, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.044574957340955734, "rewards/margins": 0.08377736806869507, "rewards/rejected": -0.1283523142337799, "step": 3270 }, { "epoch": 1.72, "learning_rate": 2.3076087684898076e-07, "logits/chosen": 0.24056890606880188, "logits/rejected": 0.22842903435230255, "logps/chosen": -336.13067626953125, "logps/rejected": -277.524658203125, "loss": 0.6559, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08839359879493713, "rewards/margins": 0.13642925024032593, "rewards/rejected": -0.22482284903526306, "step": 3280 }, { "epoch": 1.72, "learning_rate": 2.2924337315083353e-07, "logits/chosen": 0.3178374767303467, "logits/rejected": 0.24835416674613953, "logps/chosen": -421.72589111328125, "logps/rejected": -324.20416259765625, "loss": 0.659, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07225541770458221, "rewards/margins": 0.11300001293420792, "rewards/rejected": -0.18525540828704834, "step": 3290 }, { "epoch": 1.73, "learning_rate": 2.277266391573633e-07, "logits/chosen": 0.20869994163513184, "logits/rejected": 0.2740298807621002, "logps/chosen": -368.10650634765625, "logps/rejected": -339.2476501464844, "loss": 0.6633, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.05671432614326477, "rewards/margins": 0.06923334300518036, "rewards/rejected": -0.12594766914844513, "step": 3300 }, { "epoch": 1.73, "learning_rate": 2.2621073111264357e-07, "logits/chosen": 0.2960966229438782, "logits/rejected": 0.27994006872177124, "logps/chosen": -285.3283386230469, "logps/rejected": -246.00830078125, "loss": 0.6598, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08893431723117828, "rewards/margins": 0.055390290915966034, "rewards/rejected": -0.14432461559772491, "step": 3310 }, { "epoch": 1.74, "learning_rate": 2.2469570523011993e-07, "logits/chosen": 0.28653472661972046, "logits/rejected": 0.25573134422302246, "logps/chosen": -274.28167724609375, "logps/rejected": -297.33184814453125, "loss": 0.6671, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.11661437898874283, "rewards/margins": 0.023711198940873146, "rewards/rejected": -0.14032557606697083, "step": 3320 }, { "epoch": 1.74, "learning_rate": 2.2318161769052525e-07, "logits/chosen": 0.30002620816230774, "logits/rejected": 0.29848071932792664, "logps/chosen": -354.6106872558594, "logps/rejected": -291.1500244140625, "loss": 0.662, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11448033154010773, "rewards/margins": 0.07472564280033112, "rewards/rejected": -0.18920597434043884, "step": 3330 }, { "epoch": 1.75, "learning_rate": 2.2166852463979624e-07, "logits/chosen": 0.3476138710975647, "logits/rejected": 0.3336217999458313, "logps/chosen": -331.3888244628906, "logps/rejected": -294.93829345703125, "loss": 0.6529, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.07934984564781189, "rewards/margins": 0.09157811850309372, "rewards/rejected": -0.1709279716014862, "step": 3340 }, { "epoch": 1.75, "learning_rate": 2.20156482186992e-07, "logits/chosen": 0.2583394944667816, "logits/rejected": 0.27358004450798035, "logps/chosen": -300.673828125, "logps/rejected": -256.2142028808594, "loss": 0.651, "rewards/accuracies": 0.875, "rewards/chosen": -0.0521802194416523, "rewards/margins": 0.16437430679798126, "rewards/rejected": -0.21655452251434326, "step": 3350 }, { "epoch": 1.76, "learning_rate": 2.1864554640221244e-07, "logits/chosen": 0.3455668091773987, "logits/rejected": 0.3013859987258911, "logps/chosen": -386.6537170410156, "logps/rejected": -359.4359130859375, "loss": 0.6556, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10155276954174042, "rewards/margins": 0.06396753340959549, "rewards/rejected": -0.1655203104019165, "step": 3360 }, { "epoch": 1.76, "learning_rate": 2.1713577331452016e-07, "logits/chosen": 0.246551513671875, "logits/rejected": 0.21105051040649414, "logps/chosen": -338.63525390625, "logps/rejected": -307.242919921875, "loss": 0.6525, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0833117812871933, "rewards/margins": 0.12226946651935577, "rewards/rejected": -0.20558123290538788, "step": 3370 }, { "epoch": 1.77, "learning_rate": 2.1562721890986199e-07, "logits/chosen": 0.3521527945995331, "logits/rejected": 0.32254111766815186, "logps/chosen": -271.57318115234375, "logps/rejected": -228.3287353515625, "loss": 0.6581, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08410477638244629, "rewards/margins": 0.09148812294006348, "rewards/rejected": -0.17559289932250977, "step": 3380 }, { "epoch": 1.77, "learning_rate": 2.1411993912899285e-07, "logits/chosen": 0.29545360803604126, "logits/rejected": 0.29878222942352295, "logps/chosen": -256.453125, "logps/rejected": -274.4709777832031, "loss": 0.6481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0756291002035141, "rewards/margins": 0.09634210169315338, "rewards/rejected": -0.1719711869955063, "step": 3390 }, { "epoch": 1.78, "learning_rate": 2.126139898654021e-07, "logits/chosen": 0.38120537996292114, "logits/rejected": 0.37205177545547485, "logps/chosen": -398.5644226074219, "logps/rejected": -280.35357666015625, "loss": 0.6443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0817885547876358, "rewards/margins": 0.09885601699352264, "rewards/rejected": -0.18064458668231964, "step": 3400 }, { "epoch": 1.78, "learning_rate": 2.1110942696324012e-07, "logits/chosen": 0.2960183024406433, "logits/rejected": 0.29262787103652954, "logps/chosen": -383.98211669921875, "logps/rejected": -315.3439025878906, "loss": 0.6545, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.07995359599590302, "rewards/margins": 0.11479581892490387, "rewards/rejected": -0.19474941492080688, "step": 3410 }, { "epoch": 1.79, "learning_rate": 2.0960630621524762e-07, "logits/chosen": 0.30641573667526245, "logits/rejected": 0.31198835372924805, "logps/chosen": -352.9584655761719, "logps/rejected": -342.90496826171875, "loss": 0.65, "rewards/accuracies": 0.75, "rewards/chosen": -0.0978015661239624, "rewards/margins": 0.08735918253660202, "rewards/rejected": -0.1851607710123062, "step": 3420 }, { "epoch": 1.79, "learning_rate": 2.0810468336068697e-07, "logits/chosen": 0.26992154121398926, "logits/rejected": 0.27308109402656555, "logps/chosen": -315.85833740234375, "logps/rejected": -311.920166015625, "loss": 0.6574, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08393695205450058, "rewards/margins": 0.09377161413431168, "rewards/rejected": -0.17770855128765106, "step": 3430 }, { "epoch": 1.8, "learning_rate": 2.0660461408327535e-07, "logits/chosen": 0.31459373235702515, "logits/rejected": 0.32342398166656494, "logps/chosen": -238.4505157470703, "logps/rejected": -222.8074951171875, "loss": 0.6567, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.08796955645084381, "rewards/margins": 0.10214855521917343, "rewards/rejected": -0.19011810421943665, "step": 3440 }, { "epoch": 1.81, "learning_rate": 2.0510615400911906e-07, "logits/chosen": 0.23679859936237335, "logits/rejected": 0.21291649341583252, "logps/chosen": -318.44915771484375, "logps/rejected": -316.59967041015625, "loss": 0.6603, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09217986464500427, "rewards/margins": 0.05333445593714714, "rewards/rejected": -0.14551430940628052, "step": 3450 }, { "epoch": 1.81, "learning_rate": 2.0360935870465185e-07, "logits/chosen": 0.38853684067726135, "logits/rejected": 0.3678573668003082, "logps/chosen": -260.76263427734375, "logps/rejected": -278.4898986816406, "loss": 0.6557, "rewards/accuracies": 0.75, "rewards/chosen": -0.10340939462184906, "rewards/margins": 0.0924212783575058, "rewards/rejected": -0.19583067297935486, "step": 3460 }, { "epoch": 1.82, "learning_rate": 2.021142836745739e-07, "logits/chosen": 0.3437570333480835, "logits/rejected": 0.3541465699672699, "logps/chosen": -264.78924560546875, "logps/rejected": -235.6223907470703, "loss": 0.6558, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08142450451850891, "rewards/margins": 0.1238001137971878, "rewards/rejected": -0.2052246332168579, "step": 3470 }, { "epoch": 1.82, "learning_rate": 2.0062098435979308e-07, "logits/chosen": 0.28070321679115295, "logits/rejected": 0.30005306005477905, "logps/chosen": -326.1820983886719, "logps/rejected": -315.1957702636719, "loss": 0.6532, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.13556493818759918, "rewards/margins": 0.06753392517566681, "rewards/rejected": -0.2030988484621048, "step": 3480 }, { "epoch": 1.83, "learning_rate": 1.9912951613536997e-07, "logits/chosen": 0.3095873296260834, "logits/rejected": 0.2897317111492157, "logps/chosen": -317.66412353515625, "logps/rejected": -253.30819702148438, "loss": 0.6447, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0781150683760643, "rewards/margins": 0.1319970190525055, "rewards/rejected": -0.2101120948791504, "step": 3490 }, { "epoch": 1.83, "learning_rate": 1.9763993430846392e-07, "logits/chosen": 0.3042958676815033, "logits/rejected": 0.2619950771331787, "logps/chosen": -341.4963073730469, "logps/rejected": -233.6422882080078, "loss": 0.6525, "rewards/accuracies": 0.75, "rewards/chosen": -0.10992898792028427, "rewards/margins": 0.10214383900165558, "rewards/rejected": -0.21207281947135925, "step": 3500 }, { "epoch": 1.84, "learning_rate": 1.9615229411628212e-07, "logits/chosen": 0.35564860701560974, "logits/rejected": 0.3918618857860565, "logps/chosen": -175.1331329345703, "logps/rejected": -214.7126922607422, "loss": 0.6595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12910182774066925, "rewards/margins": 0.055904828011989594, "rewards/rejected": -0.18500666320323944, "step": 3510 }, { "epoch": 1.84, "learning_rate": 1.946666507240314e-07, "logits/chosen": 0.2995254397392273, "logits/rejected": 0.2703678011894226, "logps/chosen": -272.8249206542969, "logps/rejected": -238.4146270751953, "loss": 0.6575, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09808726608753204, "rewards/margins": 0.04268048703670502, "rewards/rejected": -0.14076772332191467, "step": 3520 }, { "epoch": 1.85, "learning_rate": 1.9318305922287268e-07, "logits/chosen": 0.3370177149772644, "logits/rejected": 0.2872919738292694, "logps/chosen": -335.13922119140625, "logps/rejected": -325.77545166015625, "loss": 0.6583, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09754248708486557, "rewards/margins": 0.11342176049947739, "rewards/rejected": -0.21096424758434296, "step": 3530 }, { "epoch": 1.85, "learning_rate": 1.9170157462787762e-07, "logits/chosen": 0.29330548644065857, "logits/rejected": 0.2378358542919159, "logps/chosen": -320.5285949707031, "logps/rejected": -303.8361511230469, "loss": 0.6629, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08726056665182114, "rewards/margins": 0.06550947576761246, "rewards/rejected": -0.1527700424194336, "step": 3540 }, { "epoch": 1.86, "learning_rate": 1.902222518759891e-07, "logits/chosen": 0.2827271819114685, "logits/rejected": 0.3013695180416107, "logps/chosen": -354.85955810546875, "logps/rejected": -337.86419677734375, "loss": 0.6561, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09215109795331955, "rewards/margins": 0.1384793072938919, "rewards/rejected": -0.23063039779663086, "step": 3550 }, { "epoch": 1.86, "learning_rate": 1.8874514582398368e-07, "logits/chosen": 0.29755842685699463, "logits/rejected": 0.3269795775413513, "logps/chosen": -318.353271484375, "logps/rejected": -308.84576416015625, "loss": 0.6531, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09082455933094025, "rewards/margins": 0.10677800327539444, "rewards/rejected": -0.19760257005691528, "step": 3560 }, { "epoch": 1.87, "learning_rate": 1.8727031124643738e-07, "logits/chosen": 0.303489625453949, "logits/rejected": 0.3056088984012604, "logps/chosen": -327.00250244140625, "logps/rejected": -273.66448974609375, "loss": 0.6551, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0742236077785492, "rewards/margins": 0.16267745196819305, "rewards/rejected": -0.23690101504325867, "step": 3570 }, { "epoch": 1.87, "learning_rate": 1.8579780283369472e-07, "logits/chosen": 0.32123714685440063, "logits/rejected": 0.350780189037323, "logps/chosen": -296.42919921875, "logps/rejected": -271.38922119140625, "loss": 0.6451, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1357162892818451, "rewards/margins": 0.055894482880830765, "rewards/rejected": -0.19161078333854675, "step": 3580 }, { "epoch": 1.88, "learning_rate": 1.8432767518984043e-07, "logits/chosen": 0.3722311854362488, "logits/rejected": 0.3453969955444336, "logps/chosen": -301.4832458496094, "logps/rejected": -240.32846069335938, "loss": 0.6503, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13313955068588257, "rewards/margins": 0.09740877896547318, "rewards/rejected": -0.23054830729961395, "step": 3590 }, { "epoch": 1.88, "learning_rate": 1.8285998283067478e-07, "logits/chosen": 0.24569134414196014, "logits/rejected": 0.2697654366493225, "logps/chosen": -363.0978698730469, "logps/rejected": -371.66009521484375, "loss": 0.6551, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.06352634727954865, "rewards/margins": 0.108018659055233, "rewards/rejected": -0.17154501378536224, "step": 3600 }, { "epoch": 1.89, "learning_rate": 1.8139478018169197e-07, "logits/chosen": 0.3788035213947296, "logits/rejected": 0.31163662672042847, "logps/chosen": -310.60528564453125, "logps/rejected": -234.04910278320312, "loss": 0.6549, "rewards/accuracies": 0.75, "rewards/chosen": -0.047023043036460876, "rewards/margins": 0.1184474378824234, "rewards/rejected": -0.16547051072120667, "step": 3610 }, { "epoch": 1.89, "learning_rate": 1.799321215760617e-07, "logits/chosen": 0.23150837421417236, "logits/rejected": 0.23608848452568054, "logps/chosen": -341.4847717285156, "logps/rejected": -318.3338928222656, "loss": 0.6492, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11050182580947876, "rewards/margins": 0.09023106098175049, "rewards/rejected": -0.20073287189006805, "step": 3620 }, { "epoch": 1.9, "learning_rate": 1.7847206125261476e-07, "logits/chosen": 0.24327079951763153, "logits/rejected": 0.22404679656028748, "logps/chosen": -296.234619140625, "logps/rejected": -340.331298828125, "loss": 0.6503, "rewards/accuracies": 0.625, "rewards/chosen": -0.12481401860713959, "rewards/margins": 0.09092869609594345, "rewards/rejected": -0.21574273705482483, "step": 3630 }, { "epoch": 1.9, "learning_rate": 1.7701465335383148e-07, "logits/chosen": 0.37283509969711304, "logits/rejected": 0.30105775594711304, "logps/chosen": -293.0348205566406, "logps/rejected": -243.2663116455078, "loss": 0.6599, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12276791036128998, "rewards/margins": 0.11833088099956512, "rewards/rejected": -0.2410987913608551, "step": 3640 }, { "epoch": 1.91, "learning_rate": 1.7555995192383377e-07, "logits/chosen": 0.26368245482444763, "logits/rejected": 0.2792828381061554, "logps/chosen": -277.477294921875, "logps/rejected": -258.3664245605469, "loss": 0.653, "rewards/accuracies": 0.75, "rewards/chosen": -0.1236424446105957, "rewards/margins": 0.10521407425403595, "rewards/rejected": -0.22885651886463165, "step": 3650 }, { "epoch": 1.92, "learning_rate": 1.7410801090638166e-07, "logits/chosen": 0.18897534906864166, "logits/rejected": 0.2537173330783844, "logps/chosen": -389.4532165527344, "logps/rejected": -336.89990234375, "loss": 0.6551, "rewards/accuracies": 0.625, "rewards/chosen": -0.0968824028968811, "rewards/margins": 0.10752584040164948, "rewards/rejected": -0.20440824329853058, "step": 3660 }, { "epoch": 1.92, "learning_rate": 1.7265888414287245e-07, "logits/chosen": 0.2385127991437912, "logits/rejected": 0.22628657519817352, "logps/chosen": -323.94769287109375, "logps/rejected": -265.17169189453125, "loss": 0.6509, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16426163911819458, "rewards/margins": 0.11096008121967316, "rewards/rejected": -0.27522170543670654, "step": 3670 }, { "epoch": 1.93, "learning_rate": 1.7121262537034396e-07, "logits/chosen": 0.2674527168273926, "logits/rejected": 0.2606186270713806, "logps/chosen": -376.69281005859375, "logps/rejected": -302.48626708984375, "loss": 0.66, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1338208019733429, "rewards/margins": 0.10959096997976303, "rewards/rejected": -0.24341173470020294, "step": 3680 }, { "epoch": 1.93, "learning_rate": 1.697692882194826e-07, "logits/chosen": 0.3283900320529938, "logits/rejected": 0.3107720613479614, "logps/chosen": -319.2773742675781, "logps/rejected": -296.90130615234375, "loss": 0.6556, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09138993918895721, "rewards/margins": 0.09348934888839722, "rewards/rejected": -0.18487928807735443, "step": 3690 }, { "epoch": 1.94, "learning_rate": 1.6832892621263406e-07, "logits/chosen": 0.2618725597858429, "logits/rejected": 0.22988107800483704, "logps/chosen": -348.5962829589844, "logps/rejected": -358.07794189453125, "loss": 0.6544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.14753267168998718, "rewards/margins": 0.05720914527773857, "rewards/rejected": -0.20474180579185486, "step": 3700 }, { "epoch": 1.94, "learning_rate": 1.668915927618183e-07, "logits/chosen": 0.29424089193344116, "logits/rejected": 0.3408924639225006, "logps/chosen": -281.04449462890625, "logps/rejected": -266.5536804199219, "loss": 0.657, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1075427383184433, "rewards/margins": 0.053296517580747604, "rewards/rejected": -0.1608392596244812, "step": 3710 }, { "epoch": 1.95, "learning_rate": 1.6545734116674965e-07, "logits/chosen": 0.2831776738166809, "logits/rejected": 0.296794056892395, "logps/chosen": -335.51214599609375, "logps/rejected": -404.1058654785156, "loss": 0.6519, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.14560511708259583, "rewards/margins": 0.07225475460290909, "rewards/rejected": -0.2178598940372467, "step": 3720 }, { "epoch": 1.95, "learning_rate": 1.6402622461286e-07, "logits/chosen": 0.305799663066864, "logits/rejected": 0.3058302700519562, "logps/chosen": -323.1365661621094, "logps/rejected": -298.42730712890625, "loss": 0.6494, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13549456000328064, "rewards/margins": 0.08603169769048691, "rewards/rejected": -0.22152626514434814, "step": 3730 }, { "epoch": 1.96, "learning_rate": 1.625982961693262e-07, "logits/chosen": 0.2127005159854889, "logits/rejected": 0.20395474135875702, "logps/chosen": -293.8401794433594, "logps/rejected": -282.7629699707031, "loss": 0.655, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1535455286502838, "rewards/margins": 0.06461331248283386, "rewards/rejected": -0.21815884113311768, "step": 3740 }, { "epoch": 1.96, "learning_rate": 1.6117360878710266e-07, "logits/chosen": 0.26588043570518494, "logits/rejected": 0.25278085470199585, "logps/chosen": -299.72467041015625, "logps/rejected": -284.97003173828125, "loss": 0.6608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.15774992108345032, "rewards/margins": 0.09110499918460846, "rewards/rejected": -0.24885492026805878, "step": 3750 }, { "epoch": 1.97, "learning_rate": 1.5975221529695773e-07, "logits/chosen": 0.27109450101852417, "logits/rejected": 0.27076664566993713, "logps/chosen": -289.7879638671875, "logps/rejected": -277.06243896484375, "loss": 0.6549, "rewards/accuracies": 0.625, "rewards/chosen": -0.11164456605911255, "rewards/margins": 0.06714334338903427, "rewards/rejected": -0.17878788709640503, "step": 3760 }, { "epoch": 1.97, "learning_rate": 1.5833416840751406e-07, "logits/chosen": 0.22059431672096252, "logits/rejected": 0.23375996947288513, "logps/chosen": -327.8553161621094, "logps/rejected": -280.8095397949219, "loss": 0.6518, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10653182119131088, "rewards/margins": 0.07202602922916412, "rewards/rejected": -0.1785578578710556, "step": 3770 }, { "epoch": 1.98, "learning_rate": 1.5691952070329493e-07, "logits/chosen": 0.2996228337287903, "logits/rejected": 0.33212026953697205, "logps/chosen": -313.38653564453125, "logps/rejected": -257.5924072265625, "loss": 0.6558, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11817840486764908, "rewards/margins": 0.08965203911066055, "rewards/rejected": -0.20783045887947083, "step": 3780 }, { "epoch": 1.98, "learning_rate": 1.555083246427734e-07, "logits/chosen": 0.21885398030281067, "logits/rejected": 0.29226353764533997, "logps/chosen": -264.7387390136719, "logps/rejected": -311.4655456542969, "loss": 0.6444, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.12575462460517883, "rewards/margins": 0.11083336919546127, "rewards/rejected": -0.2365880012512207, "step": 3790 }, { "epoch": 1.99, "learning_rate": 1.5410063255642767e-07, "logits/chosen": 0.3748754858970642, "logits/rejected": 0.32010817527770996, "logps/chosen": -295.5055847167969, "logps/rejected": -278.5317687988281, "loss": 0.6579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10374332964420319, "rewards/margins": 0.1423131227493286, "rewards/rejected": -0.24605640769004822, "step": 3800 }, { "epoch": 1.99, "learning_rate": 1.5269649664480037e-07, "logits/chosen": 0.1914290487766266, "logits/rejected": 0.2500323951244354, "logps/chosen": -292.2693786621094, "logps/rejected": -354.45672607421875, "loss": 0.6727, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16124172508716583, "rewards/margins": 0.0014358393382281065, "rewards/rejected": -0.1626775562763214, "step": 3810 }, { "epoch": 2.0, "learning_rate": 1.5129596897656255e-07, "logits/chosen": 0.30393776297569275, "logits/rejected": 0.3444197177886963, "logps/chosen": -329.9333190917969, "logps/rejected": -295.95538330078125, "loss": 0.6499, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10849185287952423, "rewards/margins": 0.10884352028369904, "rewards/rejected": -0.21733537316322327, "step": 3820 }, { "epoch": 2.0, "learning_rate": 1.4989910148658324e-07, "logits/chosen": 0.24706992506980896, "logits/rejected": 0.261945903301239, "logps/chosen": -303.5214538574219, "logps/rejected": -291.674072265625, "loss": 0.6564, "rewards/accuracies": 0.625, "rewards/chosen": -0.11958847939968109, "rewards/margins": 0.09228087961673737, "rewards/rejected": -0.21186935901641846, "step": 3830 }, { "epoch": 2.01, "learning_rate": 1.485059459740035e-07, "logits/chosen": 0.1678394377231598, "logits/rejected": 0.15751591324806213, "logps/chosen": -323.06304931640625, "logps/rejected": -313.9187316894531, "loss": 0.6445, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.18193499743938446, "rewards/margins": 0.07315002381801605, "rewards/rejected": -0.2550850212574005, "step": 3840 }, { "epoch": 2.01, "learning_rate": 1.4711655410031536e-07, "logits/chosen": 0.28335142135620117, "logits/rejected": 0.3012840151786804, "logps/chosen": -286.52984619140625, "logps/rejected": -262.88568115234375, "loss": 0.6503, "rewards/accuracies": 0.625, "rewards/chosen": -0.10715119540691376, "rewards/margins": 0.12121255695819855, "rewards/rejected": -0.2283637523651123, "step": 3850 }, { "epoch": 2.02, "learning_rate": 1.4573097738744623e-07, "logits/chosen": 0.2741647958755493, "logits/rejected": 0.2756713628768921, "logps/chosen": -360.9956359863281, "logps/rejected": -328.37017822265625, "loss": 0.6633, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.13849574327468872, "rewards/margins": 0.07003328949213028, "rewards/rejected": -0.2085290253162384, "step": 3860 }, { "epoch": 2.03, "learning_rate": 1.4434926721584865e-07, "logits/chosen": 0.34434953331947327, "logits/rejected": 0.35850682854652405, "logps/chosen": -274.724853515625, "logps/rejected": -265.60101318359375, "loss": 0.6545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.13894551992416382, "rewards/margins": 0.06858086585998535, "rewards/rejected": -0.20752640068531036, "step": 3870 }, { "epoch": 2.03, "learning_rate": 1.4297147482259424e-07, "logits/chosen": 0.3271617293357849, "logits/rejected": 0.3250243067741394, "logps/chosen": -335.4681091308594, "logps/rejected": -310.607421875, "loss": 0.6536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.13369736075401306, "rewards/margins": 0.0888337790966034, "rewards/rejected": -0.22253112494945526, "step": 3880 }, { "epoch": 2.04, "learning_rate": 1.4159765129947443e-07, "logits/chosen": 0.28231456875801086, "logits/rejected": 0.2409648895263672, "logps/chosen": -318.02203369140625, "logps/rejected": -273.2533874511719, "loss": 0.6575, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10755988210439682, "rewards/margins": 0.09727488458156586, "rewards/rejected": -0.20483477413654327, "step": 3890 }, { "epoch": 2.04, "learning_rate": 1.4022784759110576e-07, "logits/chosen": 0.2656182050704956, "logits/rejected": 0.2702781558036804, "logps/chosen": -278.57379150390625, "logps/rejected": -280.07281494140625, "loss": 0.6533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.15303316712379456, "rewards/margins": 0.10009218752384186, "rewards/rejected": -0.2531253397464752, "step": 3900 }, { "epoch": 2.05, "learning_rate": 1.3886211449304002e-07, "logits/chosen": 0.2852250933647156, "logits/rejected": 0.28865867853164673, "logps/chosen": -320.269775390625, "logps/rejected": -292.2712707519531, "loss": 0.646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12158701568841934, "rewards/margins": 0.09683831036090851, "rewards/rejected": -0.21842531859874725, "step": 3910 }, { "epoch": 2.05, "learning_rate": 1.3750050264988172e-07, "logits/chosen": 0.31218335032463074, "logits/rejected": 0.3479720950126648, "logps/chosen": -265.9737854003906, "logps/rejected": -305.076171875, "loss": 0.6711, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13722750544548035, "rewards/margins": 0.03017706237733364, "rewards/rejected": -0.16740456223487854, "step": 3920 }, { "epoch": 2.06, "learning_rate": 1.3614306255340918e-07, "logits/chosen": 0.23540154099464417, "logits/rejected": 0.26212871074676514, "logps/chosen": -324.95550537109375, "logps/rejected": -311.8691101074219, "loss": 0.6534, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1239587664604187, "rewards/margins": 0.09770622104406357, "rewards/rejected": -0.22166499495506287, "step": 3930 }, { "epoch": 2.06, "learning_rate": 1.347898445407027e-07, "logits/chosen": 0.3467102646827698, "logits/rejected": 0.36723384261131287, "logps/chosen": -361.93487548828125, "logps/rejected": -322.19219970703125, "loss": 0.6581, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.14353439211845398, "rewards/margins": 0.07150904834270477, "rewards/rejected": -0.21504342555999756, "step": 3940 }, { "epoch": 2.07, "learning_rate": 1.3344089879227768e-07, "logits/chosen": 0.3199073374271393, "logits/rejected": 0.3352632522583008, "logps/chosen": -267.867919921875, "logps/rejected": -305.97332763671875, "loss": 0.6583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11239242553710938, "rewards/margins": 0.11769165843725204, "rewards/rejected": -0.23008409142494202, "step": 3950 }, { "epoch": 2.07, "learning_rate": 1.3209627533022393e-07, "logits/chosen": 0.15640456974506378, "logits/rejected": 0.1485460102558136, "logps/chosen": -339.8058166503906, "logps/rejected": -301.31085205078125, "loss": 0.6564, "rewards/accuracies": 0.5, "rewards/chosen": -0.1585397869348526, "rewards/margins": 0.0327836349606514, "rewards/rejected": -0.1913234144449234, "step": 3960 }, { "epoch": 2.08, "learning_rate": 1.3075602401635056e-07, "logits/chosen": 0.4258531630039215, "logits/rejected": 0.3634311556816101, "logps/chosen": -379.2318420410156, "logps/rejected": -312.0608215332031, "loss": 0.6613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09157432615756989, "rewards/margins": 0.12343801558017731, "rewards/rejected": -0.2150123119354248, "step": 3970 }, { "epoch": 2.08, "learning_rate": 1.2942019455033715e-07, "logits/chosen": 0.2574451267719269, "logits/rejected": 0.2524639666080475, "logps/chosen": -277.36871337890625, "logps/rejected": -278.53826904296875, "loss": 0.6593, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13917557895183563, "rewards/margins": 0.034988995641469955, "rewards/rejected": -0.1741645783185959, "step": 3980 }, { "epoch": 2.09, "learning_rate": 1.2808883646789088e-07, "logits/chosen": 0.3025432229042053, "logits/rejected": 0.2939312160015106, "logps/chosen": -272.0987243652344, "logps/rejected": -242.9075164794922, "loss": 0.6586, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.10569024085998535, "rewards/margins": 0.07147324085235596, "rewards/rejected": -0.1771634817123413, "step": 3990 }, { "epoch": 2.09, "learning_rate": 1.2676199913890933e-07, "logits/chosen": 0.32760852575302124, "logits/rejected": 0.2918204367160797, "logps/chosen": -307.29669189453125, "logps/rejected": -285.5381164550781, "loss": 0.65, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13056454062461853, "rewards/margins": 0.0760079026222229, "rewards/rejected": -0.20657244324684143, "step": 4000 }, { "epoch": 2.1, "learning_rate": 1.2543973176565012e-07, "logits/chosen": 0.2552763521671295, "logits/rejected": 0.27753502130508423, "logps/chosen": -328.93402099609375, "logps/rejected": -296.59820556640625, "loss": 0.6489, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.11348460614681244, "rewards/margins": 0.07869541645050049, "rewards/rejected": -0.19218002259731293, "step": 4010 }, { "epoch": 2.1, "learning_rate": 1.2412208338090565e-07, "logits/chosen": 0.32259517908096313, "logits/rejected": 0.34156057238578796, "logps/chosen": -317.1317138671875, "logps/rejected": -321.1855163574219, "loss": 0.6582, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1507299840450287, "rewards/margins": 0.08442052453756332, "rewards/rejected": -0.2351505309343338, "step": 4020 }, { "epoch": 2.11, "learning_rate": 1.228091028461858e-07, "logits/chosen": 0.3298514783382416, "logits/rejected": 0.3019588887691498, "logps/chosen": -279.78802490234375, "logps/rejected": -285.08465576171875, "loss": 0.6508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17347638309001923, "rewards/margins": 0.06616106629371643, "rewards/rejected": -0.23963744938373566, "step": 4030 }, { "epoch": 2.11, "learning_rate": 1.2150083884990536e-07, "logits/chosen": 0.1470915973186493, "logits/rejected": 0.19251108169555664, "logps/chosen": -348.84246826171875, "logps/rejected": -342.70941162109375, "loss": 0.6618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13458314538002014, "rewards/margins": 0.08001428842544556, "rewards/rejected": -0.2145974189043045, "step": 4040 }, { "epoch": 2.12, "learning_rate": 1.201973399055788e-07, "logits/chosen": 0.22867533564567566, "logits/rejected": 0.2250591516494751, "logps/chosen": -297.4351501464844, "logps/rejected": -297.4842529296875, "loss": 0.6515, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.13157221674919128, "rewards/margins": 0.08421944081783295, "rewards/rejected": -0.21579165756702423, "step": 4050 }, { "epoch": 2.12, "learning_rate": 1.1889865435002117e-07, "logits/chosen": 0.23430044949054718, "logits/rejected": 0.22815270721912384, "logps/chosen": -323.7678527832031, "logps/rejected": -262.4975891113281, "loss": 0.6544, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11733143031597137, "rewards/margins": 0.09236739575862885, "rewards/rejected": -0.20969882607460022, "step": 4060 }, { "epoch": 2.13, "learning_rate": 1.1760483034155588e-07, "logits/chosen": 0.25802451372146606, "logits/rejected": 0.29646921157836914, "logps/chosen": -344.3500061035156, "logps/rejected": -366.99029541015625, "loss": 0.6516, "rewards/accuracies": 0.625, "rewards/chosen": -0.16074565052986145, "rewards/margins": 0.05782736465334892, "rewards/rejected": -0.21857304871082306, "step": 4070 }, { "epoch": 2.14, "learning_rate": 1.163159158582284e-07, "logits/chosen": 0.27491092681884766, "logits/rejected": 0.29113298654556274, "logps/chosen": -289.4082946777344, "logps/rejected": -281.1566467285156, "loss": 0.6498, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12462715059518814, "rewards/margins": 0.082103431224823, "rewards/rejected": -0.20673055946826935, "step": 4080 }, { "epoch": 2.14, "learning_rate": 1.1503195869602766e-07, "logits/chosen": 0.35794973373413086, "logits/rejected": 0.3533916771411896, "logps/chosen": -293.63299560546875, "logps/rejected": -263.76715087890625, "loss": 0.6564, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16337043046951294, "rewards/margins": 0.029357850551605225, "rewards/rejected": -0.19272826611995697, "step": 4090 }, { "epoch": 2.15, "learning_rate": 1.137530064671135e-07, "logits/chosen": 0.27308765053749084, "logits/rejected": 0.28640851378440857, "logps/chosen": -334.1259765625, "logps/rejected": -294.29638671875, "loss": 0.6408, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10533700883388519, "rewards/margins": 0.09341181814670563, "rewards/rejected": -0.19874884188175201, "step": 4100 }, { "epoch": 2.15, "learning_rate": 1.1247910659805063e-07, "logits/chosen": 0.27220582962036133, "logits/rejected": 0.23189368844032288, "logps/chosen": -309.3544616699219, "logps/rejected": -294.084716796875, "loss": 0.6583, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.17769691348075867, "rewards/margins": 0.08467571437358856, "rewards/rejected": -0.2623726427555084, "step": 4110 }, { "epoch": 2.16, "learning_rate": 1.112103063280509e-07, "logits/chosen": 0.18239018321037292, "logits/rejected": 0.19460837543010712, "logps/chosen": -306.71697998046875, "logps/rejected": -295.2325744628906, "loss": 0.6573, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15410485863685608, "rewards/margins": 0.06368996202945709, "rewards/rejected": -0.21779482066631317, "step": 4120 }, { "epoch": 2.16, "learning_rate": 1.099466527072207e-07, "logits/chosen": 0.3449271023273468, "logits/rejected": 0.3305651545524597, "logps/chosen": -257.5208435058594, "logps/rejected": -210.81307983398438, "loss": 0.6487, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.13710053265094757, "rewards/margins": 0.09004830569028854, "rewards/rejected": -0.2271488457918167, "step": 4130 }, { "epoch": 2.17, "learning_rate": 1.0868819259481638e-07, "logits/chosen": 0.3679484724998474, "logits/rejected": 0.2941407561302185, "logps/chosen": -334.38861083984375, "logps/rejected": -252.18588256835938, "loss": 0.6584, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10539355129003525, "rewards/margins": 0.1011449545621872, "rewards/rejected": -0.20653851330280304, "step": 4140 }, { "epoch": 2.17, "learning_rate": 1.0743497265750701e-07, "logits/chosen": 0.33962732553482056, "logits/rejected": 0.3592199683189392, "logps/chosen": -265.0174255371094, "logps/rejected": -267.397705078125, "loss": 0.647, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.14396080374717712, "rewards/margins": 0.08976936340332031, "rewards/rejected": -0.23373015224933624, "step": 4150 }, { "epoch": 2.18, "learning_rate": 1.0618703936764359e-07, "logits/chosen": 0.35714811086654663, "logits/rejected": 0.3340142071247101, "logps/chosen": -285.12762451171875, "logps/rejected": -312.48602294921875, "loss": 0.6556, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.10041477531194687, "rewards/margins": 0.1002071350812912, "rewards/rejected": -0.20062191784381866, "step": 4160 }, { "epoch": 2.18, "learning_rate": 1.0494443900153557e-07, "logits/chosen": 0.3087230324745178, "logits/rejected": 0.3323986530303955, "logps/chosen": -290.55926513671875, "logps/rejected": -333.17535400390625, "loss": 0.6523, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.15257273614406586, "rewards/margins": 0.070375956594944, "rewards/rejected": -0.22294867038726807, "step": 4170 }, { "epoch": 2.19, "learning_rate": 1.0370721763773507e-07, "logits/chosen": 0.286038339138031, "logits/rejected": 0.20693036913871765, "logps/chosen": -388.9151306152344, "logps/rejected": -281.81634521484375, "loss": 0.6486, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1107790619134903, "rewards/margins": 0.12731772661209106, "rewards/rejected": -0.23809678852558136, "step": 4180 }, { "epoch": 2.19, "learning_rate": 1.0247542115532845e-07, "logits/chosen": 0.24497263133525848, "logits/rejected": 0.208289235830307, "logps/chosen": -376.7227478027344, "logps/rejected": -310.5379638671875, "loss": 0.6539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08648854494094849, "rewards/margins": 0.13909420371055603, "rewards/rejected": -0.22558274865150452, "step": 4190 }, { "epoch": 2.2, "learning_rate": 1.0124909523223418e-07, "logits/chosen": 0.2840736210346222, "logits/rejected": 0.35097819566726685, "logps/chosen": -294.68548583984375, "logps/rejected": -337.6000061035156, "loss": 0.6501, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.15106235444545746, "rewards/margins": 0.058377016335725784, "rewards/rejected": -0.20943935215473175, "step": 4200 }, { "epoch": 2.2, "learning_rate": 1.0002828534350987e-07, "logits/chosen": 0.28108084201812744, "logits/rejected": 0.2525596618652344, "logps/chosen": -321.88232421875, "logps/rejected": -342.23681640625, "loss": 0.6445, "rewards/accuracies": 0.75, "rewards/chosen": -0.10137365758419037, "rewards/margins": 0.17096036672592163, "rewards/rejected": -0.2723340094089508, "step": 4210 }, { "epoch": 2.21, "learning_rate": 9.881303675966524e-08, "logits/chosen": 0.17838601768016815, "logits/rejected": 0.16133739054203033, "logps/chosen": -415.6875, "logps/rejected": -328.04986572265625, "loss": 0.656, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.092302106320858, "rewards/margins": 0.1187206357717514, "rewards/rejected": -0.21102270483970642, "step": 4220 }, { "epoch": 2.21, "learning_rate": 9.760339454498393e-08, "logits/chosen": 0.281174898147583, "logits/rejected": 0.2819034457206726, "logps/chosen": -254.9493408203125, "logps/rejected": -267.9808044433594, "loss": 0.652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10819119215011597, "rewards/margins": 0.09994282573461533, "rewards/rejected": -0.20813405513763428, "step": 4230 }, { "epoch": 2.22, "learning_rate": 9.639940355585218e-08, "logits/chosen": 0.2734231948852539, "logits/rejected": 0.2930763363838196, "logps/chosen": -333.05340576171875, "logps/rejected": -247.87380981445312, "loss": 0.6468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10973703861236572, "rewards/margins": 0.13471439480781555, "rewards/rejected": -0.24445144832134247, "step": 4240 }, { "epoch": 2.22, "learning_rate": 9.52011084390954e-08, "logits/chosen": 0.3024441599845886, "logits/rejected": 0.3216271996498108, "logps/chosen": -372.4168395996094, "logps/rejected": -402.4332275390625, "loss": 0.6543, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12555637955665588, "rewards/margins": 0.10845603048801422, "rewards/rejected": -0.2340123951435089, "step": 4250 }, { "epoch": 2.23, "learning_rate": 9.400855363032262e-08, "logits/chosen": 0.33374837040901184, "logits/rejected": 0.31805044412612915, "logps/chosen": -293.5007629394531, "logps/rejected": -289.728515625, "loss": 0.6391, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1061309352517128, "rewards/margins": 0.16277316212654114, "rewards/rejected": -0.26890408992767334, "step": 4260 }, { "epoch": 2.23, "learning_rate": 9.282178335227883e-08, "logits/chosen": 0.2601068615913391, "logits/rejected": 0.3062272071838379, "logps/chosen": -265.8717956542969, "logps/rejected": -276.9969177246094, "loss": 0.6451, "rewards/accuracies": 0.75, "rewards/chosen": -0.16810034215450287, "rewards/margins": 0.1246415227651596, "rewards/rejected": -0.2927418649196625, "step": 4270 }, { "epoch": 2.24, "learning_rate": 9.164084161320471e-08, "logits/chosen": 0.30895930528640747, "logits/rejected": 0.340026319026947, "logps/chosen": -249.67147827148438, "logps/rejected": -213.41751098632812, "loss": 0.665, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11838191747665405, "rewards/margins": 0.10098621994256973, "rewards/rejected": -0.2193681299686432, "step": 4280 }, { "epoch": 2.24, "learning_rate": 9.046577220520518e-08, "logits/chosen": 0.27443909645080566, "logits/rejected": 0.25630897283554077, "logps/chosen": -335.86541748046875, "logps/rejected": -314.80230712890625, "loss": 0.6585, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1495860517024994, "rewards/margins": 0.08008397370576859, "rewards/rejected": -0.22967001795768738, "step": 4290 }, { "epoch": 2.25, "learning_rate": 8.929661870262525e-08, "logits/chosen": 0.2684154808521271, "logits/rejected": 0.24717223644256592, "logps/chosen": -264.81390380859375, "logps/rejected": -209.41360473632812, "loss": 0.644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.15925763547420502, "rewards/margins": 0.09572459757328033, "rewards/rejected": -0.25498223304748535, "step": 4300 }, { "epoch": 2.26, "learning_rate": 8.813342446043423e-08, "logits/chosen": 0.24432964622974396, "logits/rejected": 0.24687163531780243, "logps/chosen": -261.1036071777344, "logps/rejected": -251.00479125976562, "loss": 0.6515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12970378994941711, "rewards/margins": 0.0732387825846672, "rewards/rejected": -0.20294256508350372, "step": 4310 }, { "epoch": 2.26, "learning_rate": 8.697623261261788e-08, "logits/chosen": 0.258579820394516, "logits/rejected": 0.2788470387458801, "logps/chosen": -312.0073547363281, "logps/rejected": -281.97882080078125, "loss": 0.659, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10256769508123398, "rewards/margins": 0.09709702432155609, "rewards/rejected": -0.19966474175453186, "step": 4320 }, { "epoch": 2.27, "learning_rate": 8.58250860705792e-08, "logits/chosen": 0.3692210018634796, "logits/rejected": 0.33054882287979126, "logps/chosen": -364.50201416015625, "logps/rejected": -311.98626708984375, "loss": 0.638, "rewards/accuracies": 0.625, "rewards/chosen": -0.12589401006698608, "rewards/margins": 0.0628051906824112, "rewards/rejected": -0.18869920074939728, "step": 4330 }, { "epoch": 2.27, "learning_rate": 8.468002752154671e-08, "logits/chosen": 0.28316354751586914, "logits/rejected": 0.32993918657302856, "logps/chosen": -272.2777404785156, "logps/rejected": -237.9681396484375, "loss": 0.6455, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15095901489257812, "rewards/margins": 0.08649053424596786, "rewards/rejected": -0.2374495565891266, "step": 4340 }, { "epoch": 2.28, "learning_rate": 8.354109942699208e-08, "logits/chosen": 0.2612248957157135, "logits/rejected": 0.24447908997535706, "logps/chosen": -295.1114807128906, "logps/rejected": -305.5965881347656, "loss": 0.6536, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1192585676908493, "rewards/margins": 0.10337366163730621, "rewards/rejected": -0.2226322442293167, "step": 4350 }, { "epoch": 2.28, "learning_rate": 8.240834402105524e-08, "logits/chosen": 0.3573678135871887, "logits/rejected": 0.28943532705307007, "logps/chosen": -316.6932678222656, "logps/rejected": -276.90496826171875, "loss": 0.6581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1998998373746872, "rewards/margins": 0.03210686147212982, "rewards/rejected": -0.23200669884681702, "step": 4360 }, { "epoch": 2.29, "learning_rate": 8.128180330897791e-08, "logits/chosen": 0.3203149437904358, "logits/rejected": 0.2803335189819336, "logps/chosen": -309.9126892089844, "logps/rejected": -289.2821960449219, "loss": 0.6531, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1578475385904312, "rewards/margins": 0.09269069135189056, "rewards/rejected": -0.2505382299423218, "step": 4370 }, { "epoch": 2.29, "learning_rate": 8.016151906554683e-08, "logits/chosen": 0.2752537131309509, "logits/rejected": 0.27471452951431274, "logps/chosen": -328.5307312011719, "logps/rejected": -294.3258056640625, "loss": 0.6516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.21460950374603271, "rewards/margins": 0.034757621586322784, "rewards/rejected": -0.2493671178817749, "step": 4380 }, { "epoch": 2.3, "learning_rate": 7.90475328335439e-08, "logits/chosen": 0.2595667243003845, "logits/rejected": 0.26930028200149536, "logps/chosen": -326.0294494628906, "logps/rejected": -378.00164794921875, "loss": 0.6541, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13298392295837402, "rewards/margins": 0.12766322493553162, "rewards/rejected": -0.26064714789390564, "step": 4390 }, { "epoch": 2.3, "learning_rate": 7.793988592220568e-08, "logits/chosen": 0.2611275315284729, "logits/rejected": 0.22679242491722107, "logps/chosen": -316.05780029296875, "logps/rejected": -265.9347229003906, "loss": 0.6459, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11774953454732895, "rewards/margins": 0.12449419498443604, "rewards/rejected": -0.24224373698234558, "step": 4400 }, { "epoch": 2.31, "learning_rate": 7.683861940569217e-08, "logits/chosen": 0.2962859272956848, "logits/rejected": 0.23215405642986298, "logps/chosen": -333.14239501953125, "logps/rejected": -311.7344665527344, "loss": 0.6497, "rewards/accuracies": 0.625, "rewards/chosen": -0.14002402126789093, "rewards/margins": 0.11002373695373535, "rewards/rejected": -0.2500477433204651, "step": 4410 }, { "epoch": 2.31, "learning_rate": 7.574377412156291e-08, "logits/chosen": 0.277981698513031, "logits/rejected": 0.23015658557415009, "logps/chosen": -309.0813903808594, "logps/rejected": -304.1460266113281, "loss": 0.6457, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14443780481815338, "rewards/margins": 0.1270671784877777, "rewards/rejected": -0.2715049982070923, "step": 4420 }, { "epoch": 2.32, "learning_rate": 7.465539066926322e-08, "logits/chosen": 0.21482165157794952, "logits/rejected": 0.2153225690126419, "logps/chosen": -393.70172119140625, "logps/rejected": -346.408935546875, "loss": 0.6383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.15823566913604736, "rewards/margins": 0.10698536783456802, "rewards/rejected": -0.2652210295200348, "step": 4430 }, { "epoch": 2.32, "learning_rate": 7.357350940861845e-08, "logits/chosen": 0.26978224515914917, "logits/rejected": 0.2641783356666565, "logps/chosen": -373.73516845703125, "logps/rejected": -330.3535461425781, "loss": 0.6499, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.1348724067211151, "rewards/margins": 0.08027593791484833, "rewards/rejected": -0.21514835953712463, "step": 4440 }, { "epoch": 2.33, "learning_rate": 7.249817045833726e-08, "logits/chosen": 0.20601427555084229, "logits/rejected": 0.1943071484565735, "logps/chosen": -355.9278259277344, "logps/rejected": -310.63531494140625, "loss": 0.6486, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.19451366364955902, "rewards/margins": 0.06895452737808228, "rewards/rejected": -0.2634682059288025, "step": 4450 }, { "epoch": 2.33, "learning_rate": 7.14294136945241e-08, "logits/chosen": 0.3372945487499237, "logits/rejected": 0.28835657238960266, "logps/chosen": -328.9299621582031, "logps/rejected": -289.6046142578125, "loss": 0.6494, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11903943866491318, "rewards/margins": 0.13281255960464478, "rewards/rejected": -0.25185197591781616, "step": 4460 }, { "epoch": 2.34, "learning_rate": 7.036727874920043e-08, "logits/chosen": 0.2705017328262329, "logits/rejected": 0.27911967039108276, "logps/chosen": -290.66314697265625, "logps/rejected": -310.8837890625, "loss": 0.6378, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16096973419189453, "rewards/margins": 0.10622663795948029, "rewards/rejected": -0.267196387052536, "step": 4470 }, { "epoch": 2.34, "learning_rate": 6.931180500883484e-08, "logits/chosen": 0.1446174830198288, "logits/rejected": 0.12518611550331116, "logps/chosen": -413.4287109375, "logps/rejected": -355.8533020019531, "loss": 0.6494, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1334320604801178, "rewards/margins": 0.12716497480869293, "rewards/rejected": -0.2605970501899719, "step": 4480 }, { "epoch": 2.35, "learning_rate": 6.826303161288302e-08, "logits/chosen": 0.24317510426044464, "logits/rejected": 0.22099463641643524, "logps/chosen": -361.7783203125, "logps/rejected": -312.4311218261719, "loss": 0.6575, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.13776350021362305, "rewards/margins": 0.14507371187210083, "rewards/rejected": -0.2828372120857239, "step": 4490 }, { "epoch": 2.35, "learning_rate": 6.722099745233594e-08, "logits/chosen": 0.24744892120361328, "logits/rejected": 0.25841349363327026, "logps/chosen": -391.8916015625, "logps/rejected": -308.3046569824219, "loss": 0.6622, "rewards/accuracies": 0.75, "rewards/chosen": -0.13982141017913818, "rewards/margins": 0.13293033838272095, "rewards/rejected": -0.27275174856185913, "step": 4500 }, { "epoch": 2.36, "learning_rate": 6.618574116827786e-08, "logits/chosen": 0.20393629372119904, "logits/rejected": 0.1978496015071869, "logps/chosen": -324.3951721191406, "logps/rejected": -361.393310546875, "loss": 0.6595, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.14017212390899658, "rewards/margins": 0.0554613396525383, "rewards/rejected": -0.19563347101211548, "step": 4510 }, { "epoch": 2.37, "learning_rate": 6.515730115045339e-08, "logits/chosen": 0.2902334928512573, "logits/rejected": 0.2500172257423401, "logps/chosen": -343.683349609375, "logps/rejected": -329.7452697753906, "loss": 0.6488, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09550157934427261, "rewards/margins": 0.15107180178165436, "rewards/rejected": -0.24657340347766876, "step": 4520 }, { "epoch": 2.37, "learning_rate": 6.413571553584399e-08, "logits/chosen": 0.31443777680397034, "logits/rejected": 0.298168420791626, "logps/chosen": -301.1230163574219, "logps/rejected": -361.3356018066406, "loss": 0.6454, "rewards/accuracies": 0.75, "rewards/chosen": -0.14689506590366364, "rewards/margins": 0.13481785356998444, "rewards/rejected": -0.28171294927597046, "step": 4530 }, { "epoch": 2.38, "learning_rate": 6.312102220725346e-08, "logits/chosen": 0.17988334596157074, "logits/rejected": 0.23794107139110565, "logps/chosen": -280.0445251464844, "logps/rejected": -317.3033142089844, "loss": 0.6398, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.18284687399864197, "rewards/margins": 0.08084186166524887, "rewards/rejected": -0.26368874311447144, "step": 4540 }, { "epoch": 2.38, "learning_rate": 6.21132587919036e-08, "logits/chosen": 0.2522971034049988, "logits/rejected": 0.26374301314353943, "logps/chosen": -305.38629150390625, "logps/rejected": -303.3993225097656, "loss": 0.6562, "rewards/accuracies": 0.625, "rewards/chosen": -0.16436639428138733, "rewards/margins": 0.09498941898345947, "rewards/rejected": -0.2593558430671692, "step": 4550 }, { "epoch": 2.39, "learning_rate": 6.111246266003859e-08, "logits/chosen": 0.29947465658187866, "logits/rejected": 0.2651771008968353, "logps/chosen": -356.18914794921875, "logps/rejected": -301.4688720703125, "loss": 0.644, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.1766948699951172, "rewards/margins": 0.05812176316976547, "rewards/rejected": -0.23481664061546326, "step": 4560 }, { "epoch": 2.39, "learning_rate": 6.011867092353934e-08, "logits/chosen": 0.23344139754772186, "logits/rejected": 0.24661417305469513, "logps/chosen": -288.53033447265625, "logps/rejected": -285.0890808105469, "loss": 0.6587, "rewards/accuracies": 0.625, "rewards/chosen": -0.1591729074716568, "rewards/margins": 0.07025494426488876, "rewards/rejected": -0.22942781448364258, "step": 4570 }, { "epoch": 2.4, "learning_rate": 5.9131920434547235e-08, "logits/chosen": 0.33210188150405884, "logits/rejected": 0.36312466859817505, "logps/chosen": -242.6085968017578, "logps/rejected": -289.72479248046875, "loss": 0.6515, "rewards/accuracies": 0.75, "rewards/chosen": -0.11827802658081055, "rewards/margins": 0.12428691238164902, "rewards/rejected": -0.24256494641304016, "step": 4580 }, { "epoch": 2.4, "learning_rate": 5.8152247784097664e-08, "logits/chosen": 0.24385061860084534, "logits/rejected": 0.29127827286720276, "logps/chosen": -299.4405822753906, "logps/rejected": -340.1574401855469, "loss": 0.6569, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.17524193227291107, "rewards/margins": 0.07939636707305908, "rewards/rejected": -0.25463834404945374, "step": 4590 }, { "epoch": 2.41, "learning_rate": 5.717968930076289e-08, "logits/chosen": 0.28695303201675415, "logits/rejected": 0.2945484519004822, "logps/chosen": -341.9978332519531, "logps/rejected": -280.802001953125, "loss": 0.653, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13538150489330292, "rewards/margins": 0.14087039232254028, "rewards/rejected": -0.276251882314682, "step": 4600 }, { "epoch": 2.41, "learning_rate": 5.621428104930528e-08, "logits/chosen": 0.2341652363538742, "logits/rejected": 0.2110733687877655, "logps/chosen": -276.9383239746094, "logps/rejected": -275.8529357910156, "loss": 0.6554, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.17822220921516418, "rewards/margins": 0.0678471028804779, "rewards/rejected": -0.2460693120956421, "step": 4610 }, { "epoch": 2.42, "learning_rate": 5.525605882933965e-08, "logits/chosen": 0.32365158200263977, "logits/rejected": 0.24177177250385284, "logps/chosen": -342.0087890625, "logps/rejected": -319.028564453125, "loss": 0.6471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11949370801448822, "rewards/margins": 0.1479637771844864, "rewards/rejected": -0.2674574851989746, "step": 4620 }, { "epoch": 2.42, "learning_rate": 5.4305058174005853e-08, "logits/chosen": 0.3449974060058594, "logits/rejected": 0.2847765386104584, "logps/chosen": -297.4521179199219, "logps/rejected": -292.80194091796875, "loss": 0.6543, "rewards/accuracies": 0.625, "rewards/chosen": -0.15853652358055115, "rewards/margins": 0.03810811787843704, "rewards/rejected": -0.1966446191072464, "step": 4630 }, { "epoch": 2.43, "learning_rate": 5.33613143486511e-08, "logits/chosen": 0.3558960258960724, "logits/rejected": 0.372741162776947, "logps/chosen": -263.94915771484375, "logps/rejected": -239.63687133789062, "loss": 0.6526, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1764945238828659, "rewards/margins": 0.09427468478679657, "rewards/rejected": -0.27076923847198486, "step": 4640 }, { "epoch": 2.43, "learning_rate": 5.242486234952206e-08, "logits/chosen": 0.2200225591659546, "logits/rejected": 0.2050172984600067, "logps/chosen": -323.3846130371094, "logps/rejected": -237.43331909179688, "loss": 0.6561, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16000418365001678, "rewards/margins": 0.11692575365304947, "rewards/rejected": -0.27692991495132446, "step": 4650 }, { "epoch": 2.44, "learning_rate": 5.149573690246758e-08, "logits/chosen": 0.25134509801864624, "logits/rejected": 0.3196043074131012, "logps/chosen": -328.13555908203125, "logps/rejected": -286.40325927734375, "loss": 0.6535, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.17728720605373383, "rewards/margins": 0.04581373184919357, "rewards/rejected": -0.2231009304523468, "step": 4660 }, { "epoch": 2.44, "learning_rate": 5.057397246165052e-08, "logits/chosen": 0.24966394901275635, "logits/rejected": 0.2608310580253601, "logps/chosen": -335.91424560546875, "logps/rejected": -341.7672119140625, "loss": 0.6554, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13174419105052948, "rewards/margins": 0.08406993746757507, "rewards/rejected": -0.21581411361694336, "step": 4670 }, { "epoch": 2.45, "learning_rate": 4.9659603208270173e-08, "logits/chosen": 0.20629934966564178, "logits/rejected": 0.19532974064350128, "logps/chosen": -393.67547607421875, "logps/rejected": -367.861083984375, "loss": 0.6388, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.07362738996744156, "rewards/margins": 0.16155405342578888, "rewards/rejected": -0.23518145084381104, "step": 4680 }, { "epoch": 2.45, "learning_rate": 4.875266304929496e-08, "logits/chosen": 0.28497210144996643, "logits/rejected": 0.2833143472671509, "logps/chosen": -315.1612854003906, "logps/rejected": -271.2629089355469, "loss": 0.6534, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.17749536037445068, "rewards/margins": 0.0488092266023159, "rewards/rejected": -0.2263045758008957, "step": 4690 }, { "epoch": 2.46, "learning_rate": 4.785318561620511e-08, "logits/chosen": 0.218004509806633, "logits/rejected": 0.24838721752166748, "logps/chosen": -409.41473388671875, "logps/rejected": -329.8982238769531, "loss": 0.6505, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09751011431217194, "rewards/margins": 0.14528706669807434, "rewards/rejected": -0.24279718101024628, "step": 4700 }, { "epoch": 2.46, "learning_rate": 4.696120426374503e-08, "logits/chosen": 0.3327026069164276, "logits/rejected": 0.30656957626342773, "logps/chosen": -311.63763427734375, "logps/rejected": -304.68841552734375, "loss": 0.6527, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16270048916339874, "rewards/margins": 0.0921703428030014, "rewards/rejected": -0.25487083196640015, "step": 4710 }, { "epoch": 2.47, "learning_rate": 4.607675206868705e-08, "logits/chosen": 0.22925233840942383, "logits/rejected": 0.2312936782836914, "logps/chosen": -353.39422607421875, "logps/rejected": -296.1007995605469, "loss": 0.6482, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12219414860010147, "rewards/margins": 0.14850768446922302, "rewards/rejected": -0.2707018256187439, "step": 4720 }, { "epoch": 2.48, "learning_rate": 4.519986182860452e-08, "logits/chosen": 0.193797767162323, "logits/rejected": 0.2653660774230957, "logps/chosen": -295.244873046875, "logps/rejected": -267.8289489746094, "loss": 0.6456, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1571170836687088, "rewards/margins": 0.13725200295448303, "rewards/rejected": -0.29436904191970825, "step": 4730 }, { "epoch": 2.48, "learning_rate": 4.433056606065552e-08, "logits/chosen": 0.20690850913524628, "logits/rejected": 0.22397968173027039, "logps/chosen": -328.7765808105469, "logps/rejected": -320.0532531738281, "loss": 0.6572, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1482156366109848, "rewards/margins": 0.09247883409261703, "rewards/rejected": -0.24069447815418243, "step": 4740 }, { "epoch": 2.49, "learning_rate": 4.3468897000377427e-08, "logits/chosen": 0.2184235155582428, "logits/rejected": 0.26873156428337097, "logps/chosen": -253.054931640625, "logps/rejected": -231.57522583007812, "loss": 0.6571, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1425979882478714, "rewards/margins": 0.0733003169298172, "rewards/rejected": -0.2158983051776886, "step": 4750 }, { "epoch": 2.49, "learning_rate": 4.2614886600491115e-08, "logits/chosen": 0.19274510443210602, "logits/rejected": 0.13967491686344147, "logps/chosen": -299.489013671875, "logps/rejected": -248.2526092529297, "loss": 0.6506, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1602405607700348, "rewards/margins": 0.08521406352519989, "rewards/rejected": -0.24545462429523468, "step": 4760 }, { "epoch": 2.5, "learning_rate": 4.1768566529716415e-08, "logits/chosen": 0.258291095495224, "logits/rejected": 0.2351008951663971, "logps/chosen": -297.2149353027344, "logps/rejected": -235.2070770263672, "loss": 0.6552, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.14642244577407837, "rewards/margins": 0.11697183549404144, "rewards/rejected": -0.263394296169281, "step": 4770 }, { "epoch": 2.5, "learning_rate": 4.0929968171597526e-08, "logits/chosen": 0.30216288566589355, "logits/rejected": 0.2607000768184662, "logps/chosen": -351.5521545410156, "logps/rejected": -286.664794921875, "loss": 0.6603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.13644321262836456, "rewards/margins": 0.135478213429451, "rewards/rejected": -0.27192145586013794, "step": 4780 }, { "epoch": 2.51, "learning_rate": 4.009912262333942e-08, "logits/chosen": 0.266966849565506, "logits/rejected": 0.27799034118652344, "logps/chosen": -301.3504638671875, "logps/rejected": -274.5638427734375, "loss": 0.6491, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15286056697368622, "rewards/margins": 0.0682164877653122, "rewards/rejected": -0.2210770845413208, "step": 4790 }, { "epoch": 2.51, "learning_rate": 3.927606069465442e-08, "logits/chosen": 0.25001880526542664, "logits/rejected": 0.2829858660697937, "logps/chosen": -288.3041076660156, "logps/rejected": -270.7674560546875, "loss": 0.6472, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11720645427703857, "rewards/margins": 0.1586742252111435, "rewards/rejected": -0.27588069438934326, "step": 4800 }, { "epoch": 2.52, "learning_rate": 3.8460812906620037e-08, "logits/chosen": 0.28133073449134827, "logits/rejected": 0.225880429148674, "logps/chosen": -277.67620849609375, "logps/rejected": -286.3600158691406, "loss": 0.6493, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1528925597667694, "rewards/margins": 0.10731463134288788, "rewards/rejected": -0.2602071762084961, "step": 4810 }, { "epoch": 2.52, "learning_rate": 3.765340949054696e-08, "logits/chosen": 0.28822416067123413, "logits/rejected": 0.2694561183452606, "logps/chosen": -341.54034423828125, "logps/rejected": -300.6506042480469, "loss": 0.6472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.14739778637886047, "rewards/margins": 0.13927289843559265, "rewards/rejected": -0.2866706848144531, "step": 4820 }, { "epoch": 2.53, "learning_rate": 3.685388038685811e-08, "logits/chosen": 0.25113362073898315, "logits/rejected": 0.20978930592536926, "logps/chosen": -391.368896484375, "logps/rejected": -324.4923400878906, "loss": 0.6436, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.14761051535606384, "rewards/margins": 0.11779048293828964, "rewards/rejected": -0.2654009759426117, "step": 4830 }, { "epoch": 2.53, "learning_rate": 3.60622552439783e-08, "logits/chosen": 0.30557817220687866, "logits/rejected": 0.28471964597702026, "logps/chosen": -309.66851806640625, "logps/rejected": -264.75189208984375, "loss": 0.6425, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15169253945350647, "rewards/margins": 0.07038528472185135, "rewards/rejected": -0.22207781672477722, "step": 4840 }, { "epoch": 2.54, "learning_rate": 3.527856341723479e-08, "logits/chosen": 0.3501953184604645, "logits/rejected": 0.36876240372657776, "logps/chosen": -275.45587158203125, "logps/rejected": -287.1562194824219, "loss": 0.6401, "rewards/accuracies": 0.75, "rewards/chosen": -0.1547791212797165, "rewards/margins": 0.11520648002624512, "rewards/rejected": -0.2699856162071228, "step": 4850 }, { "epoch": 2.54, "learning_rate": 3.4502833967768816e-08, "logits/chosen": 0.260027140378952, "logits/rejected": 0.2362249791622162, "logps/chosen": -355.7345886230469, "logps/rejected": -350.7608642578125, "loss": 0.655, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13119223713874817, "rewards/margins": 0.1489580124616623, "rewards/rejected": -0.28015023469924927, "step": 4860 }, { "epoch": 2.55, "learning_rate": 3.373509566145793e-08, "logits/chosen": 0.30141669511795044, "logits/rejected": 0.26752427220344543, "logps/chosen": -323.4403991699219, "logps/rejected": -272.54815673828125, "loss": 0.6551, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.15655481815338135, "rewards/margins": 0.11246142536401749, "rewards/rejected": -0.26901623606681824, "step": 4870 }, { "epoch": 2.55, "learning_rate": 3.2975376967849104e-08, "logits/chosen": 0.3648565411567688, "logits/rejected": 0.3760683536529541, "logps/chosen": -275.06890869140625, "logps/rejected": -216.0530548095703, "loss": 0.6525, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13468894362449646, "rewards/margins": 0.09779195487499237, "rewards/rejected": -0.23248091340065002, "step": 4880 }, { "epoch": 2.56, "learning_rate": 3.222370605910332e-08, "logits/chosen": 0.2788364887237549, "logits/rejected": 0.2764519155025482, "logps/chosen": -305.3879699707031, "logps/rejected": -265.95086669921875, "loss": 0.6438, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1286066770553589, "rewards/margins": 0.139145165681839, "rewards/rejected": -0.26775187253952026, "step": 4890 }, { "epoch": 2.56, "learning_rate": 3.1480110808950746e-08, "logits/chosen": 0.21629850566387177, "logits/rejected": 0.2902226150035858, "logps/chosen": -381.25604248046875, "logps/rejected": -385.64727783203125, "loss": 0.6333, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09950147569179535, "rewards/margins": 0.1605621874332428, "rewards/rejected": -0.26006367802619934, "step": 4900 }, { "epoch": 2.57, "learning_rate": 3.07446187916568e-08, "logits/chosen": 0.2720637321472168, "logits/rejected": 0.3146423399448395, "logps/chosen": -367.902587890625, "logps/rejected": -328.25006103515625, "loss": 0.6523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.138359934091568, "rewards/margins": 0.11164422333240509, "rewards/rejected": -0.2500041723251343, "step": 4910 }, { "epoch": 2.57, "learning_rate": 3.001725728100021e-08, "logits/chosen": 0.16783829033374786, "logits/rejected": 0.21476595103740692, "logps/chosen": -323.3216857910156, "logps/rejected": -346.15203857421875, "loss": 0.6429, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1808640956878662, "rewards/margins": 0.06437064707279205, "rewards/rejected": -0.24523475766181946, "step": 4920 }, { "epoch": 2.58, "learning_rate": 2.9298053249261238e-08, "logits/chosen": 0.21411249041557312, "logits/rejected": 0.2068692445755005, "logps/chosen": -301.1133728027344, "logps/rejected": -324.4837646484375, "loss": 0.6585, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.18682220578193665, "rewards/margins": 0.061571698635816574, "rewards/rejected": -0.24839389324188232, "step": 4930 }, { "epoch": 2.59, "learning_rate": 2.8587033366221534e-08, "logits/chosen": 0.31188955903053284, "logits/rejected": 0.3038763999938965, "logps/chosen": -335.08197021484375, "logps/rejected": -356.7730712890625, "loss": 0.6412, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13679111003875732, "rewards/margins": 0.11448071151971817, "rewards/rejected": -0.2512718141078949, "step": 4940 }, { "epoch": 2.59, "learning_rate": 2.7884223998175248e-08, "logits/chosen": 0.316620409488678, "logits/rejected": 0.3149321675300598, "logps/chosen": -250.12158203125, "logps/rejected": -260.7198486328125, "loss": 0.6407, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.18968793749809265, "rewards/margins": 0.04125380888581276, "rewards/rejected": -0.2309417724609375, "step": 4950 }, { "epoch": 2.6, "learning_rate": 2.718965120695141e-08, "logits/chosen": 0.21362006664276123, "logits/rejected": 0.24621066451072693, "logps/chosen": -362.2911682128906, "logps/rejected": -329.33746337890625, "loss": 0.6531, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1833564043045044, "rewards/margins": 0.039215873926877975, "rewards/rejected": -0.22257229685783386, "step": 4960 }, { "epoch": 2.6, "learning_rate": 2.6503340748947083e-08, "logits/chosen": 0.30787166953086853, "logits/rejected": 0.2651337683200836, "logps/chosen": -365.96856689453125, "logps/rejected": -298.203369140625, "loss": 0.6515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08608182519674301, "rewards/margins": 0.17453762888908386, "rewards/rejected": -0.26061946153640747, "step": 4970 }, { "epoch": 2.61, "learning_rate": 2.5825318074172763e-08, "logits/chosen": 0.3091648817062378, "logits/rejected": 0.3526086211204529, "logps/chosen": -330.2383117675781, "logps/rejected": -311.9805908203125, "loss": 0.6627, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.18691927194595337, "rewards/margins": 0.04171084240078926, "rewards/rejected": -0.22863011062145233, "step": 4980 }, { "epoch": 2.61, "learning_rate": 2.5155608325308358e-08, "logits/chosen": 0.3669896423816681, "logits/rejected": 0.34136396646499634, "logps/chosen": -289.3947448730469, "logps/rejected": -231.5489044189453, "loss": 0.6519, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.15817251801490784, "rewards/margins": 0.06915486603975296, "rewards/rejected": -0.2273273766040802, "step": 4990 }, { "epoch": 2.62, "learning_rate": 2.4494236336770695e-08, "logits/chosen": 0.23264248669147491, "logits/rejected": 0.28665319085121155, "logps/chosen": -327.484375, "logps/rejected": -304.4945983886719, "loss": 0.6509, "rewards/accuracies": 0.625, "rewards/chosen": -0.10859771817922592, "rewards/margins": 0.13269570469856262, "rewards/rejected": -0.24129343032836914, "step": 5000 }, { "epoch": 2.62, "learning_rate": 2.3841226633792983e-08, "logits/chosen": 0.38944217562675476, "logits/rejected": 0.4039441645145416, "logps/chosen": -265.03985595703125, "logps/rejected": -283.97796630859375, "loss": 0.6588, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.17852087318897247, "rewards/margins": 0.06068998575210571, "rewards/rejected": -0.23921087384223938, "step": 5010 }, { "epoch": 2.63, "learning_rate": 2.319660343151511e-08, "logits/chosen": 0.22561486065387726, "logits/rejected": 0.269603431224823, "logps/chosen": -277.7508850097656, "logps/rejected": -330.13018798828125, "loss": 0.6513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16024520993232727, "rewards/margins": 0.09627407789230347, "rewards/rejected": -0.25651925802230835, "step": 5020 }, { "epoch": 2.63, "learning_rate": 2.2560390634085715e-08, "logits/chosen": 0.2743435800075531, "logits/rejected": 0.2494051158428192, "logps/chosen": -335.19024658203125, "logps/rejected": -276.3586730957031, "loss": 0.6423, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1546277403831482, "rewards/margins": 0.1273549497127533, "rewards/rejected": -0.2819827198982239, "step": 5030 }, { "epoch": 2.64, "learning_rate": 2.1932611833775843e-08, "logits/chosen": 0.31680962443351746, "logits/rejected": 0.3296371102333069, "logps/chosen": -241.97036743164062, "logps/rejected": -226.87362670898438, "loss": 0.6501, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.15829713642597198, "rewards/margins": 0.05115853622555733, "rewards/rejected": -0.2094556987285614, "step": 5040 }, { "epoch": 2.64, "learning_rate": 2.1313290310103897e-08, "logits/chosen": 0.25922948122024536, "logits/rejected": 0.27459073066711426, "logps/chosen": -303.64093017578125, "logps/rejected": -268.06903076171875, "loss": 0.6481, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.14507296681404114, "rewards/margins": 0.05711113288998604, "rewards/rejected": -0.20218412578105927, "step": 5050 }, { "epoch": 2.65, "learning_rate": 2.0702449028972696e-08, "logits/chosen": 0.2910730540752411, "logits/rejected": 0.21585910022258759, "logps/chosen": -404.71173095703125, "logps/rejected": -333.95281982421875, "loss": 0.6444, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1354363113641739, "rewards/margins": 0.13795578479766846, "rewards/rejected": -0.2733921408653259, "step": 5060 }, { "epoch": 2.65, "learning_rate": 2.0100110641817547e-08, "logits/chosen": 0.35233062505722046, "logits/rejected": 0.35314399003982544, "logps/chosen": -290.5165100097656, "logps/rejected": -295.0611267089844, "loss": 0.6561, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15800753235816956, "rewards/margins": 0.08591978251934052, "rewards/rejected": -0.24392731487751007, "step": 5070 }, { "epoch": 2.66, "learning_rate": 1.9506297484766427e-08, "logits/chosen": 0.2969042658805847, "logits/rejected": 0.24884216487407684, "logps/chosen": -309.47406005859375, "logps/rejected": -283.39501953125, "loss": 0.6405, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16573520004749298, "rewards/margins": 0.12126419693231583, "rewards/rejected": -0.2869994044303894, "step": 5080 }, { "epoch": 2.66, "learning_rate": 1.8921031577811692e-08, "logits/chosen": 0.30634480714797974, "logits/rejected": 0.23022684454917908, "logps/chosen": -341.3998107910156, "logps/rejected": -308.49285888671875, "loss": 0.6408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.17996549606323242, "rewards/margins": 0.09575355052947998, "rewards/rejected": -0.2757190465927124, "step": 5090 }, { "epoch": 2.67, "learning_rate": 1.834433462399351e-08, "logits/chosen": 0.23569568991661072, "logits/rejected": 0.2792285084724426, "logps/chosen": -388.8201599121094, "logps/rejected": -319.556884765625, "loss": 0.651, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08855469524860382, "rewards/margins": 0.13768477737903595, "rewards/rejected": -0.22623948752880096, "step": 5100 }, { "epoch": 2.67, "learning_rate": 1.7776228008594962e-08, "logits/chosen": 0.266242653131485, "logits/rejected": 0.2676263749599457, "logps/chosen": -301.49359130859375, "logps/rejected": -362.7807312011719, "loss": 0.6505, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.17288120090961456, "rewards/margins": 0.12345176935195923, "rewards/rejected": -0.2963329553604126, "step": 5110 }, { "epoch": 2.68, "learning_rate": 1.721673279834926e-08, "logits/chosen": 0.2719994783401489, "logits/rejected": 0.22567155957221985, "logps/chosen": -358.1050720214844, "logps/rejected": -335.2202453613281, "loss": 0.6548, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1040639653801918, "rewards/margins": 0.1355864554643631, "rewards/rejected": -0.2396504133939743, "step": 5120 }, { "epoch": 2.68, "learning_rate": 1.666586974065831e-08, "logits/chosen": 0.19955766201019287, "logits/rejected": 0.22099065780639648, "logps/chosen": -400.0057067871094, "logps/rejected": -326.6587829589844, "loss": 0.6457, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1376148909330368, "rewards/margins": 0.14305511116981506, "rewards/rejected": -0.28067004680633545, "step": 5130 }, { "epoch": 2.69, "learning_rate": 1.6123659262823497e-08, "logits/chosen": 0.2476876676082611, "logits/rejected": 0.31213703751564026, "logps/chosen": -334.13592529296875, "logps/rejected": -341.7435607910156, "loss": 0.6548, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1919669210910797, "rewards/margins": 0.05596815422177315, "rewards/rejected": -0.24793505668640137, "step": 5140 }, { "epoch": 2.69, "learning_rate": 1.5590121471288104e-08, "logits/chosen": 0.17362567782402039, "logits/rejected": 0.22493436932563782, "logps/chosen": -305.18109130859375, "logps/rejected": -284.05657958984375, "loss": 0.6466, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.18385998904705048, "rewards/margins": 0.07839556038379669, "rewards/rejected": -0.2622555196285248, "step": 5150 }, { "epoch": 2.7, "learning_rate": 1.5065276150891787e-08, "logits/chosen": 0.33663293719291687, "logits/rejected": 0.34182173013687134, "logps/chosen": -353.5645751953125, "logps/rejected": -282.2459716796875, "loss": 0.6498, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.14286093413829803, "rewards/margins": 0.061833806335926056, "rewards/rejected": -0.2046947181224823, "step": 5160 }, { "epoch": 2.71, "learning_rate": 1.4549142764136768e-08, "logits/chosen": 0.19153036177158356, "logits/rejected": 0.20434775948524475, "logps/chosen": -255.4372100830078, "logps/rejected": -241.89028930664062, "loss": 0.6467, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.19665054976940155, "rewards/margins": 0.032036345452070236, "rewards/rejected": -0.22868689894676208, "step": 5170 }, { "epoch": 2.71, "learning_rate": 1.4041740450466383e-08, "logits/chosen": 0.21942445635795593, "logits/rejected": 0.26775887608528137, "logps/chosen": -307.8087463378906, "logps/rejected": -300.2779541015625, "loss": 0.6502, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.14142893254756927, "rewards/margins": 0.1599656045436859, "rewards/rejected": -0.3013945519924164, "step": 5180 }, { "epoch": 2.72, "learning_rate": 1.3543088025555094e-08, "logits/chosen": 0.26968908309936523, "logits/rejected": 0.3253975510597229, "logps/chosen": -323.02215576171875, "logps/rejected": -304.500732421875, "loss": 0.643, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16611634194850922, "rewards/margins": 0.05251390486955643, "rewards/rejected": -0.21863026916980743, "step": 5190 }, { "epoch": 2.72, "learning_rate": 1.3053203980610744e-08, "logits/chosen": 0.2522396445274353, "logits/rejected": 0.27969008684158325, "logps/chosen": -273.38287353515625, "logps/rejected": -268.48370361328125, "loss": 0.6467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.18352146446704865, "rewards/margins": 0.06839027255773544, "rewards/rejected": -0.2519117295742035, "step": 5200 }, { "epoch": 2.73, "learning_rate": 1.2572106481689243e-08, "logits/chosen": 0.28615397214889526, "logits/rejected": 0.2679155170917511, "logps/chosen": -298.0022277832031, "logps/rejected": -299.09765625, "loss": 0.6431, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16940812766551971, "rewards/margins": 0.08789737522602081, "rewards/rejected": -0.25730547308921814, "step": 5210 }, { "epoch": 2.73, "learning_rate": 1.2099813369020467e-08, "logits/chosen": 0.31619611382484436, "logits/rejected": 0.32709795236587524, "logps/chosen": -288.56951904296875, "logps/rejected": -293.2323303222656, "loss": 0.6493, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.17724359035491943, "rewards/margins": 0.07778071612119675, "rewards/rejected": -0.2550243139266968, "step": 5220 }, { "epoch": 2.74, "learning_rate": 1.1636342156346846e-08, "logits/chosen": 0.2666959762573242, "logits/rejected": 0.2191891372203827, "logps/chosen": -348.546630859375, "logps/rejected": -304.4643859863281, "loss": 0.637, "rewards/accuracies": 0.75, "rewards/chosen": -0.12001262605190277, "rewards/margins": 0.17269203066825867, "rewards/rejected": -0.29270467162132263, "step": 5230 }, { "epoch": 2.74, "learning_rate": 1.1181710030274043e-08, "logits/chosen": 0.2580859065055847, "logits/rejected": 0.19054308533668518, "logps/chosen": -390.20050048828125, "logps/rejected": -336.88482666015625, "loss": 0.6508, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.15105481445789337, "rewards/margins": 0.10737206786870956, "rewards/rejected": -0.25842687487602234, "step": 5240 }, { "epoch": 2.75, "learning_rate": 1.0735933849633561e-08, "logits/chosen": 0.26194819808006287, "logits/rejected": 0.285000741481781, "logps/chosen": -263.4084777832031, "logps/rejected": -237.882568359375, "loss": 0.6546, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.17627274990081787, "rewards/margins": 0.08483530580997467, "rewards/rejected": -0.26110807061195374, "step": 5250 }, { "epoch": 2.75, "learning_rate": 1.0299030144857445e-08, "logits/chosen": 0.2571147680282593, "logits/rejected": 0.24691708385944366, "logps/chosen": -366.71435546875, "logps/rejected": -301.5981140136719, "loss": 0.6364, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10573141276836395, "rewards/margins": 0.1143946647644043, "rewards/rejected": -0.22012607753276825, "step": 5260 }, { "epoch": 2.76, "learning_rate": 9.871015117365516e-09, "logits/chosen": 0.2549286484718323, "logits/rejected": 0.22352001070976257, "logps/chosen": -308.32208251953125, "logps/rejected": -247.04324340820312, "loss": 0.6534, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13238653540611267, "rewards/margins": 0.17268718779087067, "rewards/rejected": -0.30507367849349976, "step": 5270 }, { "epoch": 2.76, "learning_rate": 9.451904638964447e-09, "logits/chosen": 0.24427881836891174, "logits/rejected": 0.2131018191576004, "logps/chosen": -360.16229248046875, "logps/rejected": -383.1683349609375, "loss": 0.6616, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1408374309539795, "rewards/margins": 0.0509166419506073, "rewards/rejected": -0.1917540729045868, "step": 5280 }, { "epoch": 2.77, "learning_rate": 9.041714251259214e-09, "logits/chosen": 0.2528178095817566, "logits/rejected": 0.23741415143013, "logps/chosen": -299.9393005371094, "logps/rejected": -269.7351989746094, "loss": 0.6448, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.16982091963291168, "rewards/margins": 0.10371136665344238, "rewards/rejected": -0.27353227138519287, "step": 5290 }, { "epoch": 2.77, "learning_rate": 8.640459165076857e-09, "logits/chosen": 0.15313532948493958, "logits/rejected": 0.23000986874103546, "logps/chosen": -320.9138488769531, "logps/rejected": -266.3368835449219, "loss": 0.6559, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1354338675737381, "rewards/margins": 0.07715844362974167, "rewards/rejected": -0.21259228885173798, "step": 5300 }, { "epoch": 2.78, "learning_rate": 8.248154259902246e-09, "logits/chosen": 0.3111906051635742, "logits/rejected": 0.30129846930503845, "logps/chosen": -243.94140625, "logps/rejected": -262.7614440917969, "loss": 0.6409, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1589013636112213, "rewards/margins": 0.06876397132873535, "rewards/rejected": -0.22766533493995667, "step": 5310 }, { "epoch": 2.78, "learning_rate": 7.86481408332651e-09, "logits/chosen": 0.23351125419139862, "logits/rejected": 0.248075932264328, "logps/chosen": -362.76446533203125, "logps/rejected": -319.3492736816406, "loss": 0.6513, "rewards/accuracies": 0.75, "rewards/chosen": -0.10988447815179825, "rewards/margins": 0.1581057608127594, "rewards/rejected": -0.26799023151397705, "step": 5320 }, { "epoch": 2.79, "learning_rate": 7.490452850507506e-09, "logits/chosen": 0.25614452362060547, "logits/rejected": 0.22783274948596954, "logps/chosen": -309.2509460449219, "logps/rejected": -323.1338806152344, "loss": 0.6487, "rewards/accuracies": 0.625, "rewards/chosen": -0.15227383375167847, "rewards/margins": 0.11466242372989655, "rewards/rejected": -0.2669362425804138, "step": 5330 }, { "epoch": 2.79, "learning_rate": 7.1250844436426535e-09, "logits/chosen": 0.23697538673877716, "logits/rejected": 0.3347683548927307, "logps/chosen": -290.0230407714844, "logps/rejected": -326.98626708984375, "loss": 0.6518, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.17432980239391327, "rewards/margins": 0.059077221900224686, "rewards/rejected": -0.23340705037117004, "step": 5340 }, { "epoch": 2.8, "learning_rate": 6.768722411454153e-09, "logits/chosen": 0.3221455216407776, "logits/rejected": 0.3266808092594147, "logps/chosen": -335.4026794433594, "logps/rejected": -314.0110778808594, "loss": 0.6511, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15195028483867645, "rewards/margins": 0.08262494951486588, "rewards/rejected": -0.23457522690296173, "step": 5350 }, { "epoch": 2.8, "learning_rate": 6.421379968686663e-09, "logits/chosen": 0.27548637986183167, "logits/rejected": 0.2258666306734085, "logps/chosen": -308.8509826660156, "logps/rejected": -300.86566162109375, "loss": 0.6423, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1670268476009369, "rewards/margins": 0.09721226990222931, "rewards/rejected": -0.2642391324043274, "step": 5360 }, { "epoch": 2.81, "learning_rate": 6.083069995617113e-09, "logits/chosen": 0.3081795573234558, "logits/rejected": 0.339841365814209, "logps/chosen": -307.8616027832031, "logps/rejected": -261.6089172363281, "loss": 0.6438, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12482471764087677, "rewards/margins": 0.1607241928577423, "rewards/rejected": -0.28554895520210266, "step": 5370 }, { "epoch": 2.82, "learning_rate": 5.753805037577192e-09, "logits/chosen": 0.22058221697807312, "logits/rejected": 0.23570296168327332, "logps/chosen": -345.9975891113281, "logps/rejected": -292.41510009765625, "loss": 0.6427, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11676367372274399, "rewards/margins": 0.13583344221115112, "rewards/rejected": -0.2525970935821533, "step": 5380 }, { "epoch": 2.82, "learning_rate": 5.433597304488113e-09, "logits/chosen": 0.30112963914871216, "logits/rejected": 0.2849612832069397, "logps/chosen": -310.7317810058594, "logps/rejected": -260.43902587890625, "loss": 0.6522, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15954820811748505, "rewards/margins": 0.08634625375270844, "rewards/rejected": -0.24589447677135468, "step": 5390 }, { "epoch": 2.83, "learning_rate": 5.122458670407836e-09, "logits/chosen": 0.25500181317329407, "logits/rejected": 0.26972508430480957, "logps/chosen": -353.02105712890625, "logps/rejected": -318.2257385253906, "loss": 0.6532, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10740556567907333, "rewards/margins": 0.1574164181947708, "rewards/rejected": -0.26482197642326355, "step": 5400 }, { "epoch": 2.83, "learning_rate": 4.820400673090669e-09, "logits/chosen": 0.235460564494133, "logits/rejected": 0.25399118661880493, "logps/chosen": -298.4765625, "logps/rejected": -291.35833740234375, "loss": 0.6464, "rewards/accuracies": 0.625, "rewards/chosen": -0.19252967834472656, "rewards/margins": 0.07367880642414093, "rewards/rejected": -0.2662084698677063, "step": 5410 }, { "epoch": 2.84, "learning_rate": 4.5274345135595525e-09, "logits/chosen": 0.20221033692359924, "logits/rejected": 0.22863800823688507, "logps/chosen": -316.8031921386719, "logps/rejected": -259.7807312011719, "loss": 0.652, "rewards/accuracies": 0.625, "rewards/chosen": -0.18545842170715332, "rewards/margins": 0.08575184643268585, "rewards/rejected": -0.271210253238678, "step": 5420 }, { "epoch": 2.84, "learning_rate": 4.243571055690648e-09, "logits/chosen": 0.3070654273033142, "logits/rejected": 0.33014267683029175, "logps/chosen": -316.16876220703125, "logps/rejected": -275.2470703125, "loss": 0.6541, "rewards/accuracies": 0.625, "rewards/chosen": -0.1588907241821289, "rewards/margins": 0.10213694721460342, "rewards/rejected": -0.26102766394615173, "step": 5430 }, { "epoch": 2.85, "learning_rate": 3.968820825810431e-09, "logits/chosen": 0.204188734292984, "logits/rejected": 0.23602977395057678, "logps/chosen": -255.2363739013672, "logps/rejected": -281.852783203125, "loss": 0.653, "rewards/accuracies": 0.625, "rewards/chosen": -0.17094658315181732, "rewards/margins": 0.0320189967751503, "rewards/rejected": -0.20296558737754822, "step": 5440 }, { "epoch": 2.85, "learning_rate": 3.7031940123053997e-09, "logits/chosen": 0.20005813241004944, "logits/rejected": 0.1966632455587387, "logps/chosen": -378.98028564453125, "logps/rejected": -298.57147216796875, "loss": 0.6502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11689343303442001, "rewards/margins": 0.11867674440145493, "rewards/rejected": -0.23557019233703613, "step": 5450 }, { "epoch": 2.86, "learning_rate": 3.4467004652442842e-09, "logits/chosen": 0.27505481243133545, "logits/rejected": 0.289678156375885, "logps/chosen": -343.13421630859375, "logps/rejected": -371.6892395019531, "loss": 0.6576, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1668560802936554, "rewards/margins": 0.056518711149692535, "rewards/rejected": -0.22337479889392853, "step": 5460 }, { "epoch": 2.86, "learning_rate": 3.1993496960127653e-09, "logits/chosen": 0.3392692506313324, "logits/rejected": 0.39931219816207886, "logps/chosen": -337.546142578125, "logps/rejected": -295.0710754394531, "loss": 0.6407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15826204419136047, "rewards/margins": 0.12155953794717789, "rewards/rejected": -0.27982157468795776, "step": 5470 }, { "epoch": 2.87, "learning_rate": 2.9611508769606663e-09, "logits/chosen": 0.3724585175514221, "logits/rejected": 0.352268248796463, "logps/chosen": -331.1094055175781, "logps/rejected": -318.9394226074219, "loss": 0.6465, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15400220453739166, "rewards/margins": 0.11374969780445099, "rewards/rejected": -0.26775187253952026, "step": 5480 }, { "epoch": 2.87, "learning_rate": 2.7321128410620344e-09, "logits/chosen": 0.27805405855178833, "logits/rejected": 0.2672078013420105, "logps/chosen": -279.8021545410156, "logps/rejected": -265.70623779296875, "loss": 0.6481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.168594628572464, "rewards/margins": 0.07576708495616913, "rewards/rejected": -0.2443617284297943, "step": 5490 }, { "epoch": 2.88, "learning_rate": 2.5122440815873724e-09, "logits/chosen": 0.17598295211791992, "logits/rejected": 0.17596113681793213, "logps/chosen": -323.70770263671875, "logps/rejected": -322.7217102050781, "loss": 0.6518, "rewards/accuracies": 0.75, "rewards/chosen": -0.12877288460731506, "rewards/margins": 0.17089615762233734, "rewards/rejected": -0.2996690571308136, "step": 5500 }, { "epoch": 2.88, "learning_rate": 2.301552751788838e-09, "logits/chosen": 0.1916041374206543, "logits/rejected": 0.2576969563961029, "logps/chosen": -275.68389892578125, "logps/rejected": -232.2618865966797, "loss": 0.6408, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16759951412677765, "rewards/margins": 0.09753932058811188, "rewards/rejected": -0.26513880491256714, "step": 5510 }, { "epoch": 2.89, "learning_rate": 2.1000466645978433e-09, "logits/chosen": 0.24124963581562042, "logits/rejected": 0.26684752106666565, "logps/chosen": -298.94512939453125, "logps/rejected": -264.2241516113281, "loss": 0.6566, "rewards/accuracies": 0.625, "rewards/chosen": -0.1678083837032318, "rewards/margins": 0.09957209974527359, "rewards/rejected": -0.2673804759979248, "step": 5520 }, { "epoch": 2.89, "learning_rate": 1.9077332923353728e-09, "logits/chosen": 0.2285464107990265, "logits/rejected": 0.2565908133983612, "logps/chosen": -353.0976867675781, "logps/rejected": -318.3909606933594, "loss": 0.6449, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12425023317337036, "rewards/margins": 0.18167167901992798, "rewards/rejected": -0.30592188239097595, "step": 5530 }, { "epoch": 2.9, "learning_rate": 1.7246197664347872e-09, "logits/chosen": 0.26658621430397034, "logits/rejected": 0.2418135702610016, "logps/chosen": -272.8917236328125, "logps/rejected": -229.7845001220703, "loss": 0.6553, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1345643848180771, "rewards/margins": 0.13201124966144562, "rewards/rejected": -0.2665756344795227, "step": 5540 }, { "epoch": 2.9, "learning_rate": 1.5507128771775346e-09, "logits/chosen": 0.12898774445056915, "logits/rejected": 0.2068668156862259, "logps/chosen": -326.1919860839844, "logps/rejected": -342.0452575683594, "loss": 0.6396, "rewards/accuracies": 0.625, "rewards/chosen": -0.19251129031181335, "rewards/margins": 0.06093855947256088, "rewards/rejected": -0.25344985723495483, "step": 5550 }, { "epoch": 2.91, "learning_rate": 1.3860190734411858e-09, "logits/chosen": 0.1829042136669159, "logits/rejected": 0.19009463489055634, "logps/chosen": -242.2975616455078, "logps/rejected": -209.1060333251953, "loss": 0.6528, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1318592131137848, "rewards/margins": 0.10238520056009293, "rewards/rejected": -0.2342444211244583, "step": 5560 }, { "epoch": 2.91, "learning_rate": 1.2305444624604034e-09, "logits/chosen": 0.21027176082134247, "logits/rejected": 0.21280896663665771, "logps/chosen": -309.7834167480469, "logps/rejected": -266.2825012207031, "loss": 0.6492, "rewards/accuracies": 0.75, "rewards/chosen": -0.15875230729579926, "rewards/margins": 0.10218574851751328, "rewards/rejected": -0.2609381079673767, "step": 5570 }, { "epoch": 2.92, "learning_rate": 1.0842948096004835e-09, "logits/chosen": 0.2691759467124939, "logits/rejected": 0.2674081325531006, "logps/chosen": -276.37274169921875, "logps/rejected": -237.9047393798828, "loss": 0.6444, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.11138792335987091, "rewards/margins": 0.1745198369026184, "rewards/rejected": -0.2859077453613281, "step": 5580 }, { "epoch": 2.93, "learning_rate": 9.472755381434161e-10, "logits/chosen": 0.32743868231773376, "logits/rejected": 0.31935983896255493, "logps/chosen": -258.36651611328125, "logps/rejected": -276.4884948730469, "loss": 0.6507, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.19904500246047974, "rewards/margins": 0.02864205464720726, "rewards/rejected": -0.2276870757341385, "step": 5590 }, { "epoch": 2.93, "learning_rate": 8.194917290869907e-10, "logits/chosen": 0.28455591201782227, "logits/rejected": 0.2898333966732025, "logps/chosen": -334.1565856933594, "logps/rejected": -255.84130859375, "loss": 0.635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.15140490233898163, "rewards/margins": 0.15295550227165222, "rewards/rejected": -0.30436041951179504, "step": 5600 }, { "epoch": 2.94, "learning_rate": 7.009481209561685e-10, "logits/chosen": 0.26674506068229675, "logits/rejected": 0.24132461845874786, "logps/chosen": -368.4808349609375, "logps/rejected": -337.98358154296875, "loss": 0.6436, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16283926367759705, "rewards/margins": 0.06844694167375565, "rewards/rejected": -0.2312861979007721, "step": 5610 }, { "epoch": 2.94, "learning_rate": 5.916491096275845e-10, "logits/chosen": 0.21892204880714417, "logits/rejected": 0.2129373550415039, "logps/chosen": -298.697998046875, "logps/rejected": -279.541015625, "loss": 0.6529, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.2142627239227295, "rewards/margins": 0.043218888342380524, "rewards/rejected": -0.2574816346168518, "step": 5620 }, { "epoch": 2.95, "learning_rate": 4.915987481662887e-10, "logits/chosen": 0.24894659221172333, "logits/rejected": 0.26731112599372864, "logps/chosen": -295.58404541015625, "logps/rejected": -274.62158203125, "loss": 0.6419, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1257566511631012, "rewards/margins": 0.1312040090560913, "rewards/rejected": -0.2569606900215149, "step": 5630 }, { "epoch": 2.95, "learning_rate": 4.0080074667570017e-10, "logits/chosen": 0.29957491159439087, "logits/rejected": 0.3261059820652008, "logps/chosen": -354.64599609375, "logps/rejected": -299.35784912109375, "loss": 0.6496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.16229000687599182, "rewards/margins": 0.11383986473083496, "rewards/rejected": -0.2761298716068268, "step": 5640 }, { "epoch": 2.96, "learning_rate": 3.1925847215980017e-10, "logits/chosen": 0.1392301321029663, "logits/rejected": 0.21554407477378845, "logps/chosen": -423.4371032714844, "logps/rejected": -379.70635986328125, "loss": 0.6422, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.16402511298656464, "rewards/margins": 0.11438943445682526, "rewards/rejected": -0.2784145474433899, "step": 5650 }, { "epoch": 2.96, "learning_rate": 2.469749483985095e-10, "logits/chosen": 0.3565462827682495, "logits/rejected": 0.36484184861183167, "logps/chosen": -282.6678771972656, "logps/rejected": -300.89361572265625, "loss": 0.6606, "rewards/accuracies": 0.625, "rewards/chosen": -0.17763136327266693, "rewards/margins": 0.06572236865758896, "rewards/rejected": -0.2433536946773529, "step": 5660 }, { "epoch": 2.97, "learning_rate": 1.8395285583530652e-10, "logits/chosen": 0.24649211764335632, "logits/rejected": 0.2343049943447113, "logps/chosen": -367.00567626953125, "logps/rejected": -310.95733642578125, "loss": 0.6447, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11231978982686996, "rewards/margins": 0.1657448410987854, "rewards/rejected": -0.27806463837623596, "step": 5670 }, { "epoch": 2.97, "learning_rate": 1.3019453147805614e-10, "logits/chosen": 0.29656416177749634, "logits/rejected": 0.2665735185146332, "logps/chosen": -250.88455200195312, "logps/rejected": -238.7122802734375, "loss": 0.6589, "rewards/accuracies": 0.625, "rewards/chosen": -0.1941298097372055, "rewards/margins": 0.03171468526124954, "rewards/rejected": -0.22584450244903564, "step": 5680 }, { "epoch": 2.98, "learning_rate": 8.570196881216297e-11, "logits/chosen": 0.3129049837589264, "logits/rejected": 0.2954845726490021, "logps/chosen": -409.77923583984375, "logps/rejected": -367.190185546875, "loss": 0.6429, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10958588123321533, "rewards/margins": 0.10764807462692261, "rewards/rejected": -0.21723394095897675, "step": 5690 }, { "epoch": 2.98, "learning_rate": 5.0476817726852194e-11, "logits/chosen": 0.2936273217201233, "logits/rejected": 0.27621278166770935, "logps/chosen": -325.35809326171875, "logps/rejected": -317.4355163574219, "loss": 0.6502, "rewards/accuracies": 0.75, "rewards/chosen": -0.1530463546514511, "rewards/margins": 0.14051346480846405, "rewards/rejected": -0.2935597896575928, "step": 5700 }, { "epoch": 2.99, "learning_rate": 2.4520384453746712e-11, "logits/chosen": 0.31328773498535156, "logits/rejected": 0.23820796608924866, "logps/chosen": -366.4006042480469, "logps/rejected": -264.0309753417969, "loss": 0.6488, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1539250612258911, "rewards/margins": 0.1407986581325531, "rewards/rejected": -0.2947237193584442, "step": 5710 }, { "epoch": 2.99, "learning_rate": 7.833631518627815e-12, "logits/chosen": 0.197910338640213, "logits/rejected": 0.16825783252716064, "logps/chosen": -319.4342041015625, "logps/rejected": -324.4068908691406, "loss": 0.6445, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.15306967496871948, "rewards/margins": 0.12025105953216553, "rewards/rejected": -0.273320734500885, "step": 5720 }, { "epoch": 3.0, "learning_rate": 4.1717770565830033e-13, "logits/chosen": 0.24477490782737732, "logits/rejected": 0.24169504642486572, "logps/chosen": -302.2257995605469, "logps/rejected": -296.517822265625, "loss": 0.6535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.13678932189941406, "rewards/margins": 0.1357504278421402, "rewards/rejected": -0.27253976464271545, "step": 5730 }, { "epoch": 3.0, "step": 5733, "total_flos": 0.0, "train_loss": 0.6659654320045311, "train_runtime": 38381.4335, "train_samples_per_second": 4.778, "train_steps_per_second": 0.149 } ], "logging_steps": 10, "max_steps": 5733, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }