diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8598 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999182137891551, + "eval_steps": 500, + "global_step": 6113, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 8.169934640522877e-09, + "logits/chosen": -2.486781120300293, + "logits/rejected": -2.4319710731506348, + "logps/chosen": -108.4718017578125, + "logps/rejected": -120.67897033691406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 8.169934640522877e-08, + "logits/chosen": -1.995856761932373, + "logits/rejected": -1.6258703470230103, + "logps/chosen": -326.208984375, + "logps/rejected": -280.4178161621094, + "loss": 0.6932, + "rewards/accuracies": 0.3888888955116272, + "rewards/chosen": -0.00045196537394076586, + "rewards/margins": -0.0005653805565088987, + "rewards/rejected": 0.00011341518256813288, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 1.6339869281045755e-07, + "logits/chosen": -1.8469102382659912, + "logits/rejected": -1.7917423248291016, + "logps/chosen": -249.0980987548828, + "logps/rejected": -289.9396667480469, + "loss": 0.6931, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.00024314592883456498, + "rewards/margins": 0.0008744850638322532, + "rewards/rejected": -0.0011176310945302248, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 2.4509803921568627e-07, + "logits/chosen": -1.8108108043670654, + "logits/rejected": -1.795182466506958, + "logps/chosen": -233.0114288330078, + "logps/rejected": -269.768798828125, + "loss": 0.6929, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0007357263239100575, + "rewards/margins": 0.00102086435072124, + "rewards/rejected": -0.0002851380850188434, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 3.267973856209151e-07, + "logits/chosen": -2.138723611831665, + "logits/rejected": -1.9693067073822021, + "logps/chosen": -265.2377624511719, + "logps/rejected": -265.48468017578125, + "loss": 0.6928, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00028720468981191516, + "rewards/margins": 0.004331021569669247, + "rewards/rejected": -0.004618226084858179, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 4.084967320261438e-07, + "logits/chosen": -1.982277512550354, + "logits/rejected": -1.935686707496643, + "logps/chosen": -215.1453399658203, + "logps/rejected": -243.1034393310547, + "loss": 0.6922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0010335505940020084, + "rewards/margins": 0.008878698572516441, + "rewards/rejected": -0.00784514844417572, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -1.8854526281356812, + "logits/rejected": -1.7856941223144531, + "logps/chosen": -249.6039276123047, + "logps/rejected": -227.83657836914062, + "loss": 0.6921, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0024430998601019382, + "rewards/margins": 0.011661484837532043, + "rewards/rejected": -0.014104584231972694, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 5.718954248366013e-07, + "logits/chosen": -1.975188970565796, + "logits/rejected": -1.7991580963134766, + "logps/chosen": -292.2620849609375, + "logps/rejected": -278.5631103515625, + "loss": 0.6921, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.004684583283960819, + "rewards/margins": 0.011653682217001915, + "rewards/rejected": -0.01633826456964016, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 6.535947712418302e-07, + "logits/chosen": -1.9011461734771729, + "logits/rejected": -1.9221051931381226, + "logps/chosen": -240.9386749267578, + "logps/rejected": -268.3078918457031, + "loss": 0.6907, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0002921714331023395, + "rewards/margins": 0.03314762935042381, + "rewards/rejected": -0.03343980386853218, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 7.352941176470589e-07, + "logits/chosen": -2.029928684234619, + "logits/rejected": -1.5811048746109009, + "logps/chosen": -306.3148498535156, + "logps/rejected": -251.1549072265625, + "loss": 0.6876, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.008320676162838936, + "rewards/margins": 0.09511779248714447, + "rewards/rejected": -0.10343847423791885, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 8.169934640522876e-07, + "logits/chosen": -1.9319264888763428, + "logits/rejected": -1.5012649297714233, + "logps/chosen": -332.74249267578125, + "logps/rejected": -237.0785675048828, + "loss": 0.6915, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.010155880823731422, + "rewards/margins": 0.07823146134614944, + "rewards/rejected": -0.08838734030723572, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 8.986928104575164e-07, + "logits/chosen": -1.8104137182235718, + "logits/rejected": -1.827924370765686, + "logps/chosen": -293.01641845703125, + "logps/rejected": -269.2078857421875, + "loss": 0.6925, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.024693164974451065, + "rewards/margins": 0.08393082767724991, + "rewards/rejected": -0.10862399637699127, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": -2.0554537773132324, + "logits/rejected": -1.9354078769683838, + "logps/chosen": -290.5410461425781, + "logps/rejected": -305.27569580078125, + "loss": 0.6902, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.006353466305881739, + "rewards/margins": 0.05001940205693245, + "rewards/rejected": -0.05637286975979805, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 1.0620915032679739e-06, + "logits/chosen": -1.8481800556182861, + "logits/rejected": -1.79215407371521, + "logps/chosen": -241.8577423095703, + "logps/rejected": -224.6114501953125, + "loss": 0.6892, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.015935102477669716, + "rewards/margins": 0.10451234877109528, + "rewards/rejected": -0.12044744193553925, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 1.1437908496732026e-06, + "logits/chosen": -1.8734920024871826, + "logits/rejected": -1.7983553409576416, + "logps/chosen": -278.6947326660156, + "logps/rejected": -284.36004638671875, + "loss": 0.6902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.010003168135881424, + "rewards/margins": 0.0766625627875328, + "rewards/rejected": -0.08666571974754333, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 1.2254901960784314e-06, + "logits/chosen": -2.148531436920166, + "logits/rejected": -1.853938102722168, + "logps/chosen": -203.0059814453125, + "logps/rejected": -157.02220153808594, + "loss": 0.6915, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.003481155727058649, + "rewards/margins": 0.044050782918930054, + "rewards/rejected": -0.047531940042972565, + "step": 150 + }, + { + "epoch": 0.03, + "learning_rate": 1.3071895424836604e-06, + "logits/chosen": -1.930214285850525, + "logits/rejected": -1.9383434057235718, + "logps/chosen": -220.7246551513672, + "logps/rejected": -232.9812774658203, + "loss": 0.6903, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.007492788136005402, + "rewards/margins": 0.05902974680066109, + "rewards/rejected": -0.06652253121137619, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 1.3888888888888892e-06, + "logits/chosen": -2.112050771713257, + "logits/rejected": -2.001424789428711, + "logps/chosen": -267.34613037109375, + "logps/rejected": -274.67901611328125, + "loss": 0.6884, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02323351986706257, + "rewards/margins": 0.08129237592220306, + "rewards/rejected": -0.10452590137720108, + "step": 170 + }, + { + "epoch": 0.03, + "learning_rate": 1.4705882352941177e-06, + "logits/chosen": -2.0084445476531982, + "logits/rejected": -1.8466752767562866, + "logps/chosen": -271.5625, + "logps/rejected": -282.74951171875, + "loss": 0.6916, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03647018224000931, + "rewards/margins": 0.0640491470694542, + "rewards/rejected": -0.1005193218588829, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 1.5522875816993465e-06, + "logits/chosen": -2.072286367416382, + "logits/rejected": -1.8907339572906494, + "logps/chosen": -224.926025390625, + "logps/rejected": -233.1913604736328, + "loss": 0.6893, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0047895037569105625, + "rewards/margins": 0.07405461370944977, + "rewards/rejected": -0.0788441151380539, + "step": 190 + }, + { + "epoch": 0.03, + "learning_rate": 1.6339869281045753e-06, + "logits/chosen": -2.1158788204193115, + "logits/rejected": -2.0300536155700684, + "logps/chosen": -201.88473510742188, + "logps/rejected": -211.41372680664062, + "loss": 0.69, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.03420134633779526, + "rewards/margins": 0.04929971694946289, + "rewards/rejected": -0.08350107818841934, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 1.715686274509804e-06, + "logits/chosen": -2.15541672706604, + "logits/rejected": -1.8206045627593994, + "logps/chosen": -315.76837158203125, + "logps/rejected": -234.51004028320312, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013131176121532917, + "rewards/margins": 0.05592525005340576, + "rewards/rejected": -0.069056436419487, + "step": 210 + }, + { + "epoch": 0.04, + "learning_rate": 1.7973856209150328e-06, + "logits/chosen": -2.0246963500976562, + "logits/rejected": -2.030097246170044, + "logps/chosen": -242.3164520263672, + "logps/rejected": -282.59051513671875, + "loss": 0.6891, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.003397223772481084, + "rewards/margins": 0.09776968508958817, + "rewards/rejected": -0.10116690397262573, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 1.8790849673202616e-06, + "logits/chosen": -2.0559966564178467, + "logits/rejected": -1.8761097192764282, + "logps/chosen": -248.9442901611328, + "logps/rejected": -265.4543151855469, + "loss": 0.692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.011411285027861595, + "rewards/margins": 0.05242248624563217, + "rewards/rejected": -0.04101119562983513, + "step": 230 + }, + { + "epoch": 0.04, + "learning_rate": 1.96078431372549e-06, + "logits/chosen": -2.046117067337036, + "logits/rejected": -1.8383557796478271, + "logps/chosen": -260.7398986816406, + "logps/rejected": -231.1858673095703, + "loss": 0.6932, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.013742564246058464, + "rewards/margins": 0.026972660794854164, + "rewards/rejected": -0.013230097480118275, + "step": 240 + }, + { + "epoch": 0.04, + "learning_rate": 2.042483660130719e-06, + "logits/chosen": -1.9848926067352295, + "logits/rejected": -2.002385139465332, + "logps/chosen": -212.72604370117188, + "logps/rejected": -233.0819091796875, + "loss": 0.6908, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007447829004377127, + "rewards/margins": 0.0746130421757698, + "rewards/rejected": -0.08206087350845337, + "step": 250 + }, + { + "epoch": 0.04, + "learning_rate": 2.1241830065359477e-06, + "logits/chosen": -1.9120912551879883, + "logits/rejected": -1.954754114151001, + "logps/chosen": -269.4970397949219, + "logps/rejected": -298.57086181640625, + "loss": 0.69, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.020706243813037872, + "rewards/margins": 0.054516397416591644, + "rewards/rejected": -0.03381015360355377, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 2.2058823529411767e-06, + "logits/chosen": -2.0166051387786865, + "logits/rejected": -1.7340428829193115, + "logps/chosen": -248.164794921875, + "logps/rejected": -208.91781616210938, + "loss": 0.691, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.009155301377177238, + "rewards/margins": 0.05221142619848251, + "rewards/rejected": -0.04305613040924072, + "step": 270 + }, + { + "epoch": 0.05, + "learning_rate": 2.2875816993464053e-06, + "logits/chosen": -2.0367040634155273, + "logits/rejected": -1.8216432332992554, + "logps/chosen": -252.8002471923828, + "logps/rejected": -261.8016357421875, + "loss": 0.6884, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03925459459424019, + "rewards/margins": 0.1245487779378891, + "rewards/rejected": -0.16380338370800018, + "step": 280 + }, + { + "epoch": 0.05, + "learning_rate": 2.3692810457516342e-06, + "logits/chosen": -1.9527132511138916, + "logits/rejected": -1.8044891357421875, + "logps/chosen": -217.4779815673828, + "logps/rejected": -206.2652130126953, + "loss": 0.6921, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.018538698554039, + "rewards/margins": 0.08644220978021622, + "rewards/rejected": -0.10498090088367462, + "step": 290 + }, + { + "epoch": 0.05, + "learning_rate": 2.450980392156863e-06, + "logits/chosen": -2.042166233062744, + "logits/rejected": -1.9893741607666016, + "logps/chosen": -289.8385925292969, + "logps/rejected": -284.45245361328125, + "loss": 0.692, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.027700364589691162, + "rewards/margins": 0.0368027426302433, + "rewards/rejected": -0.00910237617790699, + "step": 300 + }, + { + "epoch": 0.05, + "learning_rate": 2.532679738562092e-06, + "logits/chosen": -1.9160455465316772, + "logits/rejected": -1.9234968423843384, + "logps/chosen": -227.7736358642578, + "logps/rejected": -228.32583618164062, + "loss": 0.6902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.022115562111139297, + "rewards/margins": 0.05109254643321037, + "rewards/rejected": -0.028976986184716225, + "step": 310 + }, + { + "epoch": 0.05, + "learning_rate": 2.6143790849673208e-06, + "logits/chosen": -2.052905559539795, + "logits/rejected": -1.8477401733398438, + "logps/chosen": -239.7154541015625, + "logps/rejected": -240.03164672851562, + "loss": 0.6916, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02235419675707817, + "rewards/margins": 0.04038622975349426, + "rewards/rejected": -0.018032027408480644, + "step": 320 + }, + { + "epoch": 0.05, + "learning_rate": 2.696078431372549e-06, + "logits/chosen": -2.0068612098693848, + "logits/rejected": -1.941676378250122, + "logps/chosen": -189.1168212890625, + "logps/rejected": -204.4237518310547, + "loss": 0.6895, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -7.205530710052699e-05, + "rewards/margins": 0.09253031760454178, + "rewards/rejected": -0.09260235726833344, + "step": 330 + }, + { + "epoch": 0.06, + "learning_rate": 2.7777777777777783e-06, + "logits/chosen": -1.876878023147583, + "logits/rejected": -1.839035987854004, + "logps/chosen": -281.45367431640625, + "logps/rejected": -308.483642578125, + "loss": 0.6911, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.000742577773053199, + "rewards/margins": 0.08648402988910675, + "rewards/rejected": -0.08574144542217255, + "step": 340 + }, + { + "epoch": 0.06, + "learning_rate": 2.8594771241830065e-06, + "logits/chosen": -2.0031285285949707, + "logits/rejected": -1.6795644760131836, + "logps/chosen": -277.67987060546875, + "logps/rejected": -245.172607421875, + "loss": 0.6905, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.013115936890244484, + "rewards/margins": 0.0826214924454689, + "rewards/rejected": -0.06950554996728897, + "step": 350 + }, + { + "epoch": 0.06, + "learning_rate": 2.9411764705882355e-06, + "logits/chosen": -1.9802716970443726, + "logits/rejected": -1.8036832809448242, + "logps/chosen": -262.0284729003906, + "logps/rejected": -263.77398681640625, + "loss": 0.6898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.018712202087044716, + "rewards/margins": 0.07143567502498627, + "rewards/rejected": -0.0527234748005867, + "step": 360 + }, + { + "epoch": 0.06, + "learning_rate": 3.022875816993464e-06, + "logits/chosen": -2.1279966831207275, + "logits/rejected": -1.8910629749298096, + "logps/chosen": -219.071044921875, + "logps/rejected": -208.7861785888672, + "loss": 0.6903, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.01627056859433651, + "rewards/margins": 0.07586422562599182, + "rewards/rejected": -0.05959365889430046, + "step": 370 + }, + { + "epoch": 0.06, + "learning_rate": 3.104575163398693e-06, + "logits/chosen": -2.102811098098755, + "logits/rejected": -1.923095941543579, + "logps/chosen": -328.1366271972656, + "logps/rejected": -309.04833984375, + "loss": 0.691, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.007027629762887955, + "rewards/margins": 0.09556975215673447, + "rewards/rejected": -0.08854212611913681, + "step": 380 + }, + { + "epoch": 0.06, + "learning_rate": 3.1862745098039216e-06, + "logits/chosen": -2.019583225250244, + "logits/rejected": -1.9006500244140625, + "logps/chosen": -264.610107421875, + "logps/rejected": -236.01840209960938, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00022241678379941732, + "rewards/margins": 0.008316442370414734, + "rewards/rejected": -0.008538859896361828, + "step": 390 + }, + { + "epoch": 0.07, + "learning_rate": 3.2679738562091506e-06, + "logits/chosen": -1.8842191696166992, + "logits/rejected": -1.7512710094451904, + "logps/chosen": -166.9261016845703, + "logps/rejected": -183.53060913085938, + "loss": 0.6895, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.015238839201629162, + "rewards/margins": 0.0635746493935585, + "rewards/rejected": -0.04833581671118736, + "step": 400 + }, + { + "epoch": 0.07, + "learning_rate": 3.349673202614379e-06, + "logits/chosen": -1.8863931894302368, + "logits/rejected": -1.8892552852630615, + "logps/chosen": -189.72776794433594, + "logps/rejected": -194.29891967773438, + "loss": 0.6888, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.029690301045775414, + "rewards/margins": 0.10951326787471771, + "rewards/rejected": -0.07982297241687775, + "step": 410 + }, + { + "epoch": 0.07, + "learning_rate": 3.431372549019608e-06, + "logits/chosen": -2.127944231033325, + "logits/rejected": -1.882693886756897, + "logps/chosen": -342.33843994140625, + "logps/rejected": -320.3702087402344, + "loss": 0.6901, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.016823817044496536, + "rewards/margins": 0.10748317092657089, + "rewards/rejected": -0.09065935760736465, + "step": 420 + }, + { + "epoch": 0.07, + "learning_rate": 3.5130718954248367e-06, + "logits/chosen": -1.8037595748901367, + "logits/rejected": -1.818001389503479, + "logps/chosen": -274.9898681640625, + "logps/rejected": -282.52069091796875, + "loss": 0.6907, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04266976937651634, + "rewards/margins": 0.056666333228349686, + "rewards/rejected": -0.013996558263897896, + "step": 430 + }, + { + "epoch": 0.07, + "learning_rate": 3.5947712418300657e-06, + "logits/chosen": -1.9084842205047607, + "logits/rejected": -1.6918470859527588, + "logps/chosen": -204.62680053710938, + "logps/rejected": -192.6732940673828, + "loss": 0.6898, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02925197407603264, + "rewards/margins": 0.06457007676362991, + "rewards/rejected": -0.03531809523701668, + "step": 440 + }, + { + "epoch": 0.07, + "learning_rate": 3.6764705882352946e-06, + "logits/chosen": -1.7819035053253174, + "logits/rejected": -1.910544753074646, + "logps/chosen": -204.4004669189453, + "logps/rejected": -259.67498779296875, + "loss": 0.6885, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.019478099420666695, + "rewards/margins": 0.10720425844192505, + "rewards/rejected": -0.0877261608839035, + "step": 450 + }, + { + "epoch": 0.08, + "learning_rate": 3.758169934640523e-06, + "logits/chosen": -1.7165343761444092, + "logits/rejected": -1.6978862285614014, + "logps/chosen": -209.5369110107422, + "logps/rejected": -250.46908569335938, + "loss": 0.6893, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05315498262643814, + "rewards/margins": 0.1209985613822937, + "rewards/rejected": -0.17415353655815125, + "step": 460 + }, + { + "epoch": 0.08, + "learning_rate": 3.839869281045752e-06, + "logits/chosen": -1.9889190196990967, + "logits/rejected": -1.909375548362732, + "logps/chosen": -266.21905517578125, + "logps/rejected": -259.3236083984375, + "loss": 0.6923, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.02212306298315525, + "rewards/margins": 0.05675049498677254, + "rewards/rejected": -0.07887355983257294, + "step": 470 + }, + { + "epoch": 0.08, + "learning_rate": 3.92156862745098e-06, + "logits/chosen": -1.984463095664978, + "logits/rejected": -1.932708978652954, + "logps/chosen": -242.6595001220703, + "logps/rejected": -215.62576293945312, + "loss": 0.6916, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.010744850151240826, + "rewards/margins": 0.0793093889951706, + "rewards/rejected": -0.0685645341873169, + "step": 480 + }, + { + "epoch": 0.08, + "learning_rate": 4.00326797385621e-06, + "logits/chosen": -2.038952350616455, + "logits/rejected": -2.078556776046753, + "logps/chosen": -256.6618957519531, + "logps/rejected": -302.7876281738281, + "loss": 0.6892, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.013893517665565014, + "rewards/margins": 0.07112707197666168, + "rewards/rejected": -0.057233553379774094, + "step": 490 + }, + { + "epoch": 0.08, + "learning_rate": 4.084967320261438e-06, + "logits/chosen": -1.8132222890853882, + "logits/rejected": -1.8805878162384033, + "logps/chosen": -189.75491333007812, + "logps/rejected": -249.66567993164062, + "loss": 0.6884, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01711181178689003, + "rewards/margins": 0.10426501929759979, + "rewards/rejected": -0.12137682735919952, + "step": 500 + }, + { + "epoch": 0.08, + "learning_rate": 4.166666666666667e-06, + "logits/chosen": -2.306424140930176, + "logits/rejected": -1.9006808996200562, + "logps/chosen": -356.439208984375, + "logps/rejected": -292.4924621582031, + "loss": 0.6901, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01603344827890396, + "rewards/margins": 0.0812697485089302, + "rewards/rejected": -0.06523631513118744, + "step": 510 + }, + { + "epoch": 0.09, + "learning_rate": 4.2483660130718954e-06, + "logits/chosen": -1.865846037864685, + "logits/rejected": -1.7648818492889404, + "logps/chosen": -266.7701416015625, + "logps/rejected": -235.7229461669922, + "loss": 0.6865, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03805357962846756, + "rewards/margins": 0.14672335982322693, + "rewards/rejected": -0.1847769320011139, + "step": 520 + }, + { + "epoch": 0.09, + "learning_rate": 4.330065359477125e-06, + "logits/chosen": -2.0248360633850098, + "logits/rejected": -2.0557875633239746, + "logps/chosen": -230.93197631835938, + "logps/rejected": -238.7276153564453, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04835163429379463, + "rewards/margins": 0.07609757781028748, + "rewards/rejected": -0.12444920837879181, + "step": 530 + }, + { + "epoch": 0.09, + "learning_rate": 4.411764705882353e-06, + "logits/chosen": -1.92678701877594, + "logits/rejected": -1.7919566631317139, + "logps/chosen": -256.6742858886719, + "logps/rejected": -257.69573974609375, + "loss": 0.6872, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.047441355884075165, + "rewards/margins": 0.10895486176013947, + "rewards/rejected": -0.15639621019363403, + "step": 540 + }, + { + "epoch": 0.09, + "learning_rate": 4.493464052287582e-06, + "logits/chosen": -2.0128042697906494, + "logits/rejected": -1.902692437171936, + "logps/chosen": -301.0126953125, + "logps/rejected": -302.93560791015625, + "loss": 0.6917, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0794796571135521, + "rewards/margins": 0.08260070532560349, + "rewards/rejected": -0.16208036243915558, + "step": 550 + }, + { + "epoch": 0.09, + "learning_rate": 4.5751633986928105e-06, + "logits/chosen": -1.7864646911621094, + "logits/rejected": -1.818052053451538, + "logps/chosen": -266.69219970703125, + "logps/rejected": -297.9820861816406, + "loss": 0.6893, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05527070164680481, + "rewards/margins": 0.122100330889225, + "rewards/rejected": -0.17737102508544922, + "step": 560 + }, + { + "epoch": 0.09, + "learning_rate": 4.65686274509804e-06, + "logits/chosen": -1.7135837078094482, + "logits/rejected": -1.8005495071411133, + "logps/chosen": -205.53634643554688, + "logps/rejected": -268.8668212890625, + "loss": 0.6907, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0064945342019200325, + "rewards/margins": 0.09055610001087189, + "rewards/rejected": -0.09705062210559845, + "step": 570 + }, + { + "epoch": 0.09, + "learning_rate": 4.7385620915032685e-06, + "logits/chosen": -1.8014023303985596, + "logits/rejected": -1.811103105545044, + "logps/chosen": -254.2625274658203, + "logps/rejected": -271.00738525390625, + "loss": 0.6894, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.023499321192502975, + "rewards/margins": 0.09068706631660461, + "rewards/rejected": -0.11418638378381729, + "step": 580 + }, + { + "epoch": 0.1, + "learning_rate": 4.820261437908497e-06, + "logits/chosen": -1.8691829442977905, + "logits/rejected": -1.6062300205230713, + "logps/chosen": -252.29086303710938, + "logps/rejected": -236.46194458007812, + "loss": 0.6913, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05803394317626953, + "rewards/margins": 0.028398144990205765, + "rewards/rejected": -0.086432084441185, + "step": 590 + }, + { + "epoch": 0.1, + "learning_rate": 4.901960784313726e-06, + "logits/chosen": -2.005988597869873, + "logits/rejected": -1.739748239517212, + "logps/chosen": -321.431396484375, + "logps/rejected": -289.0257873535156, + "loss": 0.6891, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.003983601462095976, + "rewards/margins": 0.07740481942892075, + "rewards/rejected": -0.0813884288072586, + "step": 600 + }, + { + "epoch": 0.1, + "learning_rate": 4.983660130718955e-06, + "logits/chosen": -1.8958543539047241, + "logits/rejected": -1.9557549953460693, + "logps/chosen": -227.0845184326172, + "logps/rejected": -257.05450439453125, + "loss": 0.6894, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.024518854916095734, + "rewards/margins": 0.09726980328559875, + "rewards/rejected": -0.12178865820169449, + "step": 610 + }, + { + "epoch": 0.1, + "learning_rate": 4.999973908101102e-06, + "logits/chosen": -2.0896029472351074, + "logits/rejected": -1.9701532125473022, + "logps/chosen": -290.30926513671875, + "logps/rejected": -278.406005859375, + "loss": 0.6903, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03560750186443329, + "rewards/margins": 0.10298409312963486, + "rewards/rejected": -0.06737658381462097, + "step": 620 + }, + { + "epoch": 0.1, + "learning_rate": 4.999867910695249e-06, + "logits/chosen": -1.781288743019104, + "logits/rejected": -1.8756494522094727, + "logps/chosen": -261.6845397949219, + "logps/rejected": -291.03826904296875, + "loss": 0.6914, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.041419465094804764, + "rewards/margins": 0.08097013086080551, + "rewards/rejected": -0.03955066204071045, + "step": 630 + }, + { + "epoch": 0.1, + "learning_rate": 4.9996803804931885e-06, + "logits/chosen": -2.070525884628296, + "logits/rejected": -1.7746680974960327, + "logps/chosen": -344.9931945800781, + "logps/rejected": -242.26437377929688, + "loss": 0.6911, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.011725488118827343, + "rewards/margins": 0.021290883421897888, + "rewards/rejected": -0.033016376197338104, + "step": 640 + }, + { + "epoch": 0.11, + "learning_rate": 4.99941132361119e-06, + "logits/chosen": -2.004568099975586, + "logits/rejected": -1.8413000106811523, + "logps/chosen": -269.6329040527344, + "logps/rejected": -238.25546264648438, + "loss": 0.6888, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04259689897298813, + "rewards/margins": 0.09798689931631088, + "rewards/rejected": -0.055390000343322754, + "step": 650 + }, + { + "epoch": 0.11, + "learning_rate": 4.9990607488245e-06, + "logits/chosen": -2.054567813873291, + "logits/rejected": -1.8265924453735352, + "logps/chosen": -263.1630554199219, + "logps/rejected": -268.4882507324219, + "loss": 0.6875, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.03993896767497063, + "rewards/margins": 0.11784611642360687, + "rewards/rejected": -0.07790714502334595, + "step": 660 + }, + { + "epoch": 0.11, + "learning_rate": 4.998628667567061e-06, + "logits/chosen": -1.9703295230865479, + "logits/rejected": -1.5748050212860107, + "logps/chosen": -232.3702850341797, + "logps/rejected": -173.08901977539062, + "loss": 0.6902, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.030246425420045853, + "rewards/margins": 0.08843965828418732, + "rewards/rejected": -0.05819323658943176, + "step": 670 + }, + { + "epoch": 0.11, + "learning_rate": 4.998115093931133e-06, + "logits/chosen": -1.7992169857025146, + "logits/rejected": -1.7200311422348022, + "logps/chosen": -240.78414916992188, + "logps/rejected": -255.3190155029297, + "loss": 0.6892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.012486319988965988, + "rewards/margins": 0.10194554179906845, + "rewards/rejected": -0.08945922553539276, + "step": 680 + }, + { + "epoch": 0.11, + "learning_rate": 4.9975200446668405e-06, + "logits/chosen": -1.8173624277114868, + "logits/rejected": -1.7265287637710571, + "logps/chosen": -238.39541625976562, + "logps/rejected": -283.1929626464844, + "loss": 0.6891, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.006398810539394617, + "rewards/margins": 0.11303986608982086, + "rewards/rejected": -0.10664103925228119, + "step": 690 + }, + { + "epoch": 0.11, + "learning_rate": 4.99684353918162e-06, + "logits/chosen": -1.9383537769317627, + "logits/rejected": -1.6988528966903687, + "logps/chosen": -217.1807098388672, + "logps/rejected": -219.0244598388672, + "loss": 0.6899, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.006116027943789959, + "rewards/margins": 0.10280124843120575, + "rewards/rejected": -0.09668521583080292, + "step": 700 + }, + { + "epoch": 0.12, + "learning_rate": 4.996085599539591e-06, + "logits/chosen": -1.9807647466659546, + "logits/rejected": -1.632709264755249, + "logps/chosen": -295.69256591796875, + "logps/rejected": -240.35525512695312, + "loss": 0.6885, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0028353859670460224, + "rewards/margins": 0.10749323666095734, + "rewards/rejected": -0.1046578511595726, + "step": 710 + }, + { + "epoch": 0.12, + "learning_rate": 4.995246250460835e-06, + "logits/chosen": -1.9093172550201416, + "logits/rejected": -1.6945396661758423, + "logps/chosen": -273.8838195800781, + "logps/rejected": -276.8046569824219, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02449687384068966, + "rewards/margins": 0.10259126126766205, + "rewards/rejected": -0.12708814442157745, + "step": 720 + }, + { + "epoch": 0.12, + "learning_rate": 4.99432551932059e-06, + "logits/chosen": -1.7671747207641602, + "logits/rejected": -1.7000166177749634, + "logps/chosen": -244.59732055664062, + "logps/rejected": -260.4505310058594, + "loss": 0.6879, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.009170468896627426, + "rewards/margins": 0.141859769821167, + "rewards/rejected": -0.13268931210041046, + "step": 730 + }, + { + "epoch": 0.12, + "learning_rate": 4.993323436148355e-06, + "logits/chosen": -2.115560531616211, + "logits/rejected": -1.992264986038208, + "logps/chosen": -280.55072021484375, + "logps/rejected": -292.7775573730469, + "loss": 0.6873, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0003127576783299446, + "rewards/margins": 0.11162249743938446, + "rewards/rejected": -0.11130975186824799, + "step": 740 + }, + { + "epoch": 0.12, + "learning_rate": 4.9922400336269154e-06, + "logits/chosen": -1.9589831829071045, + "logits/rejected": -1.7921831607818604, + "logps/chosen": -274.3896789550781, + "logps/rejected": -268.0859680175781, + "loss": 0.687, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.005586009938269854, + "rewards/margins": 0.1298276036977768, + "rewards/rejected": -0.12424159049987793, + "step": 750 + }, + { + "epoch": 0.12, + "learning_rate": 4.991075347091273e-06, + "logits/chosen": -1.7477144002914429, + "logits/rejected": -1.785167932510376, + "logps/chosen": -203.60494995117188, + "logps/rejected": -236.4550018310547, + "loss": 0.6916, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.005796524696052074, + "rewards/margins": 0.0926826149225235, + "rewards/rejected": -0.09847913682460785, + "step": 760 + }, + { + "epoch": 0.13, + "learning_rate": 4.9898294145274926e-06, + "logits/chosen": -1.8766326904296875, + "logits/rejected": -1.7080796957015991, + "logps/chosen": -265.076904296875, + "logps/rejected": -453.3878479003906, + "loss": 0.6849, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0015633024740964174, + "rewards/margins": 0.14012418687343597, + "rewards/rejected": -0.13856089115142822, + "step": 770 + }, + { + "epoch": 0.13, + "learning_rate": 4.988502276571471e-06, + "logits/chosen": -1.9038374423980713, + "logits/rejected": -1.6458237171173096, + "logps/chosen": -233.3724365234375, + "logps/rejected": -210.854248046875, + "loss": 0.6918, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.00027247294201515615, + "rewards/margins": 0.05894937366247177, + "rewards/rejected": -0.059221845120191574, + "step": 780 + }, + { + "epoch": 0.13, + "learning_rate": 4.9870939765076e-06, + "logits/chosen": -1.983919382095337, + "logits/rejected": -2.041472911834717, + "logps/chosen": -284.9762878417969, + "logps/rejected": -290.635009765625, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.022699173539876938, + "rewards/margins": 0.0448235347867012, + "rewards/rejected": -0.06752269715070724, + "step": 790 + }, + { + "epoch": 0.13, + "learning_rate": 4.985604560267363e-06, + "logits/chosen": -1.9209531545639038, + "logits/rejected": -1.8662135601043701, + "logps/chosen": -254.82235717773438, + "logps/rejected": -283.30120849609375, + "loss": 0.6885, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.020086321979761124, + "rewards/margins": 0.08462269604206085, + "rewards/rejected": -0.06453637778759003, + "step": 800 + }, + { + "epoch": 0.13, + "learning_rate": 4.984034076427838e-06, + "logits/chosen": -2.0287258625030518, + "logits/rejected": -1.7489858865737915, + "logps/chosen": -232.81179809570312, + "logps/rejected": -228.3260498046875, + "loss": 0.6915, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.013652363792061806, + "rewards/margins": 0.08594223856925964, + "rewards/rejected": -0.07228987663984299, + "step": 810 + }, + { + "epoch": 0.13, + "learning_rate": 4.982382576210103e-06, + "logits/chosen": -1.7876205444335938, + "logits/rejected": -1.795292854309082, + "logps/chosen": -289.850830078125, + "logps/rejected": -276.88897705078125, + "loss": 0.6923, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.024563539773225784, + "rewards/margins": 0.11165276914834976, + "rewards/rejected": -0.08708924055099487, + "step": 820 + }, + { + "epoch": 0.14, + "learning_rate": 4.9806501134775786e-06, + "logits/chosen": -1.977622628211975, + "logits/rejected": -1.7702114582061768, + "logps/chosen": -240.52053833007812, + "logps/rejected": -260.08734130859375, + "loss": 0.6918, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.018454570323228836, + "rewards/margins": 0.06337157636880875, + "rewards/rejected": -0.04491700604557991, + "step": 830 + }, + { + "epoch": 0.14, + "learning_rate": 4.9788367447342615e-06, + "logits/chosen": -1.9930822849273682, + "logits/rejected": -1.8632118701934814, + "logps/chosen": -223.88595581054688, + "logps/rejected": -231.3230743408203, + "loss": 0.6908, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.031216461211442947, + "rewards/margins": 0.06415192037820816, + "rewards/rejected": -0.032935455441474915, + "step": 840 + }, + { + "epoch": 0.14, + "learning_rate": 4.976942529122887e-06, + "logits/chosen": -1.910666823387146, + "logits/rejected": -1.8803300857543945, + "logps/chosen": -197.60812377929688, + "logps/rejected": -221.9577178955078, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.029210612177848816, + "rewards/margins": 0.07380051910877228, + "rewards/rejected": -0.044589899480342865, + "step": 850 + }, + { + "epoch": 0.14, + "learning_rate": 4.974967528422997e-06, + "logits/chosen": -1.8911033868789673, + "logits/rejected": -1.6391998529434204, + "logps/chosen": -284.83349609375, + "logps/rejected": -231.6648406982422, + "loss": 0.6899, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.018184982240200043, + "rewards/margins": 0.079526886343956, + "rewards/rejected": -0.06134190410375595, + "step": 860 + }, + { + "epoch": 0.14, + "learning_rate": 4.972911807048927e-06, + "logits/chosen": -2.0956811904907227, + "logits/rejected": -1.719270944595337, + "logps/chosen": -345.12445068359375, + "logps/rejected": -277.8197937011719, + "loss": 0.6904, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04746173322200775, + "rewards/margins": 0.11509747803211212, + "rewards/rejected": -0.16255921125411987, + "step": 870 + }, + { + "epoch": 0.14, + "learning_rate": 4.970775432047704e-06, + "logits/chosen": -2.0564563274383545, + "logits/rejected": -1.9209903478622437, + "logps/chosen": -259.9346008300781, + "logps/rejected": -282.9585266113281, + "loss": 0.6878, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.016783785074949265, + "rewards/margins": 0.10451418161392212, + "rewards/rejected": -0.12129797786474228, + "step": 880 + }, + { + "epoch": 0.15, + "learning_rate": 4.9685584730968605e-06, + "logits/chosen": -1.7383397817611694, + "logits/rejected": -1.8818151950836182, + "logps/chosen": -234.910400390625, + "logps/rejected": -303.4265441894531, + "loss": 0.6909, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.006733441259711981, + "rewards/margins": 0.09848129749298096, + "rewards/rejected": -0.10521472990512848, + "step": 890 + }, + { + "epoch": 0.15, + "learning_rate": 4.966261002502162e-06, + "logits/chosen": -1.932641625404358, + "logits/rejected": -1.7558002471923828, + "logps/chosen": -228.53018188476562, + "logps/rejected": -231.5489501953125, + "loss": 0.6852, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.020324600860476494, + "rewards/margins": 0.17512337863445282, + "rewards/rejected": -0.15479877591133118, + "step": 900 + }, + { + "epoch": 0.15, + "learning_rate": 4.963883095195248e-06, + "logits/chosen": -2.1068947315216064, + "logits/rejected": -1.8657506704330444, + "logps/chosen": -250.7617950439453, + "logps/rejected": -238.71334838867188, + "loss": 0.6918, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.013454620726406574, + "rewards/margins": 0.07133384793996811, + "rewards/rejected": -0.08478846400976181, + "step": 910 + }, + { + "epoch": 0.15, + "learning_rate": 4.961424828731188e-06, + "logits/chosen": -2.041858673095703, + "logits/rejected": -1.9689133167266846, + "logps/chosen": -259.1144714355469, + "logps/rejected": -262.7490539550781, + "loss": 0.6873, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04987924173474312, + "rewards/margins": 0.08842237293720245, + "rewards/rejected": -0.038543134927749634, + "step": 920 + }, + { + "epoch": 0.15, + "learning_rate": 4.958886283285956e-06, + "logits/chosen": -2.016230583190918, + "logits/rejected": -1.9319770336151123, + "logps/chosen": -204.9761962890625, + "logps/rejected": -226.80081176757812, + "loss": 0.6867, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07094229757785797, + "rewards/margins": 0.1403259038925171, + "rewards/rejected": -0.06938360631465912, + "step": 930 + }, + { + "epoch": 0.15, + "learning_rate": 4.956267541653808e-06, + "logits/chosen": -1.988416314125061, + "logits/rejected": -1.9086863994598389, + "logps/chosen": -220.4031524658203, + "logps/rejected": -230.15771484375, + "loss": 0.688, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.031581003218889236, + "rewards/margins": 0.11606631428003311, + "rewards/rejected": -0.08448532223701477, + "step": 940 + }, + { + "epoch": 0.16, + "learning_rate": 4.953568689244588e-06, + "logits/chosen": -1.978438377380371, + "logits/rejected": -1.7735925912857056, + "logps/chosen": -248.2462921142578, + "logps/rejected": -258.52545166015625, + "loss": 0.6889, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04585626721382141, + "rewards/margins": 0.08934807777404785, + "rewards/rejected": -0.043491803109645844, + "step": 950 + }, + { + "epoch": 0.16, + "learning_rate": 4.9507898140809406e-06, + "logits/chosen": -2.091243267059326, + "logits/rejected": -2.0064055919647217, + "logps/chosen": -284.07525634765625, + "logps/rejected": -278.12774658203125, + "loss": 0.6931, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0369267538189888, + "rewards/margins": 0.04481780156493187, + "rewards/rejected": -0.08174455910921097, + "step": 960 + }, + { + "epoch": 0.16, + "learning_rate": 4.94793100679544e-06, + "logits/chosen": -1.8839261531829834, + "logits/rejected": -1.8292548656463623, + "logps/chosen": -305.9725341796875, + "logps/rejected": -335.5926818847656, + "loss": 0.6926, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0439421609044075, + "rewards/margins": 0.10483159124851227, + "rewards/rejected": -0.14877375960350037, + "step": 970 + }, + { + "epoch": 0.16, + "learning_rate": 4.944992360627631e-06, + "logits/chosen": -1.9397468566894531, + "logits/rejected": -1.8404371738433838, + "logps/chosen": -261.18463134765625, + "logps/rejected": -257.4517517089844, + "loss": 0.6926, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0762534886598587, + "rewards/margins": 0.07108769565820694, + "rewards/rejected": -0.14734117686748505, + "step": 980 + }, + { + "epoch": 0.16, + "learning_rate": 4.941973971420996e-06, + "logits/chosen": -2.0321812629699707, + "logits/rejected": -1.8147423267364502, + "logps/chosen": -302.6067199707031, + "logps/rejected": -260.30859375, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.014757568016648293, + "rewards/margins": 0.021887850016355515, + "rewards/rejected": -0.036645419895648956, + "step": 990 + }, + { + "epoch": 0.16, + "learning_rate": 4.9388759376198194e-06, + "logits/chosen": -1.908969521522522, + "logits/rejected": -1.7781822681427002, + "logps/chosen": -324.4129638671875, + "logps/rejected": -313.5116271972656, + "loss": 0.6896, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0005237011355347931, + "rewards/margins": 0.07187380641698837, + "rewards/rejected": -0.0713501051068306, + "step": 1000 + }, + { + "epoch": 0.17, + "learning_rate": 4.935698360265984e-06, + "logits/chosen": -1.799020528793335, + "logits/rejected": -1.9150434732437134, + "logps/chosen": -259.3772888183594, + "logps/rejected": -318.16278076171875, + "loss": 0.6913, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.003594130277633667, + "rewards/margins": 0.05621428042650223, + "rewards/rejected": -0.059808410704135895, + "step": 1010 + }, + { + "epoch": 0.17, + "learning_rate": 4.932441342995671e-06, + "logits/chosen": -1.7904123067855835, + "logits/rejected": -1.8439342975616455, + "logps/chosen": -209.40402221679688, + "logps/rejected": -221.0392608642578, + "loss": 0.6911, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.016126811504364014, + "rewards/margins": 0.05908411741256714, + "rewards/rejected": -0.07521092146635056, + "step": 1020 + }, + { + "epoch": 0.17, + "learning_rate": 4.929104992035985e-06, + "logits/chosen": -2.1821420192718506, + "logits/rejected": -1.7902252674102783, + "logps/chosen": -282.2015686035156, + "logps/rejected": -225.5486297607422, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.014751900918781757, + "rewards/margins": 0.07297666370868683, + "rewards/rejected": -0.08772855997085571, + "step": 1030 + }, + { + "epoch": 0.17, + "learning_rate": 4.9256894162014836e-06, + "logits/chosen": -2.025057792663574, + "logits/rejected": -1.8538585901260376, + "logps/chosen": -295.452880859375, + "logps/rejected": -262.1081237792969, + "loss": 0.6895, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.024811143055558205, + "rewards/margins": 0.08870936930179596, + "rewards/rejected": -0.11352050304412842, + "step": 1040 + }, + { + "epoch": 0.17, + "learning_rate": 4.922194726890631e-06, + "logits/chosen": -2.1063590049743652, + "logits/rejected": -1.982398271560669, + "logps/chosen": -225.9454803466797, + "logps/rejected": -230.10397338867188, + "loss": 0.6903, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03644616901874542, + "rewards/margins": 0.061360638588666916, + "rewards/rejected": -0.09780679643154144, + "step": 1050 + }, + { + "epoch": 0.17, + "learning_rate": 4.918621038082168e-06, + "logits/chosen": -1.912010908126831, + "logits/rejected": -2.078324556350708, + "logps/chosen": -192.97293090820312, + "logps/rejected": -255.25393676757812, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0213629137724638, + "rewards/margins": 0.11235551536083221, + "rewards/rejected": -0.13371846079826355, + "step": 1060 + }, + { + "epoch": 0.18, + "learning_rate": 4.914968466331388e-06, + "logits/chosen": -2.1915786266326904, + "logits/rejected": -1.789747953414917, + "logps/chosen": -273.31768798828125, + "logps/rejected": -239.19259643554688, + "loss": 0.6902, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0006018438143655658, + "rewards/margins": 0.12897200882434845, + "rewards/rejected": -0.12837016582489014, + "step": 1070 + }, + { + "epoch": 0.18, + "learning_rate": 4.911237130766341e-06, + "logits/chosen": -1.8552300930023193, + "logits/rejected": -1.6933231353759766, + "logps/chosen": -250.517333984375, + "logps/rejected": -230.78469848632812, + "loss": 0.6863, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.012648197822272778, + "rewards/margins": 0.08683465421199799, + "rewards/rejected": -0.07418645918369293, + "step": 1080 + }, + { + "epoch": 0.18, + "learning_rate": 4.907427153083945e-06, + "logits/chosen": -1.9735358953475952, + "logits/rejected": -1.8760576248168945, + "logps/chosen": -230.2898712158203, + "logps/rejected": -249.6136474609375, + "loss": 0.6891, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.002240245696157217, + "rewards/margins": 0.0894870012998581, + "rewards/rejected": -0.09172724932432175, + "step": 1090 + }, + { + "epoch": 0.18, + "learning_rate": 4.903538657546019e-06, + "logits/chosen": -2.074902057647705, + "logits/rejected": -1.9013715982437134, + "logps/chosen": -260.36065673828125, + "logps/rejected": -269.2521057128906, + "loss": 0.692, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.025573575869202614, + "rewards/margins": 0.026423092931509018, + "rewards/rejected": -0.05199667066335678, + "step": 1100 + }, + { + "epoch": 0.18, + "learning_rate": 4.899571770975231e-06, + "logits/chosen": -2.075524091720581, + "logits/rejected": -1.9014570713043213, + "logps/chosen": -247.8839569091797, + "logps/rejected": -235.8272247314453, + "loss": 0.6907, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012155195698142052, + "rewards/margins": 0.09406516700983047, + "rewards/rejected": -0.08190996944904327, + "step": 1110 + }, + { + "epoch": 0.18, + "learning_rate": 4.895526622750958e-06, + "logits/chosen": -1.8694976568222046, + "logits/rejected": -1.8426311016082764, + "logps/chosen": -236.99276733398438, + "logps/rejected": -281.50115966796875, + "loss": 0.6883, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.005598927848041058, + "rewards/margins": 0.1126382127404213, + "rewards/rejected": -0.11823715269565582, + "step": 1120 + }, + { + "epoch": 0.18, + "learning_rate": 4.891403344805068e-06, + "logits/chosen": -1.8424018621444702, + "logits/rejected": -1.770166039466858, + "logps/chosen": -235.69754028320312, + "logps/rejected": -255.5775604248047, + "loss": 0.6866, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.028685424476861954, + "rewards/margins": 0.1630304753780365, + "rewards/rejected": -0.19171589612960815, + "step": 1130 + }, + { + "epoch": 0.19, + "learning_rate": 4.887202071617619e-06, + "logits/chosen": -1.8849172592163086, + "logits/rejected": -1.9000492095947266, + "logps/chosen": -269.3163757324219, + "logps/rejected": -302.06781005859375, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.012176516465842724, + "rewards/margins": 0.08151105046272278, + "rewards/rejected": -0.09368755668401718, + "step": 1140 + }, + { + "epoch": 0.19, + "learning_rate": 4.882922940212472e-06, + "logits/chosen": -1.9040454626083374, + "logits/rejected": -1.89935302734375, + "logps/chosen": -246.7144317626953, + "logps/rejected": -264.3641357421875, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03635886311531067, + "rewards/margins": 0.057978712022304535, + "rewards/rejected": -0.0943375676870346, + "step": 1150 + }, + { + "epoch": 0.19, + "learning_rate": 4.87856609015282e-06, + "logits/chosen": -1.7455370426177979, + "logits/rejected": -1.7498114109039307, + "logps/chosen": -210.2646942138672, + "logps/rejected": -212.57119750976562, + "loss": 0.6906, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.004301241599023342, + "rewards/margins": 0.06981071829795837, + "rewards/rejected": -0.07411196082830429, + "step": 1160 + }, + { + "epoch": 0.19, + "learning_rate": 4.874131663536637e-06, + "logits/chosen": -2.123107671737671, + "logits/rejected": -2.07108736038208, + "logps/chosen": -322.74810791015625, + "logps/rejected": -290.347412109375, + "loss": 0.6913, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.023334873840212822, + "rewards/margins": 0.08643420040607452, + "rewards/rejected": -0.06309932470321655, + "step": 1170 + }, + { + "epoch": 0.19, + "learning_rate": 4.869619804992046e-06, + "logits/chosen": -1.8806129693984985, + "logits/rejected": -1.8797521591186523, + "logps/chosen": -199.497802734375, + "logps/rejected": -234.2181396484375, + "loss": 0.6894, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.005178377032279968, + "rewards/margins": 0.07378038018941879, + "rewards/rejected": -0.07895875722169876, + "step": 1180 + }, + { + "epoch": 0.19, + "learning_rate": 4.8650306616725985e-06, + "logits/chosen": -1.9584424495697021, + "logits/rejected": -1.8471086025238037, + "logps/chosen": -300.36175537109375, + "logps/rejected": -291.70989990234375, + "loss": 0.6893, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02619417943060398, + "rewards/margins": 0.09574911743402481, + "rewards/rejected": -0.12194329500198364, + "step": 1190 + }, + { + "epoch": 0.2, + "learning_rate": 4.8603643832524795e-06, + "logits/chosen": -2.018153429031372, + "logits/rejected": -1.905743956565857, + "logps/chosen": -254.3413543701172, + "logps/rejected": -244.395751953125, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.023983823135495186, + "rewards/margins": 0.10571499913930893, + "rewards/rejected": -0.12969884276390076, + "step": 1200 + }, + { + "epoch": 0.2, + "learning_rate": 4.855621121921619e-06, + "logits/chosen": -2.1856629848480225, + "logits/rejected": -1.9142640829086304, + "logps/chosen": -292.64410400390625, + "logps/rejected": -253.95553588867188, + "loss": 0.692, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03962177783250809, + "rewards/margins": 0.07444148510694504, + "rewards/rejected": -0.11406326293945312, + "step": 1210 + }, + { + "epoch": 0.2, + "learning_rate": 4.850801032380734e-06, + "logits/chosen": -2.093148946762085, + "logits/rejected": -1.7402007579803467, + "logps/chosen": -257.4886779785156, + "logps/rejected": -227.22128295898438, + "loss": 0.6904, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.05255168676376343, + "rewards/margins": 0.07043623924255371, + "rewards/rejected": -0.12298792600631714, + "step": 1220 + }, + { + "epoch": 0.2, + "learning_rate": 4.8459042718362845e-06, + "logits/chosen": -2.0653700828552246, + "logits/rejected": -1.8576633930206299, + "logps/chosen": -264.504150390625, + "logps/rejected": -230.44314575195312, + "loss": 0.6886, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0019277870887890458, + "rewards/margins": 0.09582323580980301, + "rewards/rejected": -0.09389545023441315, + "step": 1230 + }, + { + "epoch": 0.2, + "learning_rate": 4.840930999995339e-06, + "logits/chosen": -1.521937608718872, + "logits/rejected": -1.7331596612930298, + "logps/chosen": -258.92657470703125, + "logps/rejected": -302.350341796875, + "loss": 0.6895, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.039135195314884186, + "rewards/margins": 0.07316290587186813, + "rewards/rejected": -0.11229810863733292, + "step": 1240 + }, + { + "epoch": 0.2, + "learning_rate": 4.8358813790603715e-06, + "logits/chosen": -1.9663774967193604, + "logits/rejected": -1.7608230113983154, + "logps/chosen": -303.8472595214844, + "logps/rejected": -295.2870788574219, + "loss": 0.6893, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.010920828208327293, + "rewards/margins": 0.14066331088542938, + "rewards/rejected": -0.15158414840698242, + "step": 1250 + }, + { + "epoch": 0.21, + "learning_rate": 4.830755573723969e-06, + "logits/chosen": -2.015403985977173, + "logits/rejected": -1.7962592840194702, + "logps/chosen": -281.1839294433594, + "logps/rejected": -288.7331237792969, + "loss": 0.6878, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0035325761418789625, + "rewards/margins": 0.13974474370479584, + "rewards/rejected": -0.14327731728553772, + "step": 1260 + }, + { + "epoch": 0.21, + "learning_rate": 4.825553751163462e-06, + "logits/chosen": -2.1096456050872803, + "logits/rejected": -1.716141700744629, + "logps/chosen": -370.5509948730469, + "logps/rejected": -336.0577087402344, + "loss": 0.6933, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.014703398570418358, + "rewards/margins": 0.06139153242111206, + "rewards/rejected": -0.07609493285417557, + "step": 1270 + }, + { + "epoch": 0.21, + "learning_rate": 4.82027608103547e-06, + "logits/chosen": -2.0440964698791504, + "logits/rejected": -1.8985693454742432, + "logps/chosen": -270.3192443847656, + "logps/rejected": -218.48556518554688, + "loss": 0.6906, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.012839141301810741, + "rewards/margins": 0.07500091940164566, + "rewards/rejected": -0.06216178089380264, + "step": 1280 + }, + { + "epoch": 0.21, + "learning_rate": 4.814922735470368e-06, + "logits/chosen": -2.171933650970459, + "logits/rejected": -1.7564094066619873, + "logps/chosen": -272.65673828125, + "logps/rejected": -201.66278076171875, + "loss": 0.6917, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05354836583137512, + "rewards/margins": 0.06802831590175629, + "rewards/rejected": -0.01447994727641344, + "step": 1290 + }, + { + "epoch": 0.21, + "learning_rate": 4.809493889066675e-06, + "logits/chosen": -2.1286778450012207, + "logits/rejected": -1.7546037435531616, + "logps/chosen": -279.53192138671875, + "logps/rejected": -226.7566375732422, + "loss": 0.6909, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03710506856441498, + "rewards/margins": 0.02362453192472458, + "rewards/rejected": 0.013480538502335548, + "step": 1300 + }, + { + "epoch": 0.21, + "learning_rate": 4.803989718885356e-06, + "logits/chosen": -1.9331384897232056, + "logits/rejected": -1.7742788791656494, + "logps/chosen": -270.7037048339844, + "logps/rejected": -279.15155029296875, + "loss": 0.6904, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.00419805059209466, + "rewards/margins": 0.06142327934503555, + "rewards/rejected": -0.05722523853182793, + "step": 1310 + }, + { + "epoch": 0.22, + "learning_rate": 4.798410404444052e-06, + "logits/chosen": -1.9901536703109741, + "logits/rejected": -1.756842017173767, + "logps/chosen": -205.8380889892578, + "logps/rejected": -171.04379272460938, + "loss": 0.6926, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.004553773440420628, + "rewards/margins": 0.060982923954725266, + "rewards/rejected": -0.056429147720336914, + "step": 1320 + }, + { + "epoch": 0.22, + "learning_rate": 4.792756127711219e-06, + "logits/chosen": -1.9817692041397095, + "logits/rejected": -1.9081203937530518, + "logps/chosen": -242.42013549804688, + "logps/rejected": -240.75192260742188, + "loss": 0.6894, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.009349822998046875, + "rewards/margins": 0.08752413839101791, + "rewards/rejected": -0.09687396883964539, + "step": 1330 + }, + { + "epoch": 0.22, + "learning_rate": 4.7870270731001976e-06, + "logits/chosen": -1.762141466140747, + "logits/rejected": -1.8428550958633423, + "logps/chosen": -243.0557861328125, + "logps/rejected": -317.4654846191406, + "loss": 0.6918, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03446834534406662, + "rewards/margins": 0.06733662635087967, + "rewards/rejected": -0.10180497169494629, + "step": 1340 + }, + { + "epoch": 0.22, + "learning_rate": 4.781223427463199e-06, + "logits/chosen": -1.979488730430603, + "logits/rejected": -1.8432090282440186, + "logps/chosen": -253.4580841064453, + "logps/rejected": -232.4419403076172, + "loss": 0.6915, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.03006628155708313, + "rewards/margins": 0.03117140755057335, + "rewards/rejected": -0.06123769283294678, + "step": 1350 + }, + { + "epoch": 0.22, + "learning_rate": 4.775345380085204e-06, + "logits/chosen": -2.140706777572632, + "logits/rejected": -1.8629486560821533, + "logps/chosen": -304.911376953125, + "logps/rejected": -281.47320556640625, + "loss": 0.6902, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.011940655298531055, + "rewards/margins": 0.10842498391866684, + "rewards/rejected": -0.12036565691232681, + "step": 1360 + }, + { + "epoch": 0.22, + "learning_rate": 4.769393122677799e-06, + "logits/chosen": -1.8387647867202759, + "logits/rejected": -1.7667442560195923, + "logps/chosen": -239.5221405029297, + "logps/rejected": -241.0010223388672, + "loss": 0.6906, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.013345925137400627, + "rewards/margins": 0.07718405872583389, + "rewards/rejected": -0.09052999317646027, + "step": 1370 + }, + { + "epoch": 0.23, + "learning_rate": 4.763366849372918e-06, + "logits/chosen": -1.8982566595077515, + "logits/rejected": -1.8602701425552368, + "logps/chosen": -253.1553192138672, + "logps/rejected": -246.602783203125, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016622189432382584, + "rewards/margins": 0.09810791909694672, + "rewards/rejected": -0.08148573338985443, + "step": 1380 + }, + { + "epoch": 0.23, + "learning_rate": 4.757266756716509e-06, + "logits/chosen": -2.005530834197998, + "logits/rejected": -1.9663074016571045, + "logps/chosen": -231.611572265625, + "logps/rejected": -231.8648681640625, + "loss": 0.692, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00047435759915970266, + "rewards/margins": 0.06099332496523857, + "rewards/rejected": -0.061467695981264114, + "step": 1390 + }, + { + "epoch": 0.23, + "learning_rate": 4.75109304366213e-06, + "logits/chosen": -1.8243262767791748, + "logits/rejected": -1.723083257675171, + "logps/chosen": -251.80477905273438, + "logps/rejected": -280.6353454589844, + "loss": 0.6861, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.015173451974987984, + "rewards/margins": 0.12839651107788086, + "rewards/rejected": -0.1435699462890625, + "step": 1400 + }, + { + "epoch": 0.23, + "learning_rate": 4.744845911564454e-06, + "logits/chosen": -2.0357000827789307, + "logits/rejected": -1.8238598108291626, + "logps/chosen": -345.0341796875, + "logps/rejected": -342.579833984375, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.013357339426875114, + "rewards/margins": 0.10512367635965347, + "rewards/rejected": -0.0917663425207138, + "step": 1410 + }, + { + "epoch": 0.23, + "learning_rate": 4.738525564172707e-06, + "logits/chosen": -1.8863226175308228, + "logits/rejected": -1.7519323825836182, + "logps/chosen": -255.8773956298828, + "logps/rejected": -263.6374816894531, + "loss": 0.6899, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.01787368394434452, + "rewards/margins": 0.07147493213415146, + "rewards/rejected": -0.05360124632716179, + "step": 1420 + }, + { + "epoch": 0.23, + "learning_rate": 4.732132207624017e-06, + "logits/chosen": -2.0990707874298096, + "logits/rejected": -1.927489995956421, + "logps/chosen": -245.4756622314453, + "logps/rejected": -276.53131103515625, + "loss": 0.6883, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.015036756172776222, + "rewards/margins": 0.10826150327920914, + "rewards/rejected": -0.09322474151849747, + "step": 1430 + }, + { + "epoch": 0.24, + "learning_rate": 4.725666050436697e-06, + "logits/chosen": -2.1977810859680176, + "logits/rejected": -1.7903363704681396, + "logps/chosen": -304.98193359375, + "logps/rejected": -277.96807861328125, + "loss": 0.6901, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.07072176039218903, + "rewards/margins": 0.13287675380706787, + "rewards/rejected": -0.062154997140169144, + "step": 1440 + }, + { + "epoch": 0.24, + "learning_rate": 4.719127303503439e-06, + "logits/chosen": -2.001084804534912, + "logits/rejected": -1.7518584728240967, + "logps/chosen": -220.5527801513672, + "logps/rejected": -196.35304260253906, + "loss": 0.6918, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.035161178559064865, + "rewards/margins": 0.07417868077754974, + "rewards/rejected": -0.03901750594377518, + "step": 1450 + }, + { + "epoch": 0.24, + "learning_rate": 4.712516180084441e-06, + "logits/chosen": -2.0459303855895996, + "logits/rejected": -1.9369564056396484, + "logps/chosen": -239.44668579101562, + "logps/rejected": -242.6125030517578, + "loss": 0.6852, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.021737772971391678, + "rewards/margins": 0.14017881453037262, + "rewards/rejected": -0.11844104528427124, + "step": 1460 + }, + { + "epoch": 0.24, + "learning_rate": 4.705832895800445e-06, + "logits/chosen": -1.99789297580719, + "logits/rejected": -1.8965266942977905, + "logps/chosen": -237.51626586914062, + "logps/rejected": -256.2833557128906, + "loss": 0.6909, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03856717422604561, + "rewards/margins": 0.10859853029251099, + "rewards/rejected": -0.07003135979175568, + "step": 1470 + }, + { + "epoch": 0.24, + "learning_rate": 4.699077668625711e-06, + "logits/chosen": -1.879766821861267, + "logits/rejected": -1.9450807571411133, + "logps/chosen": -251.4164581298828, + "logps/rejected": -258.92022705078125, + "loss": 0.6887, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02113732323050499, + "rewards/margins": 0.09613610059022903, + "rewards/rejected": -0.07499876618385315, + "step": 1480 + }, + { + "epoch": 0.24, + "learning_rate": 4.692250718880904e-06, + "logits/chosen": -1.9303529262542725, + "logits/rejected": -1.8214197158813477, + "logps/chosen": -255.1653594970703, + "logps/rejected": -275.32781982421875, + "loss": 0.6911, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0008279670146293938, + "rewards/margins": 0.08050056546926498, + "rewards/rejected": -0.07967259734869003, + "step": 1490 + }, + { + "epoch": 0.25, + "learning_rate": 4.685352269225909e-06, + "logits/chosen": -1.8300201892852783, + "logits/rejected": -1.743168592453003, + "logps/chosen": -242.4856414794922, + "logps/rejected": -271.44696044921875, + "loss": 0.6897, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.010996214114129543, + "rewards/margins": 0.10735423862934113, + "rewards/rejected": -0.09635802358388901, + "step": 1500 + }, + { + "epoch": 0.25, + "learning_rate": 4.67838254465257e-06, + "logits/chosen": -2.0610198974609375, + "logits/rejected": -1.8466627597808838, + "logps/chosen": -226.02304077148438, + "logps/rejected": -228.15628051757812, + "loss": 0.6898, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02478194609284401, + "rewards/margins": 0.0838068276643753, + "rewards/rejected": -0.059024881571531296, + "step": 1510 + }, + { + "epoch": 0.25, + "learning_rate": 4.6713417724773505e-06, + "logits/chosen": -1.976702094078064, + "logits/rejected": -1.9141117334365845, + "logps/chosen": -253.24337768554688, + "logps/rejected": -250.56494140625, + "loss": 0.6916, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.02221101149916649, + "rewards/margins": 0.040845468640327454, + "rewards/rejected": -0.018634457141160965, + "step": 1520 + }, + { + "epoch": 0.25, + "learning_rate": 4.66423018233392e-06, + "logits/chosen": -1.9664026498794556, + "logits/rejected": -1.6477758884429932, + "logps/chosen": -234.92788696289062, + "logps/rejected": -250.49813842773438, + "loss": 0.6881, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.05969662591814995, + "rewards/margins": 0.11318562924861908, + "rewards/rejected": -0.053488992154598236, + "step": 1530 + }, + { + "epoch": 0.25, + "learning_rate": 4.657048006165666e-06, + "logits/chosen": -1.9387773275375366, + "logits/rejected": -1.9748141765594482, + "logps/chosen": -233.7736053466797, + "logps/rejected": -257.48394775390625, + "loss": 0.6904, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.06510043889284134, + "rewards/margins": 0.0782768577337265, + "rewards/rejected": -0.013176411390304565, + "step": 1540 + }, + { + "epoch": 0.25, + "learning_rate": 4.649795478218127e-06, + "logits/chosen": -2.018745183944702, + "logits/rejected": -1.8233848810195923, + "logps/chosen": -311.59088134765625, + "logps/rejected": -298.1930847167969, + "loss": 0.6922, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.015644565224647522, + "rewards/margins": 0.05338485911488533, + "rewards/rejected": -0.03774028643965721, + "step": 1550 + }, + { + "epoch": 0.26, + "learning_rate": 4.6424728350313545e-06, + "logits/chosen": -1.920920729637146, + "logits/rejected": -2.090101718902588, + "logps/chosen": -227.4725341796875, + "logps/rejected": -269.5147705078125, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009294538758695126, + "rewards/margins": 0.0551295168697834, + "rewards/rejected": -0.045834980905056, + "step": 1560 + }, + { + "epoch": 0.26, + "learning_rate": 4.635080315432196e-06, + "logits/chosen": -1.9260780811309814, + "logits/rejected": -1.7062475681304932, + "logps/chosen": -279.51788330078125, + "logps/rejected": -260.5428466796875, + "loss": 0.6895, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.008236927911639214, + "rewards/margins": 0.09086431562900543, + "rewards/rejected": -0.08262738585472107, + "step": 1570 + }, + { + "epoch": 0.26, + "learning_rate": 4.627618160526509e-06, + "logits/chosen": -1.9100372791290283, + "logits/rejected": -1.9197540283203125, + "logps/chosen": -218.99429321289062, + "logps/rejected": -278.8215637207031, + "loss": 0.6904, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.03638831898570061, + "rewards/margins": 0.07872694730758667, + "rewards/rejected": -0.042338620871305466, + "step": 1580 + }, + { + "epoch": 0.26, + "learning_rate": 4.6200866136912946e-06, + "logits/chosen": -2.0397140979766846, + "logits/rejected": -2.0222744941711426, + "logps/chosen": -280.08892822265625, + "logps/rejected": -324.85638427734375, + "loss": 0.688, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06009145453572273, + "rewards/margins": 0.10047173500061035, + "rewards/rejected": -0.04038027673959732, + "step": 1590 + }, + { + "epoch": 0.26, + "learning_rate": 4.61248592056676e-06, + "logits/chosen": -1.9272651672363281, + "logits/rejected": -1.770471215248108, + "logps/chosen": -188.28184509277344, + "logps/rejected": -177.3421630859375, + "loss": 0.6888, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.05323566868901253, + "rewards/margins": 0.09085603058338165, + "rewards/rejected": -0.03762035816907883, + "step": 1600 + }, + { + "epoch": 0.26, + "learning_rate": 4.604816329048309e-06, + "logits/chosen": -2.0539746284484863, + "logits/rejected": -1.9116235971450806, + "logps/chosen": -241.92007446289062, + "logps/rejected": -275.4397888183594, + "loss": 0.6897, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.08154787123203278, + "rewards/margins": 0.10853314399719238, + "rewards/rejected": -0.0269852876663208, + "step": 1610 + }, + { + "epoch": 0.26, + "learning_rate": 4.5970780892784545e-06, + "logits/chosen": -2.0517537593841553, + "logits/rejected": -1.8177028894424438, + "logps/chosen": -266.09051513671875, + "logps/rejected": -237.227294921875, + "loss": 0.6867, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.017022933810949326, + "rewards/margins": 0.11547992378473282, + "rewards/rejected": -0.0984569862484932, + "step": 1620 + }, + { + "epoch": 0.27, + "learning_rate": 4.589271453638662e-06, + "logits/chosen": -2.1352388858795166, + "logits/rejected": -1.8022794723510742, + "logps/chosen": -277.32611083984375, + "logps/rejected": -220.5225067138672, + "loss": 0.6899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.028665948659181595, + "rewards/margins": 0.0959525853395462, + "rewards/rejected": -0.06728664040565491, + "step": 1630 + }, + { + "epoch": 0.27, + "learning_rate": 4.581396676741117e-06, + "logits/chosen": -2.0802879333496094, + "logits/rejected": -1.7723671197891235, + "logps/chosen": -254.9521942138672, + "logps/rejected": -240.1614532470703, + "loss": 0.685, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.026140743866562843, + "rewards/margins": 0.1523440182209015, + "rewards/rejected": -0.1262032687664032, + "step": 1640 + }, + { + "epoch": 0.27, + "learning_rate": 4.5734540154204215e-06, + "logits/chosen": -1.8795855045318604, + "logits/rejected": -2.032562732696533, + "logps/chosen": -199.833740234375, + "logps/rejected": -224.38442993164062, + "loss": 0.6893, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0465850830078125, + "rewards/margins": 0.13779515027999878, + "rewards/rejected": -0.09121006727218628, + "step": 1650 + }, + { + "epoch": 0.27, + "learning_rate": 4.5654437287252175e-06, + "logits/chosen": -2.114125967025757, + "logits/rejected": -1.6600843667984009, + "logps/chosen": -292.65789794921875, + "logps/rejected": -273.93701171875, + "loss": 0.6909, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.09322325885295868, + "rewards/margins": 0.10127697885036469, + "rewards/rejected": -0.008053727447986603, + "step": 1660 + }, + { + "epoch": 0.27, + "learning_rate": 4.557366077909737e-06, + "logits/chosen": -1.8140815496444702, + "logits/rejected": -1.8872156143188477, + "logps/chosen": -205.37716674804688, + "logps/rejected": -246.71603393554688, + "loss": 0.6887, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.042984627187252045, + "rewards/margins": 0.08470744639635086, + "rewards/rejected": -0.04172281548380852, + "step": 1670 + }, + { + "epoch": 0.27, + "learning_rate": 4.5492213264252835e-06, + "logits/chosen": -1.9818798303604126, + "logits/rejected": -1.7638969421386719, + "logps/chosen": -238.5367889404297, + "logps/rejected": -235.301025390625, + "loss": 0.6893, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.044371359050273895, + "rewards/margins": 0.09514982998371124, + "rewards/rejected": -0.05077847093343735, + "step": 1680 + }, + { + "epoch": 0.28, + "learning_rate": 4.541009739911638e-06, + "logits/chosen": -2.0621535778045654, + "logits/rejected": -1.8884509801864624, + "logps/chosen": -258.1687316894531, + "logps/rejected": -211.55001831054688, + "loss": 0.6908, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.051352836191654205, + "rewards/margins": 0.0523812361061573, + "rewards/rejected": -0.0010284058516845107, + "step": 1690 + }, + { + "epoch": 0.28, + "learning_rate": 4.5327315861883935e-06, + "logits/chosen": -2.1555848121643066, + "logits/rejected": -1.7871919870376587, + "logps/chosen": -281.6337890625, + "logps/rejected": -220.19845581054688, + "loss": 0.6912, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0495150089263916, + "rewards/margins": 0.06315966695547104, + "rewards/rejected": -0.013644659891724586, + "step": 1700 + }, + { + "epoch": 0.28, + "learning_rate": 4.524387135246223e-06, + "logits/chosen": -2.1733145713806152, + "logits/rejected": -2.035409450531006, + "logps/chosen": -215.9166259765625, + "logps/rejected": -235.7335205078125, + "loss": 0.6894, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.030558938160538673, + "rewards/margins": 0.08851588517427444, + "rewards/rejected": -0.05795693397521973, + "step": 1710 + }, + { + "epoch": 0.28, + "learning_rate": 4.515976659238075e-06, + "logits/chosen": -2.094460964202881, + "logits/rejected": -1.9709587097167969, + "logps/chosen": -218.61654663085938, + "logps/rejected": -212.4292449951172, + "loss": 0.6916, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04615918546915054, + "rewards/margins": 0.06250666081905365, + "rewards/rejected": -0.01634746976196766, + "step": 1720 + }, + { + "epoch": 0.28, + "learning_rate": 4.507500432470292e-06, + "logits/chosen": -2.191953420639038, + "logits/rejected": -1.6874326467514038, + "logps/chosen": -321.4230041503906, + "logps/rejected": -246.4324951171875, + "loss": 0.6918, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.06153164058923721, + "rewards/margins": 0.049347423017024994, + "rewards/rejected": 0.012184219434857368, + "step": 1730 + }, + { + "epoch": 0.28, + "learning_rate": 4.498958731393669e-06, + "logits/chosen": -2.066990375518799, + "logits/rejected": -1.8487269878387451, + "logps/chosen": -266.86090087890625, + "logps/rejected": -244.6568145751953, + "loss": 0.6891, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.007399003952741623, + "rewards/margins": 0.08815360814332962, + "rewards/rejected": -0.09555260837078094, + "step": 1740 + }, + { + "epoch": 0.29, + "learning_rate": 4.490351834594433e-06, + "logits/chosen": -2.1965601444244385, + "logits/rejected": -1.872551679611206, + "logps/chosen": -220.11044311523438, + "logps/rejected": -230.28695678710938, + "loss": 0.6898, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.059162575751543045, + "rewards/margins": 0.11480712890625, + "rewards/rejected": -0.055644553154706955, + "step": 1750 + }, + { + "epoch": 0.29, + "learning_rate": 4.481680022785162e-06, + "logits/chosen": -2.0913748741149902, + "logits/rejected": -1.9851386547088623, + "logps/chosen": -347.7581481933594, + "logps/rejected": -328.7959899902344, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06150129437446594, + "rewards/margins": 0.11646108329296112, + "rewards/rejected": -0.054959796369075775, + "step": 1760 + }, + { + "epoch": 0.29, + "learning_rate": 4.472943578795626e-06, + "logits/chosen": -2.03053617477417, + "logits/rejected": -1.6224664449691772, + "logps/chosen": -264.9629821777344, + "logps/rejected": -208.62155151367188, + "loss": 0.6877, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.05098335072398186, + "rewards/margins": 0.10491319000720978, + "rewards/rejected": -0.05392984673380852, + "step": 1770 + }, + { + "epoch": 0.29, + "learning_rate": 4.464142787563562e-06, + "logits/chosen": -1.9274095296859741, + "logits/rejected": -1.6490062475204468, + "logps/chosen": -260.8162536621094, + "logps/rejected": -251.1756591796875, + "loss": 0.6889, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.06346863508224487, + "rewards/margins": 0.10570771992206573, + "rewards/rejected": -0.04223908483982086, + "step": 1780 + }, + { + "epoch": 0.29, + "learning_rate": 4.455277936125383e-06, + "logits/chosen": -2.20070219039917, + "logits/rejected": -1.8151031732559204, + "logps/chosen": -234.2545928955078, + "logps/rejected": -166.97052001953125, + "loss": 0.6892, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04467929154634476, + "rewards/margins": 0.08422483503818512, + "rewards/rejected": -0.039545536041259766, + "step": 1790 + }, + { + "epoch": 0.29, + "learning_rate": 4.446349313606816e-06, + "logits/chosen": -2.0176234245300293, + "logits/rejected": -1.7023462057113647, + "logps/chosen": -274.6122131347656, + "logps/rejected": -245.50003051757812, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.033081669360399246, + "rewards/margins": 0.10413549095392227, + "rewards/rejected": -0.07105381786823273, + "step": 1800 + }, + { + "epoch": 0.3, + "learning_rate": 4.437357211213471e-06, + "logits/chosen": -1.9835269451141357, + "logits/rejected": -1.9444377422332764, + "logps/chosen": -246.85665893554688, + "logps/rejected": -286.31427001953125, + "loss": 0.688, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04472666233778, + "rewards/margins": 0.11357901990413666, + "rewards/rejected": -0.06885236501693726, + "step": 1810 + }, + { + "epoch": 0.3, + "learning_rate": 4.428301922221343e-06, + "logits/chosen": -1.8060262203216553, + "logits/rejected": -1.603062629699707, + "logps/chosen": -205.8793487548828, + "logps/rejected": -193.0597381591797, + "loss": 0.6877, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.024565210565924644, + "rewards/margins": 0.12199556827545166, + "rewards/rejected": -0.09743036329746246, + "step": 1820 + }, + { + "epoch": 0.3, + "learning_rate": 4.41918374196725e-06, + "logits/chosen": -1.9662978649139404, + "logits/rejected": -1.8880643844604492, + "logps/chosen": -300.921142578125, + "logps/rejected": -292.61077880859375, + "loss": 0.6895, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.06979396939277649, + "rewards/margins": 0.13754060864448547, + "rewards/rejected": -0.06774662435054779, + "step": 1830 + }, + { + "epoch": 0.3, + "learning_rate": 4.410002967839198e-06, + "logits/chosen": -2.1292946338653564, + "logits/rejected": -1.7908376455307007, + "logps/chosen": -212.7253875732422, + "logps/rejected": -207.7818145751953, + "loss": 0.6905, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.060895007103681564, + "rewards/margins": 0.04343985393643379, + "rewards/rejected": 0.017455147579312325, + "step": 1840 + }, + { + "epoch": 0.3, + "learning_rate": 4.400759899266683e-06, + "logits/chosen": -1.9126943349838257, + "logits/rejected": -1.5380419492721558, + "logps/chosen": -263.036376953125, + "logps/rejected": -212.6478271484375, + "loss": 0.6895, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07238466292619705, + "rewards/margins": 0.09089400619268417, + "rewards/rejected": -0.01850934885442257, + "step": 1850 + }, + { + "epoch": 0.3, + "learning_rate": 4.391454837710921e-06, + "logits/chosen": -1.7518205642700195, + "logits/rejected": -1.7788965702056885, + "logps/chosen": -218.858642578125, + "logps/rejected": -259.80511474609375, + "loss": 0.6925, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0318743959069252, + "rewards/margins": 0.09685514867305756, + "rewards/rejected": -0.12872955203056335, + "step": 1860 + }, + { + "epoch": 0.31, + "learning_rate": 4.382088086655023e-06, + "logits/chosen": -1.8783429861068726, + "logits/rejected": -1.7901151180267334, + "logps/chosen": -216.3297882080078, + "logps/rejected": -254.869140625, + "loss": 0.6873, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.061540424823760986, + "rewards/margins": 0.14867649972438812, + "rewards/rejected": -0.08713604509830475, + "step": 1870 + }, + { + "epoch": 0.31, + "learning_rate": 4.3726599515940926e-06, + "logits/chosen": -1.9793615341186523, + "logits/rejected": -1.9763129949569702, + "logps/chosen": -232.66940307617188, + "logps/rejected": -276.1132507324219, + "loss": 0.6894, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01744922623038292, + "rewards/margins": 0.10686904191970825, + "rewards/rejected": -0.08941982686519623, + "step": 1880 + }, + { + "epoch": 0.31, + "learning_rate": 4.363170740025261e-06, + "logits/chosen": -1.8423244953155518, + "logits/rejected": -1.7792813777923584, + "logps/chosen": -247.7403106689453, + "logps/rejected": -269.8460388183594, + "loss": 0.687, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.009260248392820358, + "rewards/margins": 0.09420810639858246, + "rewards/rejected": -0.084947869181633, + "step": 1890 + }, + { + "epoch": 0.31, + "learning_rate": 4.353620761437663e-06, + "logits/chosen": -1.9299675226211548, + "logits/rejected": -1.7542650699615479, + "logps/chosen": -215.9551544189453, + "logps/rejected": -201.03982543945312, + "loss": 0.6902, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.026891371235251427, + "rewards/margins": 0.08191802352666855, + "rewards/rejected": -0.05502665787935257, + "step": 1900 + }, + { + "epoch": 0.31, + "learning_rate": 4.344010327302338e-06, + "logits/chosen": -2.143010139465332, + "logits/rejected": -1.953432321548462, + "logps/chosen": -262.2422180175781, + "logps/rejected": -225.7523193359375, + "loss": 0.6896, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0065484484657645226, + "rewards/margins": 0.10255619138479233, + "rewards/rejected": -0.09600772708654404, + "step": 1910 + }, + { + "epoch": 0.31, + "learning_rate": 4.334339751062073e-06, + "logits/chosen": -2.1119582653045654, + "logits/rejected": -1.9017441272735596, + "logps/chosen": -259.5169677734375, + "logps/rejected": -253.2615509033203, + "loss": 0.6897, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.03497769683599472, + "rewards/margins": 0.07566927373409271, + "rewards/rejected": -0.04069158434867859, + "step": 1920 + }, + { + "epoch": 0.32, + "learning_rate": 4.324609348121184e-06, + "logits/chosen": -1.9002918004989624, + "logits/rejected": -1.7799943685531616, + "logps/chosen": -289.1358337402344, + "logps/rejected": -263.1283874511719, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.020220216363668442, + "rewards/margins": 0.055554766207933426, + "rewards/rejected": -0.03533454239368439, + "step": 1930 + }, + { + "epoch": 0.32, + "learning_rate": 4.3148194358352194e-06, + "logits/chosen": -1.9387191534042358, + "logits/rejected": -1.8383386135101318, + "logps/chosen": -238.0577392578125, + "logps/rejected": -263.00091552734375, + "loss": 0.6906, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.030178342014551163, + "rewards/margins": 0.07893277704715729, + "rewards/rejected": -0.10911110788583755, + "step": 1940 + }, + { + "epoch": 0.32, + "learning_rate": 4.304970333500621e-06, + "logits/chosen": -1.9874976873397827, + "logits/rejected": -1.8078399896621704, + "logps/chosen": -214.51748657226562, + "logps/rejected": -224.7821807861328, + "loss": 0.6888, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0010104458779096603, + "rewards/margins": 0.12157710641622543, + "rewards/rejected": -0.12258754670619965, + "step": 1950 + }, + { + "epoch": 0.32, + "learning_rate": 4.295062362344301e-06, + "logits/chosen": -1.9883317947387695, + "logits/rejected": -1.8289381265640259, + "logps/chosen": -297.0546875, + "logps/rejected": -311.4903259277344, + "loss": 0.6895, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.004276087507605553, + "rewards/margins": 0.09024181216955185, + "rewards/rejected": -0.09451790153980255, + "step": 1960 + }, + { + "epoch": 0.32, + "learning_rate": 4.285095845513172e-06, + "logits/chosen": -1.909839391708374, + "logits/rejected": -1.6441818475723267, + "logps/chosen": -286.56512451171875, + "logps/rejected": -246.447265625, + "loss": 0.6895, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0007835488650016487, + "rewards/margins": 0.10691998153924942, + "rewards/rejected": -0.10613642632961273, + "step": 1970 + }, + { + "epoch": 0.32, + "learning_rate": 4.275071108063602e-06, + "logits/chosen": -1.9651696681976318, + "logits/rejected": -1.974016547203064, + "logps/chosen": -238.3361053466797, + "logps/rejected": -281.3122863769531, + "loss": 0.6882, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03040384128689766, + "rewards/margins": 0.0969507172703743, + "rewards/rejected": -0.06654687225818634, + "step": 1980 + }, + { + "epoch": 0.33, + "learning_rate": 4.2649884769508165e-06, + "logits/chosen": -2.0062415599823, + "logits/rejected": -1.7536312341690063, + "logps/chosen": -257.6700744628906, + "logps/rejected": -285.87066650390625, + "loss": 0.6892, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00508440425619483, + "rewards/margins": 0.09233252704143524, + "rewards/rejected": -0.09741692990064621, + "step": 1990 + }, + { + "epoch": 0.33, + "learning_rate": 4.254848281018232e-06, + "logits/chosen": -1.959925889968872, + "logits/rejected": -1.8569157123565674, + "logps/chosen": -269.3879699707031, + "logps/rejected": -264.1894836425781, + "loss": 0.6909, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011408627033233643, + "rewards/margins": 0.08169892430305481, + "rewards/rejected": -0.09310754388570786, + "step": 2000 + }, + { + "epoch": 0.33, + "learning_rate": 4.244650850986735e-06, + "logits/chosen": -1.8479160070419312, + "logits/rejected": -1.805991530418396, + "logps/chosen": -257.6282653808594, + "logps/rejected": -279.82855224609375, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004832814447581768, + "rewards/margins": 0.12820498645305634, + "rewards/rejected": -0.13303782045841217, + "step": 2010 + }, + { + "epoch": 0.33, + "learning_rate": 4.234396519443892e-06, + "logits/chosen": -1.9335193634033203, + "logits/rejected": -1.7317018508911133, + "logps/chosen": -301.556884765625, + "logps/rejected": -302.5426025390625, + "loss": 0.6868, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.016016531735658646, + "rewards/margins": 0.13123200833797455, + "rewards/rejected": -0.1152154803276062, + "step": 2020 + }, + { + "epoch": 0.33, + "learning_rate": 4.2240856208331025e-06, + "logits/chosen": -1.9219344854354858, + "logits/rejected": -2.0105109214782715, + "logps/chosen": -295.91668701171875, + "logps/rejected": -289.79705810546875, + "loss": 0.6932, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.013536686077713966, + "rewards/margins": 0.03441958501935005, + "rewards/rejected": -0.04795626923441887, + "step": 2030 + }, + { + "epoch": 0.33, + "learning_rate": 4.2137184914426936e-06, + "logits/chosen": -1.9980261325836182, + "logits/rejected": -2.0902438163757324, + "logps/chosen": -203.69271850585938, + "logps/rejected": -251.0220184326172, + "loss": 0.6905, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0004331640084274113, + "rewards/margins": 0.06452645361423492, + "rewards/rejected": -0.06495962291955948, + "step": 2040 + }, + { + "epoch": 0.34, + "learning_rate": 4.2032954693949465e-06, + "logits/chosen": -2.0044169425964355, + "logits/rejected": -1.9429712295532227, + "logps/chosen": -226.9346160888672, + "logps/rejected": -226.55429077148438, + "loss": 0.6895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.008278614841401577, + "rewards/margins": 0.07847534120082855, + "rewards/rejected": -0.0867539495229721, + "step": 2050 + }, + { + "epoch": 0.34, + "learning_rate": 4.192816894635079e-06, + "logits/chosen": -1.9002869129180908, + "logits/rejected": -1.6745954751968384, + "logps/chosen": -292.475830078125, + "logps/rejected": -301.7112121582031, + "loss": 0.6883, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01651707850396633, + "rewards/margins": 0.12277685105800629, + "rewards/rejected": -0.13929395377635956, + "step": 2060 + }, + { + "epoch": 0.34, + "learning_rate": 4.182283108920146e-06, + "logits/chosen": -2.211832046508789, + "logits/rejected": -1.8905013799667358, + "logps/chosen": -336.63385009765625, + "logps/rejected": -278.13494873046875, + "loss": 0.6907, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.004111937712877989, + "rewards/margins": 0.09613887965679169, + "rewards/rejected": -0.10025081783533096, + "step": 2070 + }, + { + "epoch": 0.34, + "learning_rate": 4.171694455807903e-06, + "logits/chosen": -1.7019582986831665, + "logits/rejected": -1.6689764261245728, + "logps/chosen": -236.54074096679688, + "logps/rejected": -237.20108032226562, + "loss": 0.6923, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03014719858765602, + "rewards/margins": 0.05340196564793587, + "rewards/rejected": -0.023254770785570145, + "step": 2080 + }, + { + "epoch": 0.34, + "learning_rate": 4.1610512806455926e-06, + "logits/chosen": -1.9093754291534424, + "logits/rejected": -1.8548517227172852, + "logps/chosen": -260.9023742675781, + "logps/rejected": -244.33169555664062, + "loss": 0.6876, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.023425322026014328, + "rewards/margins": 0.11010787636041641, + "rewards/rejected": -0.08668255805969238, + "step": 2090 + }, + { + "epoch": 0.34, + "learning_rate": 4.150353930558691e-06, + "logits/chosen": -1.9689515829086304, + "logits/rejected": -2.1082370281219482, + "logps/chosen": -236.096923828125, + "logps/rejected": -248.3663330078125, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05312788486480713, + "rewards/margins": 0.05502711981534958, + "rewards/rejected": -0.00189923495054245, + "step": 2100 + }, + { + "epoch": 0.35, + "learning_rate": 4.139602754439576e-06, + "logits/chosen": -2.0180156230926514, + "logits/rejected": -1.7726625204086304, + "logps/chosen": -274.1260070800781, + "logps/rejected": -253.8714599609375, + "loss": 0.6894, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05249173194169998, + "rewards/margins": 0.09263677150011063, + "rewards/rejected": -0.04014504700899124, + "step": 2110 + }, + { + "epoch": 0.35, + "learning_rate": 4.128798102936155e-06, + "logits/chosen": -2.0540578365325928, + "logits/rejected": -1.8796707391738892, + "logps/chosen": -250.83358764648438, + "logps/rejected": -250.8741912841797, + "loss": 0.6895, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.050901491194963455, + "rewards/margins": 0.1051471009850502, + "rewards/rejected": -0.05424562841653824, + "step": 2120 + }, + { + "epoch": 0.35, + "learning_rate": 4.117940328440426e-06, + "logits/chosen": -1.841291069984436, + "logits/rejected": -1.874315619468689, + "logps/chosen": -219.38510131835938, + "logps/rejected": -236.6707763671875, + "loss": 0.6887, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.053682975471019745, + "rewards/margins": 0.10231509059667587, + "rewards/rejected": -0.04863210767507553, + "step": 2130 + }, + { + "epoch": 0.35, + "learning_rate": 4.107029785076989e-06, + "logits/chosen": -2.0353333950042725, + "logits/rejected": -1.9927241802215576, + "logps/chosen": -234.94845581054688, + "logps/rejected": -220.33242797851562, + "loss": 0.6895, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04302379488945007, + "rewards/margins": 0.0878501757979393, + "rewards/rejected": -0.044826384633779526, + "step": 2140 + }, + { + "epoch": 0.35, + "learning_rate": 4.0960668286914855e-06, + "logits/chosen": -1.897815465927124, + "logits/rejected": -1.9691070318222046, + "logps/chosen": -248.922607421875, + "logps/rejected": -350.72259521484375, + "loss": 0.6887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.050051648169755936, + "rewards/margins": 0.11443950235843658, + "rewards/rejected": -0.06438785791397095, + "step": 2150 + }, + { + "epoch": 0.35, + "learning_rate": 4.085051816839003e-06, + "logits/chosen": -2.1646692752838135, + "logits/rejected": -1.8242400884628296, + "logps/chosen": -289.3182373046875, + "logps/rejected": -270.5400390625, + "loss": 0.6884, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.026032155379652977, + "rewards/margins": 0.06948091834783554, + "rewards/rejected": -0.043448761105537415, + "step": 2160 + }, + { + "epoch": 0.35, + "learning_rate": 4.073985108772409e-06, + "logits/chosen": -2.0142769813537598, + "logits/rejected": -1.8925861120224, + "logps/chosen": -315.774169921875, + "logps/rejected": -383.51116943359375, + "loss": 0.689, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.05054203420877457, + "rewards/margins": 0.07194459438323975, + "rewards/rejected": -0.02140256203711033, + "step": 2170 + }, + { + "epoch": 0.36, + "learning_rate": 4.062867065430639e-06, + "logits/chosen": -1.92087721824646, + "logits/rejected": -1.9275569915771484, + "logps/chosen": -210.78140258789062, + "logps/rejected": -247.7904052734375, + "loss": 0.6909, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05119187757372856, + "rewards/margins": 0.09619946777820587, + "rewards/rejected": -0.04500759392976761, + "step": 2180 + }, + { + "epoch": 0.36, + "learning_rate": 4.051698049426912e-06, + "logits/chosen": -1.927914023399353, + "logits/rejected": -1.6795618534088135, + "logps/chosen": -239.77157592773438, + "logps/rejected": -267.0010681152344, + "loss": 0.6869, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.028442364186048508, + "rewards/margins": 0.09931979328393936, + "rewards/rejected": -0.07087743282318115, + "step": 2190 + }, + { + "epoch": 0.36, + "learning_rate": 4.04047842503692e-06, + "logits/chosen": -1.9437057971954346, + "logits/rejected": -1.717492699623108, + "logps/chosen": -279.0477294921875, + "logps/rejected": -281.88385009765625, + "loss": 0.6897, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04171605780720711, + "rewards/margins": 0.10504599660634995, + "rewards/rejected": -0.06332994252443314, + "step": 2200 + }, + { + "epoch": 0.36, + "learning_rate": 4.029208558186936e-06, + "logits/chosen": -1.9665520191192627, + "logits/rejected": -1.713425874710083, + "logps/chosen": -389.9446716308594, + "logps/rejected": -293.543212890625, + "loss": 0.6909, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.048733510076999664, + "rewards/margins": 0.09052356332540512, + "rewards/rejected": -0.04179006069898605, + "step": 2210 + }, + { + "epoch": 0.36, + "learning_rate": 4.017888816441883e-06, + "logits/chosen": -1.9207618236541748, + "logits/rejected": -1.786516547203064, + "logps/chosen": -212.2977752685547, + "logps/rejected": -213.4353485107422, + "loss": 0.6887, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.09640399366617203, + "rewards/margins": 0.13502225279808044, + "rewards/rejected": -0.038618285208940506, + "step": 2220 + }, + { + "epoch": 0.36, + "learning_rate": 4.0065195689933455e-06, + "logits/chosen": -1.9478075504302979, + "logits/rejected": -1.9051965475082397, + "logps/chosen": -217.53732299804688, + "logps/rejected": -230.9365234375, + "loss": 0.6896, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.08981997519731522, + "rewards/margins": 0.07283227145671844, + "rewards/rejected": 0.01698770932853222, + "step": 2230 + }, + { + "epoch": 0.37, + "learning_rate": 3.995101186647531e-06, + "logits/chosen": -2.0520479679107666, + "logits/rejected": -1.9175134897232056, + "logps/chosen": -250.7217254638672, + "logps/rejected": -287.9960021972656, + "loss": 0.6889, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.07995010912418365, + "rewards/margins": 0.10398320853710175, + "rewards/rejected": -0.024033088237047195, + "step": 2240 + }, + { + "epoch": 0.37, + "learning_rate": 3.9836340418131715e-06, + "logits/chosen": -1.848076581954956, + "logits/rejected": -1.7977304458618164, + "logps/chosen": -231.99856567382812, + "logps/rejected": -272.3209533691406, + "loss": 0.6869, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.09017027169466019, + "rewards/margins": 0.1269918829202652, + "rewards/rejected": -0.03682161122560501, + "step": 2250 + }, + { + "epoch": 0.37, + "learning_rate": 3.972118508489381e-06, + "logits/chosen": -1.827183723449707, + "logits/rejected": -1.5613261461257935, + "logps/chosen": -234.29251098632812, + "logps/rejected": -240.063720703125, + "loss": 0.6899, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.09156180173158646, + "rewards/margins": 0.12153921276330948, + "rewards/rejected": -0.02997741661965847, + "step": 2260 + }, + { + "epoch": 0.37, + "learning_rate": 3.960554962253456e-06, + "logits/chosen": -1.8875722885131836, + "logits/rejected": -1.779989242553711, + "logps/chosen": -208.01077270507812, + "logps/rejected": -206.5596466064453, + "loss": 0.6868, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.10735974460840225, + "rewards/margins": 0.11069619655609131, + "rewards/rejected": -0.0033364533446729183, + "step": 2270 + }, + { + "epoch": 0.37, + "learning_rate": 3.948943780248625e-06, + "logits/chosen": -1.95138418674469, + "logits/rejected": -2.031461000442505, + "logps/chosen": -219.1524200439453, + "logps/rejected": -268.9326171875, + "loss": 0.6909, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.08899588137865067, + "rewards/margins": 0.08747906982898712, + "rewards/rejected": 0.00151681003626436, + "step": 2280 + }, + { + "epoch": 0.37, + "learning_rate": 3.937285341171753e-06, + "logits/chosen": -1.933758020401001, + "logits/rejected": -1.9266717433929443, + "logps/chosen": -250.43264770507812, + "logps/rejected": -250.0739288330078, + "loss": 0.6908, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06596992909908295, + "rewards/margins": 0.045417845249176025, + "rewards/rejected": 0.020552081987261772, + "step": 2290 + }, + { + "epoch": 0.38, + "learning_rate": 3.925580025260986e-06, + "logits/chosen": -2.103649139404297, + "logits/rejected": -1.9044959545135498, + "logps/chosen": -225.7589569091797, + "logps/rejected": -226.8832244873047, + "loss": 0.6916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04439983516931534, + "rewards/margins": 0.09360732138156891, + "rewards/rejected": -0.04920748621225357, + "step": 2300 + }, + { + "epoch": 0.38, + "learning_rate": 3.9138282142833475e-06, + "logits/chosen": -1.7480590343475342, + "logits/rejected": -1.710512399673462, + "logps/chosen": -274.48260498046875, + "logps/rejected": -271.5543518066406, + "loss": 0.6904, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07808150351047516, + "rewards/margins": 0.06703110039234161, + "rewards/rejected": 0.011050407774746418, + "step": 2310 + }, + { + "epoch": 0.38, + "learning_rate": 3.902030291522293e-06, + "logits/chosen": -1.9251468181610107, + "logits/rejected": -1.6745094060897827, + "logps/chosen": -275.0399475097656, + "logps/rejected": -254.57046508789062, + "loss": 0.6889, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.050824832171201706, + "rewards/margins": 0.1105736717581749, + "rewards/rejected": -0.05974883958697319, + "step": 2320 + }, + { + "epoch": 0.38, + "learning_rate": 3.890186641765206e-06, + "logits/chosen": -2.177738666534424, + "logits/rejected": -1.9951694011688232, + "logps/chosen": -292.4705505371094, + "logps/rejected": -250.95828247070312, + "loss": 0.6908, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.017064593732357025, + "rewards/margins": 0.04558398202061653, + "rewards/rejected": -0.028519386425614357, + "step": 2330 + }, + { + "epoch": 0.38, + "learning_rate": 3.878297651290849e-06, + "logits/chosen": -1.9223817586898804, + "logits/rejected": -1.7750184535980225, + "logps/chosen": -222.8002471923828, + "logps/rejected": -235.87588500976562, + "loss": 0.6866, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.040984295308589935, + "rewards/margins": 0.14606739580631256, + "rewards/rejected": -0.10508308559656143, + "step": 2340 + }, + { + "epoch": 0.38, + "learning_rate": 3.866363707856764e-06, + "logits/chosen": -2.006964921951294, + "logits/rejected": -1.729931116104126, + "logps/chosen": -275.171875, + "logps/rejected": -290.58624267578125, + "loss": 0.6861, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0467514768242836, + "rewards/margins": 0.14492247998714447, + "rewards/rejected": -0.09817101806402206, + "step": 2350 + }, + { + "epoch": 0.39, + "learning_rate": 3.8543852006866265e-06, + "logits/chosen": -2.016918659210205, + "logits/rejected": -1.591439962387085, + "logps/chosen": -242.85928344726562, + "logps/rejected": -177.7174072265625, + "loss": 0.6909, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02702418901026249, + "rewards/margins": 0.08906533569097519, + "rewards/rejected": -0.06204115226864815, + "step": 2360 + }, + { + "epoch": 0.39, + "learning_rate": 3.842362520457553e-06, + "logits/chosen": -2.021554470062256, + "logits/rejected": -1.9175100326538086, + "logps/chosen": -295.9591979980469, + "logps/rejected": -246.7713165283203, + "loss": 0.6904, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.037607692182064056, + "rewards/margins": 0.0755528062582016, + "rewards/rejected": -0.03794512152671814, + "step": 2370 + }, + { + "epoch": 0.39, + "learning_rate": 3.830296059287355e-06, + "logits/chosen": -1.8837461471557617, + "logits/rejected": -1.753252387046814, + "logps/chosen": -287.8062438964844, + "logps/rejected": -313.91961669921875, + "loss": 0.6894, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04382115602493286, + "rewards/margins": 0.10359911620616913, + "rewards/rejected": -0.05977797508239746, + "step": 2380 + }, + { + "epoch": 0.39, + "learning_rate": 3.818186210721755e-06, + "logits/chosen": -2.1759252548217773, + "logits/rejected": -1.90180242061615, + "logps/chosen": -265.94354248046875, + "logps/rejected": -239.5464630126953, + "loss": 0.6868, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.05228731036186218, + "rewards/margins": 0.14388349652290344, + "rewards/rejected": -0.09159618616104126, + "step": 2390 + }, + { + "epoch": 0.39, + "learning_rate": 3.8060333697215457e-06, + "logits/chosen": -2.0004665851593018, + "logits/rejected": -1.9739125967025757, + "logps/chosen": -287.1126403808594, + "logps/rejected": -318.0423583984375, + "loss": 0.6895, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.04832135885953903, + "rewards/margins": 0.12257120758295059, + "rewards/rejected": -0.07424985617399216, + "step": 2400 + }, + { + "epoch": 0.39, + "learning_rate": 3.793837932649713e-06, + "logits/chosen": -1.977357268333435, + "logits/rejected": -1.8086156845092773, + "logps/chosen": -256.2596130371094, + "logps/rejected": -236.8079071044922, + "loss": 0.6875, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05115659162402153, + "rewards/margins": 0.10816211998462677, + "rewards/rejected": -0.05700553581118584, + "step": 2410 + }, + { + "epoch": 0.4, + "learning_rate": 3.7816002972585077e-06, + "logits/chosen": -1.8654048442840576, + "logits/rejected": -1.5370529890060425, + "logps/chosen": -235.6540069580078, + "logps/rejected": -229.72128295898438, + "loss": 0.6884, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.012810093350708485, + "rewards/margins": 0.11013920605182648, + "rewards/rejected": -0.09732911735773087, + "step": 2420 + }, + { + "epoch": 0.4, + "learning_rate": 3.7693208626764695e-06, + "logits/chosen": -1.7841002941131592, + "logits/rejected": -2.0260298252105713, + "logps/chosen": -235.7138214111328, + "logps/rejected": -288.21697998046875, + "loss": 0.6938, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.011109807528555393, + "rewards/margins": 0.052274029701948166, + "rewards/rejected": -0.04116421937942505, + "step": 2430 + }, + { + "epoch": 0.4, + "learning_rate": 3.7570000293954147e-06, + "logits/chosen": -1.9099944829940796, + "logits/rejected": -1.7196029424667358, + "logps/chosen": -279.9646911621094, + "logps/rejected": -260.13299560546875, + "loss": 0.6889, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.05946454405784607, + "rewards/margins": 0.1111823320388794, + "rewards/rejected": -0.05171779915690422, + "step": 2440 + }, + { + "epoch": 0.4, + "learning_rate": 3.7446381992573687e-06, + "logits/chosen": -1.9336166381835938, + "logits/rejected": -1.6217072010040283, + "logps/chosen": -287.04388427734375, + "logps/rejected": -240.9578857421875, + "loss": 0.6912, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0196889229118824, + "rewards/margins": 0.1250876486301422, + "rewards/rejected": -0.10539872944355011, + "step": 2450 + }, + { + "epoch": 0.4, + "learning_rate": 3.732235775441467e-06, + "logits/chosen": -1.7849498987197876, + "logits/rejected": -1.9439353942871094, + "logps/chosen": -225.54544067382812, + "logps/rejected": -240.29891967773438, + "loss": 0.6898, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.008863620460033417, + "rewards/margins": 0.053713519126176834, + "rewards/rejected": -0.044849902391433716, + "step": 2460 + }, + { + "epoch": 0.4, + "learning_rate": 3.719793162450799e-06, + "logits/chosen": -1.977881669998169, + "logits/rejected": -1.8794625997543335, + "logps/chosen": -254.0738983154297, + "logps/rejected": -275.44976806640625, + "loss": 0.6897, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.004926715977489948, + "rewards/margins": 0.10369889438152313, + "rewards/rejected": -0.108625628054142, + "step": 2470 + }, + { + "epoch": 0.41, + "learning_rate": 3.707310766099216e-06, + "logits/chosen": -2.0737271308898926, + "logits/rejected": -1.7972593307495117, + "logps/chosen": -326.5340881347656, + "logps/rejected": -253.9987335205078, + "loss": 0.6874, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0273086316883564, + "rewards/margins": 0.09867372363805771, + "rewards/rejected": -0.0713651031255722, + "step": 2480 + }, + { + "epoch": 0.41, + "learning_rate": 3.6947889934981006e-06, + "logits/chosen": -1.9010915756225586, + "logits/rejected": -1.9596315622329712, + "logps/chosen": -253.33056640625, + "logps/rejected": -346.298095703125, + "loss": 0.6885, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02515488490462303, + "rewards/margins": 0.10070650279521942, + "rewards/rejected": -0.07555162161588669, + "step": 2490 + }, + { + "epoch": 0.41, + "learning_rate": 3.6822282530430843e-06, + "logits/chosen": -1.8843889236450195, + "logits/rejected": -1.9201133251190186, + "logps/chosen": -275.7688903808594, + "logps/rejected": -329.98095703125, + "loss": 0.6873, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.005144808441400528, + "rewards/margins": 0.15748818218708038, + "rewards/rejected": -0.1626329869031906, + "step": 2500 + }, + { + "epoch": 0.41, + "learning_rate": 3.6696289544007272e-06, + "logits/chosen": -1.8051694631576538, + "logits/rejected": -1.5340454578399658, + "logps/chosen": -268.29864501953125, + "logps/rejected": -235.5688934326172, + "loss": 0.6912, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.002055754419416189, + "rewards/margins": 0.12187905609607697, + "rewards/rejected": -0.1239347904920578, + "step": 2510 + }, + { + "epoch": 0.41, + "learning_rate": 3.6569915084951623e-06, + "logits/chosen": -1.8412199020385742, + "logits/rejected": -1.771681785583496, + "logps/chosen": -277.91650390625, + "logps/rejected": -257.77081298828125, + "loss": 0.6912, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.008546999655663967, + "rewards/margins": 0.10376652330160141, + "rewards/rejected": -0.11231352388858795, + "step": 2520 + }, + { + "epoch": 0.41, + "learning_rate": 3.6443163274946863e-06, + "logits/chosen": -1.9348150491714478, + "logits/rejected": -1.8355276584625244, + "logps/chosen": -323.781982421875, + "logps/rejected": -341.081298828125, + "loss": 0.6881, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.01254656445235014, + "rewards/margins": 0.09151171892881393, + "rewards/rejected": -0.10405828803777695, + "step": 2530 + }, + { + "epoch": 0.42, + "learning_rate": 3.631603824798321e-06, + "logits/chosen": -2.0608162879943848, + "logits/rejected": -1.932202935218811, + "logps/chosen": -253.56704711914062, + "logps/rejected": -242.00064086914062, + "loss": 0.6898, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02782347798347473, + "rewards/margins": 0.0919298380613327, + "rewards/rejected": -0.11975331604480743, + "step": 2540 + }, + { + "epoch": 0.42, + "learning_rate": 3.61885441502233e-06, + "logits/chosen": -1.9878294467926025, + "logits/rejected": -1.7774133682250977, + "logps/chosen": -363.02069091796875, + "logps/rejected": -338.56280517578125, + "loss": 0.6887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02506803348660469, + "rewards/margins": 0.08811817318201065, + "rewards/rejected": -0.11318621784448624, + "step": 2550 + }, + { + "epoch": 0.42, + "learning_rate": 3.6060685139866957e-06, + "logits/chosen": -1.8786475658416748, + "logits/rejected": -1.8794273138046265, + "logps/chosen": -243.3047332763672, + "logps/rejected": -245.9431915283203, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027969714254140854, + "rewards/margins": 0.08065790683031082, + "rewards/rejected": -0.10862763226032257, + "step": 2560 + }, + { + "epoch": 0.42, + "learning_rate": 3.5932465387015553e-06, + "logits/chosen": -2.08577823638916, + "logits/rejected": -1.6685502529144287, + "logps/chosen": -274.0918273925781, + "logps/rejected": -254.17758178710938, + "loss": 0.6902, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.026298820972442627, + "rewards/margins": 0.16592226922512054, + "rewards/rejected": -0.19222110509872437, + "step": 2570 + }, + { + "epoch": 0.42, + "learning_rate": 3.5803889073536034e-06, + "logits/chosen": -1.8342199325561523, + "logits/rejected": -1.8192392587661743, + "logps/chosen": -213.46896362304688, + "logps/rejected": -213.1082000732422, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027767345309257507, + "rewards/margins": 0.06983424723148346, + "rewards/rejected": -0.09760158509016037, + "step": 2580 + }, + { + "epoch": 0.42, + "learning_rate": 3.5674960392924524e-06, + "logits/chosen": -2.04657244682312, + "logits/rejected": -1.981021523475647, + "logps/chosen": -227.22311401367188, + "logps/rejected": -257.6139221191406, + "loss": 0.6895, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.021385611966252327, + "rewards/margins": 0.07885028421878815, + "rewards/rejected": -0.10023589432239532, + "step": 2590 + }, + { + "epoch": 0.43, + "learning_rate": 3.554568355016952e-06, + "logits/chosen": -2.279806613922119, + "logits/rejected": -1.9767131805419922, + "logps/chosen": -335.621826171875, + "logps/rejected": -277.49871826171875, + "loss": 0.6909, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.01808878406882286, + "rewards/margins": 0.04943593963980675, + "rewards/rejected": -0.06752472370862961, + "step": 2600 + }, + { + "epoch": 0.43, + "learning_rate": 3.5416062761614784e-06, + "logits/chosen": -2.1609983444213867, + "logits/rejected": -1.9818646907806396, + "logps/chosen": -253.66262817382812, + "logps/rejected": -284.16168212890625, + "loss": 0.6876, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0011263730702921748, + "rewards/margins": 0.09885554760694504, + "rewards/rejected": -0.09998192638158798, + "step": 2610 + }, + { + "epoch": 0.43, + "learning_rate": 3.5286102254821825e-06, + "logits/chosen": -2.008579730987549, + "logits/rejected": -1.8898801803588867, + "logps/chosen": -261.5553283691406, + "logps/rejected": -236.6925506591797, + "loss": 0.6883, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.018027102574706078, + "rewards/margins": 0.11090080440044403, + "rewards/rejected": -0.12892790138721466, + "step": 2620 + }, + { + "epoch": 0.43, + "learning_rate": 3.5155806268432e-06, + "logits/chosen": -1.947180986404419, + "logits/rejected": -1.7861868143081665, + "logps/chosen": -311.41937255859375, + "logps/rejected": -282.10113525390625, + "loss": 0.688, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.025078509002923965, + "rewards/margins": 0.11865492910146713, + "rewards/rejected": -0.1437334418296814, + "step": 2630 + }, + { + "epoch": 0.43, + "learning_rate": 3.502517905202828e-06, + "logits/chosen": -2.057283401489258, + "logits/rejected": -1.7685312032699585, + "logps/chosen": -217.6374053955078, + "logps/rejected": -201.20973205566406, + "loss": 0.6858, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012522416189312935, + "rewards/margins": 0.13865813612937927, + "rewards/rejected": -0.12613573670387268, + "step": 2640 + }, + { + "epoch": 0.43, + "learning_rate": 3.489422486599665e-06, + "logits/chosen": -1.7685298919677734, + "logits/rejected": -1.628252387046814, + "logps/chosen": -208.57266235351562, + "logps/rejected": -185.796875, + "loss": 0.6933, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.009864556603133678, + "rewards/margins": 0.054461248219013214, + "rewards/rejected": -0.04459669068455696, + "step": 2650 + }, + { + "epoch": 0.44, + "learning_rate": 3.476294798138716e-06, + "logits/chosen": -2.025513172149658, + "logits/rejected": -1.7537561655044556, + "logps/chosen": -260.26263427734375, + "logps/rejected": -236.11892700195312, + "loss": 0.6905, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0054704658687114716, + "rewards/margins": 0.06158498674631119, + "rewards/rejected": -0.056114524602890015, + "step": 2660 + }, + { + "epoch": 0.44, + "learning_rate": 3.463135267977462e-06, + "logits/chosen": -2.0419468879699707, + "logits/rejected": -1.9679908752441406, + "logps/chosen": -300.6115417480469, + "logps/rejected": -283.63238525390625, + "loss": 0.6905, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02576897107064724, + "rewards/margins": 0.06509675085544586, + "rewards/rejected": -0.03932777792215347, + "step": 2670 + }, + { + "epoch": 0.44, + "learning_rate": 3.4499443253118965e-06, + "logits/chosen": -2.0757360458374023, + "logits/rejected": -1.745690107345581, + "logps/chosen": -252.88388061523438, + "logps/rejected": -213.86734008789062, + "loss": 0.69, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04276670515537262, + "rewards/margins": 0.06988374143838882, + "rewards/rejected": -0.027117043733596802, + "step": 2680 + }, + { + "epoch": 0.44, + "learning_rate": 3.436722400362526e-06, + "logits/chosen": -2.039062738418579, + "logits/rejected": -2.000053882598877, + "logps/chosen": -196.8014373779297, + "logps/rejected": -196.71328735351562, + "loss": 0.6904, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.03877304866909981, + "rewards/margins": 0.07110652327537537, + "rewards/rejected": -0.03233347833156586, + "step": 2690 + }, + { + "epoch": 0.44, + "learning_rate": 3.423469924360341e-06, + "logits/chosen": -2.1352858543395996, + "logits/rejected": -1.8620598316192627, + "logps/chosen": -298.80645751953125, + "logps/rejected": -271.24835205078125, + "loss": 0.6843, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.019208915531635284, + "rewards/margins": 0.13577969372272491, + "rewards/rejected": -0.11657078564167023, + "step": 2700 + }, + { + "epoch": 0.44, + "learning_rate": 3.410187329532747e-06, + "logits/chosen": -1.9511646032333374, + "logits/rejected": -1.653136968612671, + "logps/chosen": -266.6639404296875, + "logps/rejected": -266.87213134765625, + "loss": 0.6893, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04101669043302536, + "rewards/margins": 0.13958540558815002, + "rewards/rejected": -0.09856870770454407, + "step": 2710 + }, + { + "epoch": 0.44, + "learning_rate": 3.3968750490894726e-06, + "logits/chosen": -1.8839699029922485, + "logits/rejected": -1.5932048559188843, + "logps/chosen": -293.69476318359375, + "logps/rejected": -267.09002685546875, + "loss": 0.691, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0013989139115437865, + "rewards/margins": 0.0521574541926384, + "rewards/rejected": -0.050758540630340576, + "step": 2720 + }, + { + "epoch": 0.45, + "learning_rate": 3.3835335172084377e-06, + "logits/chosen": -2.036839723587036, + "logits/rejected": -1.8415123224258423, + "logps/chosen": -218.12734985351562, + "logps/rejected": -227.59619140625, + "loss": 0.6897, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04816644638776779, + "rewards/margins": 0.08818621188402176, + "rewards/rejected": -0.04001976549625397, + "step": 2730 + }, + { + "epoch": 0.45, + "learning_rate": 3.3701631690215926e-06, + "logits/chosen": -1.5828115940093994, + "logits/rejected": -1.5227618217468262, + "logps/chosen": -246.00082397460938, + "logps/rejected": -253.458740234375, + "loss": 0.6902, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.026154646649956703, + "rewards/margins": 0.11396100372076035, + "rewards/rejected": -0.08780635893344879, + "step": 2740 + }, + { + "epoch": 0.45, + "learning_rate": 3.356764440600726e-06, + "logits/chosen": -1.9018070697784424, + "logits/rejected": -1.8422409296035767, + "logps/chosen": -260.69268798828125, + "logps/rejected": -268.7093811035156, + "loss": 0.6917, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.019899263978004456, + "rewards/margins": 0.07203265279531479, + "rewards/rejected": -0.05213339254260063, + "step": 2750 + }, + { + "epoch": 0.45, + "learning_rate": 3.3433377689432458e-06, + "logits/chosen": -2.0541279315948486, + "logits/rejected": -1.807157278060913, + "logps/chosen": -284.118408203125, + "logps/rejected": -263.0935974121094, + "loss": 0.6908, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02089863456785679, + "rewards/margins": 0.09378832578659058, + "rewards/rejected": -0.07288969308137894, + "step": 2760 + }, + { + "epoch": 0.45, + "learning_rate": 3.329883591957923e-06, + "logits/chosen": -1.9219194650650024, + "logits/rejected": -1.789971113204956, + "logps/chosen": -237.2939910888672, + "logps/rejected": -258.3555908203125, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.00797148235142231, + "rewards/margins": 0.04242387413978577, + "rewards/rejected": -0.050395358353853226, + "step": 2770 + }, + { + "epoch": 0.45, + "learning_rate": 3.3164023484506086e-06, + "logits/chosen": -2.1369566917419434, + "logits/rejected": -1.81415593624115, + "logps/chosen": -298.08160400390625, + "logps/rejected": -267.9482421875, + "loss": 0.6883, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04161067679524422, + "rewards/margins": 0.09494088590145111, + "rewards/rejected": -0.053330205380916595, + "step": 2780 + }, + { + "epoch": 0.46, + "learning_rate": 3.302894478109927e-06, + "logits/chosen": -1.7750136852264404, + "logits/rejected": -1.831941843032837, + "logps/chosen": -266.43804931640625, + "logps/rejected": -341.4742126464844, + "loss": 0.6878, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.037947382777929306, + "rewards/margins": 0.12771761417388916, + "rewards/rejected": -0.08977022767066956, + "step": 2790 + }, + { + "epoch": 0.46, + "learning_rate": 3.289360421492932e-06, + "logits/chosen": -2.0122945308685303, + "logits/rejected": -2.017423152923584, + "logps/chosen": -258.4410400390625, + "logps/rejected": -270.700927734375, + "loss": 0.6886, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03344310447573662, + "rewards/margins": 0.08679075539112091, + "rewards/rejected": -0.053347647190093994, + "step": 2800 + }, + { + "epoch": 0.46, + "learning_rate": 3.275800620010735e-06, + "logits/chosen": -2.064495325088501, + "logits/rejected": -1.8607257604599, + "logps/chosen": -336.98309326171875, + "logps/rejected": -258.5537109375, + "loss": 0.6907, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05154595524072647, + "rewards/margins": 0.09546490013599396, + "rewards/rejected": -0.04391894489526749, + "step": 2810 + }, + { + "epoch": 0.46, + "learning_rate": 3.262215515914118e-06, + "logits/chosen": -1.741107702255249, + "logits/rejected": -1.761845350265503, + "logps/chosen": -221.742431640625, + "logps/rejected": -267.625732421875, + "loss": 0.6885, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.06182179972529411, + "rewards/margins": 0.1309191733598709, + "rewards/rejected": -0.06909738481044769, + "step": 2820 + }, + { + "epoch": 0.46, + "learning_rate": 3.2486055522790994e-06, + "logits/chosen": -2.0962462425231934, + "logits/rejected": -2.087583065032959, + "logps/chosen": -324.60107421875, + "logps/rejected": -345.61822509765625, + "loss": 0.6896, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.015905166044831276, + "rewards/margins": 0.07873152941465378, + "rewards/rejected": -0.06282636523246765, + "step": 2830 + }, + { + "epoch": 0.46, + "learning_rate": 3.234971172992489e-06, + "logits/chosen": -2.066453456878662, + "logits/rejected": -1.9098031520843506, + "logps/chosen": -235.388671875, + "logps/rejected": -223.5497589111328, + "loss": 0.6913, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.045773353427648544, + "rewards/margins": 0.06405707448720932, + "rewards/rejected": -0.018283721059560776, + "step": 2840 + }, + { + "epoch": 0.47, + "learning_rate": 3.2213128227374084e-06, + "logits/chosen": -1.8920425176620483, + "logits/rejected": -1.9583616256713867, + "logps/chosen": -215.0764617919922, + "logps/rejected": -285.7313537597656, + "loss": 0.6912, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.06663038581609726, + "rewards/margins": 0.05271860212087631, + "rewards/rejected": 0.013911793008446693, + "step": 2850 + }, + { + "epoch": 0.47, + "learning_rate": 3.2076309469787904e-06, + "logits/chosen": -1.89080011844635, + "logits/rejected": -1.9107465744018555, + "logps/chosen": -221.8553009033203, + "logps/rejected": -265.05718994140625, + "loss": 0.6863, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.06828014552593231, + "rewards/margins": 0.1333601176738739, + "rewards/rejected": -0.06507997214794159, + "step": 2860 + }, + { + "epoch": 0.47, + "learning_rate": 3.1939259919488467e-06, + "logits/chosen": -1.8215852975845337, + "logits/rejected": -1.6720565557479858, + "logps/chosen": -216.31454467773438, + "logps/rejected": -236.94522094726562, + "loss": 0.6907, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.08463671058416367, + "rewards/margins": 0.056152183562517166, + "rewards/rejected": 0.028484534472227097, + "step": 2870 + }, + { + "epoch": 0.47, + "learning_rate": 3.180198404632516e-06, + "logits/chosen": -2.1175122261047363, + "logits/rejected": -1.9298546314239502, + "logps/chosen": -244.73086547851562, + "logps/rejected": -245.91372680664062, + "loss": 0.6903, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0812285840511322, + "rewards/margins": 0.08755110204219818, + "rewards/rejected": -0.006322516594082117, + "step": 2880 + }, + { + "epoch": 0.47, + "learning_rate": 3.1664486327528866e-06, + "logits/chosen": -2.1393871307373047, + "logits/rejected": -1.8310749530792236, + "logps/chosen": -236.1334686279297, + "logps/rejected": -265.08917236328125, + "loss": 0.6869, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.07632098346948624, + "rewards/margins": 0.10083069652318954, + "rewards/rejected": -0.024509713053703308, + "step": 2890 + }, + { + "epoch": 0.47, + "learning_rate": 3.15267712475659e-06, + "logits/chosen": -1.851496934890747, + "logits/rejected": -1.8025314807891846, + "logps/chosen": -240.52188110351562, + "logps/rejected": -262.5421142578125, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09199874103069305, + "rewards/margins": 0.08169518411159515, + "rewards/rejected": 0.010303549468517303, + "step": 2900 + }, + { + "epoch": 0.48, + "learning_rate": 3.138884329799182e-06, + "logits/chosen": -1.599341630935669, + "logits/rejected": -1.730376958847046, + "logps/chosen": -177.09951782226562, + "logps/rejected": -195.95318603515625, + "loss": 0.6903, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.06895118206739426, + "rewards/margins": 0.07616007328033447, + "rewards/rejected": -0.007208888418972492, + "step": 2910 + }, + { + "epoch": 0.48, + "learning_rate": 3.1250706977304857e-06, + "logits/chosen": -1.9655840396881104, + "logits/rejected": -1.891181230545044, + "logps/chosen": -268.06060791015625, + "logps/rejected": -286.9012756347656, + "loss": 0.6888, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08773650974035263, + "rewards/margins": 0.12291065603494644, + "rewards/rejected": -0.03517414256930351, + "step": 2920 + }, + { + "epoch": 0.48, + "learning_rate": 3.1112366790799236e-06, + "logits/chosen": -2.0948500633239746, + "logits/rejected": -1.9891760349273682, + "logps/chosen": -271.17474365234375, + "logps/rejected": -251.1312255859375, + "loss": 0.6912, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.07265276461839676, + "rewards/margins": 0.06062566488981247, + "rewards/rejected": 0.012027103453874588, + "step": 2930 + }, + { + "epoch": 0.48, + "learning_rate": 3.0973827250418257e-06, + "logits/chosen": -1.8720979690551758, + "logits/rejected": -1.832418441772461, + "logps/chosen": -236.34414672851562, + "logps/rejected": -209.13442993164062, + "loss": 0.6908, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0848507359623909, + "rewards/margins": 0.0793834701180458, + "rewards/rejected": 0.005467262119054794, + "step": 2940 + }, + { + "epoch": 0.48, + "learning_rate": 3.08350928746071e-06, + "logits/chosen": -2.1480631828308105, + "logits/rejected": -1.6580727100372314, + "logps/chosen": -324.90826416015625, + "logps/rejected": -249.8586883544922, + "loss": 0.6879, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.09290337562561035, + "rewards/margins": 0.09645406156778336, + "rewards/rejected": -0.0035506817512214184, + "step": 2950 + }, + { + "epoch": 0.48, + "learning_rate": 3.0696168188165453e-06, + "logits/chosen": -1.8901182413101196, + "logits/rejected": -1.8035783767700195, + "logps/chosen": -215.082763671875, + "logps/rejected": -243.25906372070312, + "loss": 0.6877, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.08278520405292511, + "rewards/margins": 0.1506078839302063, + "rewards/rejected": -0.06782267242670059, + "step": 2960 + }, + { + "epoch": 0.49, + "learning_rate": 3.0557057722099994e-06, + "logits/chosen": -2.195089340209961, + "logits/rejected": -2.0160412788391113, + "logps/chosen": -240.93399047851562, + "logps/rejected": -253.69100952148438, + "loss": 0.6892, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0755130797624588, + "rewards/margins": 0.10724164545536041, + "rewards/rejected": -0.03172856941819191, + "step": 2970 + }, + { + "epoch": 0.49, + "learning_rate": 3.0417766013476557e-06, + "logits/chosen": -2.1110661029815674, + "logits/rejected": -1.773144006729126, + "logps/chosen": -256.28338623046875, + "logps/rejected": -220.7050323486328, + "loss": 0.6902, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06079406663775444, + "rewards/margins": 0.04874235764145851, + "rewards/rejected": 0.01205170713365078, + "step": 2980 + }, + { + "epoch": 0.49, + "learning_rate": 3.0278297605272158e-06, + "logits/chosen": -1.8296396732330322, + "logits/rejected": -1.7259023189544678, + "logps/chosen": -245.2227325439453, + "logps/rejected": -251.06515502929688, + "loss": 0.6892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.057640980929136276, + "rewards/margins": 0.09454314410686493, + "rewards/rejected": -0.03690217062830925, + "step": 2990 + }, + { + "epoch": 0.49, + "learning_rate": 3.013865704622685e-06, + "logits/chosen": -1.8805160522460938, + "logits/rejected": -1.8036752939224243, + "logps/chosen": -221.59078979492188, + "logps/rejected": -232.763427734375, + "loss": 0.6894, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.045867644250392914, + "rewards/margins": 0.11604690551757812, + "rewards/rejected": -0.07017925381660461, + "step": 3000 + }, + { + "epoch": 0.49, + "learning_rate": 2.9998848890695376e-06, + "logits/chosen": -2.0802533626556396, + "logits/rejected": -1.84062922000885, + "logps/chosen": -273.2664489746094, + "logps/rejected": -262.6578369140625, + "loss": 0.6894, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.06462962180376053, + "rewards/margins": 0.10107561200857162, + "rewards/rejected": -0.0364459827542305, + "step": 3010 + }, + { + "epoch": 0.49, + "learning_rate": 2.985887769849859e-06, + "logits/chosen": -1.9342610836029053, + "logits/rejected": -1.8625415563583374, + "logps/chosen": -285.3690185546875, + "logps/rejected": -295.5030212402344, + "loss": 0.6885, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.08939331024885178, + "rewards/margins": 0.11333982646465302, + "rewards/rejected": -0.023946519941091537, + "step": 3020 + }, + { + "epoch": 0.5, + "learning_rate": 2.9718748034774776e-06, + "logits/chosen": -1.979699730873108, + "logits/rejected": -1.8891689777374268, + "logps/chosen": -216.3461456298828, + "logps/rejected": -270.50518798828125, + "loss": 0.6897, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.07241294533014297, + "rewards/margins": 0.13637953996658325, + "rewards/rejected": -0.06396660953760147, + "step": 3030 + }, + { + "epoch": 0.5, + "learning_rate": 2.9578464469830732e-06, + "logits/chosen": -1.8731129169464111, + "logits/rejected": -1.892673134803772, + "logps/chosen": -244.26089477539062, + "logps/rejected": -304.71527099609375, + "loss": 0.6872, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06775057315826416, + "rewards/margins": 0.1028917208313942, + "rewards/rejected": -0.035141147673130035, + "step": 3040 + }, + { + "epoch": 0.5, + "learning_rate": 2.943803157899272e-06, + "logits/chosen": -2.2134296894073486, + "logits/rejected": -2.092992067337036, + "logps/chosen": -199.4912872314453, + "logps/rejected": -188.5738067626953, + "loss": 0.6913, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04073139652609825, + "rewards/margins": 0.04918772354722023, + "rewards/rejected": -0.00845632515847683, + "step": 3050 + }, + { + "epoch": 0.5, + "learning_rate": 2.929745394245725e-06, + "logits/chosen": -1.996862769126892, + "logits/rejected": -2.081085443496704, + "logps/chosen": -270.2159729003906, + "logps/rejected": -270.0609130859375, + "loss": 0.6921, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.003329369006678462, + "rewards/margins": 0.07538504898548126, + "rewards/rejected": -0.07205567508935928, + "step": 3060 + }, + { + "epoch": 0.5, + "learning_rate": 2.915673614514169e-06, + "logits/chosen": -1.8620731830596924, + "logits/rejected": -1.8943582773208618, + "logps/chosen": -261.0270080566406, + "logps/rejected": -312.7596435546875, + "loss": 0.691, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03929787129163742, + "rewards/margins": 0.13370563089847565, + "rewards/rejected": -0.09440775960683823, + "step": 3070 + }, + { + "epoch": 0.5, + "learning_rate": 2.901588277653472e-06, + "logits/chosen": -2.1425204277038574, + "logits/rejected": -1.667665719985962, + "logps/chosen": -245.58761596679688, + "logps/rejected": -233.13232421875, + "loss": 0.6907, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.041470836848020554, + "rewards/margins": 0.07364167273044586, + "rewards/rejected": -0.03217083588242531, + "step": 3080 + }, + { + "epoch": 0.51, + "learning_rate": 2.8874898430546654e-06, + "logits/chosen": -2.089482307434082, + "logits/rejected": -1.947522759437561, + "logps/chosen": -218.27426147460938, + "logps/rejected": -206.7089385986328, + "loss": 0.6882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.040116842836141586, + "rewards/margins": 0.10671831667423248, + "rewards/rejected": -0.06660146266222, + "step": 3090 + }, + { + "epoch": 0.51, + "learning_rate": 2.873378770535964e-06, + "logits/chosen": -2.1854147911071777, + "logits/rejected": -1.729306936264038, + "logps/chosen": -310.63330078125, + "logps/rejected": -251.4937744140625, + "loss": 0.6888, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02613145485520363, + "rewards/margins": 0.08506246656179428, + "rewards/rejected": -0.05893101543188095, + "step": 3100 + }, + { + "epoch": 0.51, + "learning_rate": 2.859255520327762e-06, + "logits/chosen": -1.9528080224990845, + "logits/rejected": -1.8293895721435547, + "logps/chosen": -233.9651641845703, + "logps/rejected": -266.45770263671875, + "loss": 0.6896, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04076070338487625, + "rewards/margins": 0.08312708139419556, + "rewards/rejected": -0.042366381734609604, + "step": 3110 + }, + { + "epoch": 0.51, + "learning_rate": 2.8451205530576313e-06, + "logits/chosen": -1.9040266275405884, + "logits/rejected": -1.8335864543914795, + "logps/chosen": -219.0716552734375, + "logps/rejected": -207.28292846679688, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03474966436624527, + "rewards/margins": 0.10749310255050659, + "rewards/rejected": -0.07274345308542252, + "step": 3120 + }, + { + "epoch": 0.51, + "learning_rate": 2.8309743297352915e-06, + "logits/chosen": -1.8073034286499023, + "logits/rejected": -1.7719571590423584, + "logps/chosen": -238.9850311279297, + "logps/rejected": -239.19107055664062, + "loss": 0.688, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04055961221456528, + "rewards/margins": 0.11201056092977524, + "rewards/rejected": -0.07145096361637115, + "step": 3130 + }, + { + "epoch": 0.51, + "learning_rate": 2.8168173117375765e-06, + "logits/chosen": -2.089149236679077, + "logits/rejected": -1.8613367080688477, + "logps/chosen": -194.557861328125, + "logps/rejected": -193.1913604736328, + "loss": 0.6903, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.02647765912115574, + "rewards/margins": 0.05180682614445686, + "rewards/rejected": -0.025329168885946274, + "step": 3140 + }, + { + "epoch": 0.52, + "learning_rate": 2.802649960793388e-06, + "logits/chosen": -1.8931782245635986, + "logits/rejected": -1.9736407995224, + "logps/chosen": -253.19540405273438, + "logps/rejected": -316.647216796875, + "loss": 0.6898, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0508209690451622, + "rewards/margins": 0.08829973638057709, + "rewards/rejected": -0.03747876361012459, + "step": 3150 + }, + { + "epoch": 0.52, + "learning_rate": 2.788472738968634e-06, + "logits/chosen": -1.96945321559906, + "logits/rejected": -1.520216703414917, + "logps/chosen": -262.4095764160156, + "logps/rejected": -203.01177978515625, + "loss": 0.692, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04128976911306381, + "rewards/margins": 0.045503370463848114, + "rewards/rejected": -0.004213607404381037, + "step": 3160 + }, + { + "epoch": 0.52, + "learning_rate": 2.7742861086511602e-06, + "logits/chosen": -1.9047476053237915, + "logits/rejected": -1.6291382312774658, + "logps/chosen": -214.72885131835938, + "logps/rejected": -167.4122772216797, + "loss": 0.692, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.029269644990563393, + "rewards/margins": 0.03847484663128853, + "rewards/rejected": -0.009205199778079987, + "step": 3170 + }, + { + "epoch": 0.52, + "learning_rate": 2.760090532535669e-06, + "logits/chosen": -2.1130504608154297, + "logits/rejected": -1.7649333477020264, + "logps/chosen": -345.9830017089844, + "logps/rejected": -302.0968322753906, + "loss": 0.6909, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03296752646565437, + "rewards/margins": 0.08277735859155655, + "rewards/rejected": -0.04980982840061188, + "step": 3180 + }, + { + "epoch": 0.52, + "learning_rate": 2.74588647360863e-06, + "logits/chosen": -2.063931941986084, + "logits/rejected": -1.946104645729065, + "logps/chosen": -213.52294921875, + "logps/rejected": -248.04019165039062, + "loss": 0.6888, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.007502102758735418, + "rewards/margins": 0.1336408257484436, + "rewards/rejected": -0.12613873183727264, + "step": 3190 + }, + { + "epoch": 0.52, + "learning_rate": 2.7316743951331735e-06, + "logits/chosen": -2.043703079223633, + "logits/rejected": -1.6404660940170288, + "logps/chosen": -271.6043701171875, + "logps/rejected": -190.3723602294922, + "loss": 0.6915, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.001568490406498313, + "rewards/margins": 0.06983982026576996, + "rewards/rejected": -0.07140831649303436, + "step": 3200 + }, + { + "epoch": 0.53, + "learning_rate": 2.717454760633992e-06, + "logits/chosen": -2.0612053871154785, + "logits/rejected": -1.93795645236969, + "logps/chosen": -275.68682861328125, + "logps/rejected": -256.82586669921875, + "loss": 0.6889, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0507802739739418, + "rewards/margins": 0.12036697566509247, + "rewards/rejected": -0.06958670169115067, + "step": 3210 + }, + { + "epoch": 0.53, + "learning_rate": 2.7032280338822138e-06, + "logits/chosen": -1.832506775856018, + "logits/rejected": -1.6844663619995117, + "logps/chosen": -224.65805053710938, + "logps/rejected": -190.48744201660156, + "loss": 0.6906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.003930827137082815, + "rewards/margins": 0.08170115947723389, + "rewards/rejected": -0.07777033746242523, + "step": 3220 + }, + { + "epoch": 0.53, + "learning_rate": 2.6889946788802777e-06, + "logits/chosen": -1.993975043296814, + "logits/rejected": -1.9381256103515625, + "logps/chosen": -276.79351806640625, + "logps/rejected": -273.7513427734375, + "loss": 0.6898, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.025919729843735695, + "rewards/margins": 0.10560141503810883, + "rewards/rejected": -0.07968167960643768, + "step": 3230 + }, + { + "epoch": 0.53, + "learning_rate": 2.6747551598468062e-06, + "logits/chosen": -2.0369162559509277, + "logits/rejected": -1.806553602218628, + "logps/chosen": -268.41339111328125, + "logps/rejected": -256.11627197265625, + "loss": 0.6886, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.03422645479440689, + "rewards/margins": 0.13793575763702393, + "rewards/rejected": -0.10370929539203644, + "step": 3240 + }, + { + "epoch": 0.53, + "learning_rate": 2.6605099412014597e-06, + "logits/chosen": -1.9734976291656494, + "logits/rejected": -1.9299424886703491, + "logps/chosen": -208.2463836669922, + "logps/rejected": -215.38912963867188, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003633850021287799, + "rewards/margins": 0.06675741821527481, + "rewards/rejected": -0.07039127498865128, + "step": 3250 + }, + { + "epoch": 0.53, + "learning_rate": 2.6462594875497854e-06, + "logits/chosen": -1.819993257522583, + "logits/rejected": -1.816554307937622, + "logps/chosen": -244.3020477294922, + "logps/rejected": -256.3065185546875, + "loss": 0.6863, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0006682433886453509, + "rewards/margins": 0.12039496749639511, + "rewards/rejected": -0.11972671747207642, + "step": 3260 + }, + { + "epoch": 0.53, + "learning_rate": 2.632004263668077e-06, + "logits/chosen": -2.026594638824463, + "logits/rejected": -1.7842613458633423, + "logps/chosen": -219.2827606201172, + "logps/rejected": -229.45346069335938, + "loss": 0.6852, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01553265005350113, + "rewards/margins": 0.16587623953819275, + "rewards/rejected": -0.15034359693527222, + "step": 3270 + }, + { + "epoch": 0.54, + "learning_rate": 2.617744734488204e-06, + "logits/chosen": -1.8513953685760498, + "logits/rejected": -1.8250501155853271, + "logps/chosen": -180.07769775390625, + "logps/rejected": -197.42684936523438, + "loss": 0.6907, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.007306984160095453, + "rewards/margins": 0.06650448590517044, + "rewards/rejected": -0.05919749662280083, + "step": 3280 + }, + { + "epoch": 0.54, + "learning_rate": 2.603481365082453e-06, + "logits/chosen": -1.9926726818084717, + "logits/rejected": -1.7971982955932617, + "logps/chosen": -259.9145812988281, + "logps/rejected": -258.8713684082031, + "loss": 0.6912, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0009309362503699958, + "rewards/margins": 0.1266021430492401, + "rewards/rejected": -0.12753307819366455, + "step": 3290 + }, + { + "epoch": 0.54, + "learning_rate": 2.5892146206483595e-06, + "logits/chosen": -1.8069559335708618, + "logits/rejected": -1.9334291219711304, + "logps/chosen": -223.09619140625, + "logps/rejected": -259.0249938964844, + "loss": 0.6867, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.008823653683066368, + "rewards/margins": 0.13950976729393005, + "rewards/rejected": -0.14833341538906097, + "step": 3300 + }, + { + "epoch": 0.54, + "learning_rate": 2.5749449664935356e-06, + "logits/chosen": -1.700807809829712, + "logits/rejected": -1.6444162130355835, + "logps/chosen": -290.1138000488281, + "logps/rejected": -273.6895446777344, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.018889624625444412, + "rewards/margins": 0.08735049515962601, + "rewards/rejected": -0.0684608668088913, + "step": 3310 + }, + { + "epoch": 0.54, + "learning_rate": 2.560672868020493e-06, + "logits/chosen": -1.955906629562378, + "logits/rejected": -1.8659934997558594, + "logps/chosen": -299.176513671875, + "logps/rejected": -297.19573974609375, + "loss": 0.6908, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.019999492913484573, + "rewards/margins": 0.09212388843297958, + "rewards/rejected": -0.07212439924478531, + "step": 3320 + }, + { + "epoch": 0.54, + "learning_rate": 2.5463987907114623e-06, + "logits/chosen": -2.0190176963806152, + "logits/rejected": -1.6607792377471924, + "logps/chosen": -294.9454650878906, + "logps/rejected": -277.91064453125, + "loss": 0.6882, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.014214855618774891, + "rewards/margins": 0.11215618997812271, + "rewards/rejected": -0.0979413315653801, + "step": 3330 + }, + { + "epoch": 0.55, + "learning_rate": 2.5321232001132184e-06, + "logits/chosen": -2.068220853805542, + "logits/rejected": -1.6832239627838135, + "logps/chosen": -232.2666015625, + "logps/rejected": -195.67013549804688, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03616483509540558, + "rewards/margins": 0.09798556566238403, + "rewards/rejected": -0.06182073429226875, + "step": 3340 + }, + { + "epoch": 0.55, + "learning_rate": 2.5178465618218863e-06, + "logits/chosen": -2.04811954498291, + "logits/rejected": -1.7602752447128296, + "logps/chosen": -316.37164306640625, + "logps/rejected": -298.69293212890625, + "loss": 0.6902, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.032330166548490524, + "rewards/margins": 0.10765840858221054, + "rewards/rejected": -0.07532824575901031, + "step": 3350 + }, + { + "epoch": 0.55, + "learning_rate": 2.5035693414677664e-06, + "logits/chosen": -2.0388569831848145, + "logits/rejected": -1.834509253501892, + "logps/chosen": -291.74127197265625, + "logps/rejected": -240.7653045654297, + "loss": 0.6919, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.02704227901995182, + "rewards/margins": 0.053462136536836624, + "rewards/rejected": -0.026419857516884804, + "step": 3360 + }, + { + "epoch": 0.55, + "learning_rate": 2.489292004700139e-06, + "logits/chosen": -1.9263681173324585, + "logits/rejected": -1.817287802696228, + "logps/chosen": -252.87167358398438, + "logps/rejected": -310.994384765625, + "loss": 0.6844, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07156406342983246, + "rewards/margins": 0.14183077216148376, + "rewards/rejected": -0.07026670128107071, + "step": 3370 + }, + { + "epoch": 0.55, + "learning_rate": 2.4750150171720834e-06, + "logits/chosen": -2.052155017852783, + "logits/rejected": -1.7053096294403076, + "logps/chosen": -271.7879333496094, + "logps/rejected": -211.4127197265625, + "loss": 0.691, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04199618473649025, + "rewards/margins": 0.09472035616636276, + "rewards/rejected": -0.05272418260574341, + "step": 3380 + }, + { + "epoch": 0.55, + "learning_rate": 2.4607388445252878e-06, + "logits/chosen": -1.8254659175872803, + "logits/rejected": -1.9821802377700806, + "logps/chosen": -190.3391876220703, + "logps/rejected": -217.31777954101562, + "loss": 0.6894, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.07746318727731705, + "rewards/margins": 0.11010380834341049, + "rewards/rejected": -0.03264062479138374, + "step": 3390 + }, + { + "epoch": 0.56, + "learning_rate": 2.4464639523748636e-06, + "logits/chosen": -1.9187381267547607, + "logits/rejected": -1.9416077136993408, + "logps/chosen": -241.7182159423828, + "logps/rejected": -264.7229309082031, + "loss": 0.6875, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.06877437978982925, + "rewards/margins": 0.12464741617441177, + "rewards/rejected": -0.055873043835163116, + "step": 3400 + }, + { + "epoch": 0.56, + "learning_rate": 2.4321908062941585e-06, + "logits/chosen": -1.9229618310928345, + "logits/rejected": -2.0184249877929688, + "logps/chosen": -243.48257446289062, + "logps/rejected": -250.8048858642578, + "loss": 0.6901, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.051273830235004425, + "rewards/margins": 0.08538023382425308, + "rewards/rejected": -0.03410639613866806, + "step": 3410 + }, + { + "epoch": 0.56, + "learning_rate": 2.4179198717995724e-06, + "logits/chosen": -2.0954782962799072, + "logits/rejected": -1.89650559425354, + "logps/chosen": -252.18533325195312, + "logps/rejected": -247.2293701171875, + "loss": 0.6897, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.060106076300144196, + "rewards/margins": 0.08642920106649399, + "rewards/rejected": -0.02632312849164009, + "step": 3420 + }, + { + "epoch": 0.56, + "learning_rate": 2.4036516143353755e-06, + "logits/chosen": -1.7370727062225342, + "logits/rejected": -1.755359411239624, + "logps/chosen": -216.1916961669922, + "logps/rejected": -259.7597351074219, + "loss": 0.6915, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.06400546431541443, + "rewards/margins": 0.047087766230106354, + "rewards/rejected": 0.016917699947953224, + "step": 3430 + }, + { + "epoch": 0.56, + "learning_rate": 2.3893864992585252e-06, + "logits/chosen": -1.9870882034301758, + "logits/rejected": -1.9042809009552002, + "logps/chosen": -231.1096954345703, + "logps/rejected": -223.4515838623047, + "loss": 0.69, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.08247943967580795, + "rewards/margins": 0.10874517261981964, + "rewards/rejected": -0.026265714317560196, + "step": 3440 + }, + { + "epoch": 0.56, + "learning_rate": 2.375124991823494e-06, + "logits/chosen": -2.002474069595337, + "logits/rejected": -1.6728159189224243, + "logps/chosen": -286.3111877441406, + "logps/rejected": -245.68905639648438, + "loss": 0.6893, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05589104816317558, + "rewards/margins": 0.08703849464654922, + "rewards/rejected": -0.031147439032793045, + "step": 3450 + }, + { + "epoch": 0.57, + "learning_rate": 2.3608675571670888e-06, + "logits/chosen": -1.7617969512939453, + "logits/rejected": -1.731100082397461, + "logps/chosen": -287.35333251953125, + "logps/rejected": -345.6080627441406, + "loss": 0.6894, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.018671926110982895, + "rewards/margins": 0.0912076085805893, + "rewards/rejected": -0.0725356787443161, + "step": 3460 + }, + { + "epoch": 0.57, + "learning_rate": 2.3466146602932813e-06, + "logits/chosen": -2.1233835220336914, + "logits/rejected": -1.8582971096038818, + "logps/chosen": -258.5401306152344, + "logps/rejected": -242.26797485351562, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05651939660310745, + "rewards/margins": 0.08908345550298691, + "rewards/rejected": -0.03256406635046005, + "step": 3470 + }, + { + "epoch": 0.57, + "learning_rate": 2.332366766058051e-06, + "logits/chosen": -1.8025176525115967, + "logits/rejected": -1.6172561645507812, + "logps/chosen": -216.12698364257812, + "logps/rejected": -223.8036651611328, + "loss": 0.6901, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008582862094044685, + "rewards/margins": 0.09050121158361435, + "rewards/rejected": -0.08191834390163422, + "step": 3480 + }, + { + "epoch": 0.57, + "learning_rate": 2.318124339154211e-06, + "logits/chosen": -1.9672091007232666, + "logits/rejected": -1.8547817468643188, + "logps/chosen": -227.04574584960938, + "logps/rejected": -259.188720703125, + "loss": 0.6884, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.021852396428585052, + "rewards/margins": 0.07857809960842133, + "rewards/rejected": -0.05672571808099747, + "step": 3490 + }, + { + "epoch": 0.57, + "learning_rate": 2.3038878440962594e-06, + "logits/chosen": -2.0270333290100098, + "logits/rejected": -1.6864337921142578, + "logps/chosen": -334.8361511230469, + "logps/rejected": -277.0719299316406, + "loss": 0.7038, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0032265130430459976, + "rewards/margins": 0.12994110584259033, + "rewards/rejected": -0.13316760957241058, + "step": 3500 + }, + { + "epoch": 0.57, + "learning_rate": 2.2896577452052312e-06, + "logits/chosen": -1.8724663257598877, + "logits/rejected": -1.7028734683990479, + "logps/chosen": -243.1988067626953, + "logps/rejected": -260.4635314941406, + "loss": 0.6917, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.025984305888414383, + "rewards/margins": 0.06646469235420227, + "rewards/rejected": -0.040480393916368484, + "step": 3510 + }, + { + "epoch": 0.58, + "learning_rate": 2.275434506593549e-06, + "logits/chosen": -1.6671438217163086, + "logits/rejected": -1.9076242446899414, + "logps/chosen": -234.4957275390625, + "logps/rejected": -273.50421142578125, + "loss": 0.6888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.026661807671189308, + "rewards/margins": 0.0871274322271347, + "rewards/rejected": -0.060465626418590546, + "step": 3520 + }, + { + "epoch": 0.58, + "learning_rate": 2.2612185921498864e-06, + "logits/chosen": -1.896277666091919, + "logits/rejected": -1.798769235610962, + "logps/chosen": -222.9064483642578, + "logps/rejected": -209.21127319335938, + "loss": 0.6884, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03127544745802879, + "rewards/margins": 0.10691280663013458, + "rewards/rejected": -0.07563735544681549, + "step": 3530 + }, + { + "epoch": 0.58, + "learning_rate": 2.247010465524046e-06, + "logits/chosen": -1.97471022605896, + "logits/rejected": -1.7489280700683594, + "logps/chosen": -282.816650390625, + "logps/rejected": -235.22640991210938, + "loss": 0.6896, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.039089880883693695, + "rewards/margins": 0.11432500183582306, + "rewards/rejected": -0.07523511350154877, + "step": 3540 + }, + { + "epoch": 0.58, + "learning_rate": 2.232810590111827e-06, + "logits/chosen": -1.8925158977508545, + "logits/rejected": -1.7392536401748657, + "logps/chosen": -226.97842407226562, + "logps/rejected": -245.08944702148438, + "loss": 0.6926, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03177841752767563, + "rewards/margins": 0.058675311505794525, + "rewards/rejected": -0.026896893978118896, + "step": 3550 + }, + { + "epoch": 0.58, + "learning_rate": 2.2186194290399176e-06, + "logits/chosen": -1.9743719100952148, + "logits/rejected": -1.8463367223739624, + "logps/chosen": -241.1482391357422, + "logps/rejected": -253.6489715576172, + "loss": 0.6888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02476176992058754, + "rewards/margins": 0.09164849668741226, + "rewards/rejected": -0.06688673794269562, + "step": 3560 + }, + { + "epoch": 0.58, + "learning_rate": 2.204437445150791e-06, + "logits/chosen": -2.1271920204162598, + "logits/rejected": -2.0711147785186768, + "logps/chosen": -180.4447021484375, + "logps/rejected": -188.18809509277344, + "loss": 0.6878, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.005936390720307827, + "rewards/margins": 0.09173784404993057, + "rewards/rejected": -0.08580145239830017, + "step": 3570 + }, + { + "epoch": 0.59, + "learning_rate": 2.1902651009876074e-06, + "logits/chosen": -2.038252353668213, + "logits/rejected": -1.747252106666565, + "logps/chosen": -308.51385498046875, + "logps/rejected": -289.5480041503906, + "loss": 0.6887, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.017800649628043175, + "rewards/margins": 0.08497454971075058, + "rewards/rejected": -0.06717389822006226, + "step": 3580 + }, + { + "epoch": 0.59, + "learning_rate": 2.176102858779126e-06, + "logits/chosen": -1.9265037775039673, + "logits/rejected": -1.726426362991333, + "logps/chosen": -300.06207275390625, + "logps/rejected": -297.90643310546875, + "loss": 0.6895, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.016286805272102356, + "rewards/margins": 0.05805863067507744, + "rewards/rejected": -0.041771821677684784, + "step": 3590 + }, + { + "epoch": 0.59, + "learning_rate": 2.1619511804246373e-06, + "logits/chosen": -1.615669846534729, + "logits/rejected": -1.7264668941497803, + "logps/chosen": -212.6259002685547, + "logps/rejected": -263.0503845214844, + "loss": 0.6907, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02405395917594433, + "rewards/margins": 0.0971323773264885, + "rewards/rejected": -0.07307841628789902, + "step": 3600 + }, + { + "epoch": 0.59, + "learning_rate": 2.1478105274788893e-06, + "logits/chosen": -1.953674554824829, + "logits/rejected": -1.8604261875152588, + "logps/chosen": -268.23712158203125, + "logps/rejected": -272.2950439453125, + "loss": 0.6888, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01134217344224453, + "rewards/margins": 0.08840439468622208, + "rewards/rejected": -0.0770622193813324, + "step": 3610 + }, + { + "epoch": 0.59, + "learning_rate": 2.1336813611370407e-06, + "logits/chosen": -2.106854200363159, + "logits/rejected": -1.7753673791885376, + "logps/chosen": -267.7677917480469, + "logps/rejected": -244.2574005126953, + "loss": 0.6904, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0017211989033967257, + "rewards/margins": 0.07512366026639938, + "rewards/rejected": -0.07684485614299774, + "step": 3620 + }, + { + "epoch": 0.59, + "learning_rate": 2.119564142219614e-06, + "logits/chosen": -2.035209894180298, + "logits/rejected": -1.9218047857284546, + "logps/chosen": -294.58441162109375, + "logps/rejected": -279.90631103515625, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03484421595931053, + "rewards/margins": 0.08571028709411621, + "rewards/rejected": -0.05086606740951538, + "step": 3630 + }, + { + "epoch": 0.6, + "learning_rate": 2.10545933115747e-06, + "logits/chosen": -1.982475996017456, + "logits/rejected": -1.8726341724395752, + "logps/chosen": -204.7448272705078, + "logps/rejected": -245.2632598876953, + "loss": 0.6863, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0300886332988739, + "rewards/margins": 0.15390489995479584, + "rewards/rejected": -0.12381626665592194, + "step": 3640 + }, + { + "epoch": 0.6, + "learning_rate": 2.0913673879767895e-06, + "logits/chosen": -1.9688513278961182, + "logits/rejected": -1.8036651611328125, + "logps/chosen": -211.7208709716797, + "logps/rejected": -198.80421447753906, + "loss": 0.6892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.040870435535907745, + "rewards/margins": 0.08654484897851944, + "rewards/rejected": -0.04567441716790199, + "step": 3650 + }, + { + "epoch": 0.6, + "learning_rate": 2.077288772284068e-06, + "logits/chosen": -1.9703762531280518, + "logits/rejected": -1.8456685543060303, + "logps/chosen": -267.1191711425781, + "logps/rejected": -264.4983825683594, + "loss": 0.6884, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007728018797934055, + "rewards/margins": 0.10859616845846176, + "rewards/rejected": -0.10086814314126968, + "step": 3660 + }, + { + "epoch": 0.6, + "learning_rate": 2.0632239432511276e-06, + "logits/chosen": -2.1699681282043457, + "logits/rejected": -1.8136274814605713, + "logps/chosen": -348.83123779296875, + "logps/rejected": -247.5390625, + "loss": 0.6909, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.014403874054551125, + "rewards/margins": 0.08090604841709137, + "rewards/rejected": -0.0665021687746048, + "step": 3670 + }, + { + "epoch": 0.6, + "learning_rate": 2.049173359600142e-06, + "logits/chosen": -1.9132258892059326, + "logits/rejected": -1.6752735376358032, + "logps/chosen": -273.9823303222656, + "logps/rejected": -246.5084991455078, + "loss": 0.6905, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02720949985086918, + "rewards/margins": 0.10625307261943817, + "rewards/rejected": -0.07904357463121414, + "step": 3680 + }, + { + "epoch": 0.6, + "learning_rate": 2.035137479588672e-06, + "logits/chosen": -2.0918102264404297, + "logits/rejected": -1.7736587524414062, + "logps/chosen": -283.82269287109375, + "logps/rejected": -291.10162353515625, + "loss": 0.6856, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.03867720440030098, + "rewards/margins": 0.15277202427387238, + "rewards/rejected": -0.1140948161482811, + "step": 3690 + }, + { + "epoch": 0.61, + "learning_rate": 2.0211167609947227e-06, + "logits/chosen": -1.8878008127212524, + "logits/rejected": -1.7873342037200928, + "logps/chosen": -318.696533203125, + "logps/rejected": -280.882080078125, + "loss": 0.6873, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.027534520253539085, + "rewards/margins": 0.16803458333015442, + "rewards/rejected": -0.1405000537633896, + "step": 3700 + }, + { + "epoch": 0.61, + "learning_rate": 2.007111661101812e-06, + "logits/chosen": -1.9294395446777344, + "logits/rejected": -2.0270745754241943, + "logps/chosen": -202.47837829589844, + "logps/rejected": -268.4256591796875, + "loss": 0.6917, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.025194451212882996, + "rewards/margins": 0.046659648418426514, + "rewards/rejected": -0.021465197205543518, + "step": 3710 + }, + { + "epoch": 0.61, + "learning_rate": 1.9931226366840557e-06, + "logits/chosen": -2.163055896759033, + "logits/rejected": -1.9630800485610962, + "logps/chosen": -286.60919189453125, + "logps/rejected": -298.59735107421875, + "loss": 0.688, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.006681902799755335, + "rewards/margins": 0.08237860351800919, + "rewards/rejected": -0.07569669187068939, + "step": 3720 + }, + { + "epoch": 0.61, + "learning_rate": 1.979150143991271e-06, + "logits/chosen": -1.8792731761932373, + "logits/rejected": -1.8855215311050415, + "logps/chosen": -196.13365173339844, + "logps/rejected": -190.86868286132812, + "loss": 0.6907, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014629209414124489, + "rewards/margins": 0.09507983177900314, + "rewards/rejected": -0.0804506242275238, + "step": 3730 + }, + { + "epoch": 0.61, + "learning_rate": 1.965194638734095e-06, + "logits/chosen": -1.9320497512817383, + "logits/rejected": -1.8574485778808594, + "logps/chosen": -295.15130615234375, + "logps/rejected": -293.37725830078125, + "loss": 0.6912, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03767026588320732, + "rewards/margins": 0.10137882083654404, + "rewards/rejected": -0.06370856612920761, + "step": 3740 + }, + { + "epoch": 0.61, + "learning_rate": 1.9512565760691237e-06, + "logits/chosen": -1.7468655109405518, + "logits/rejected": -1.5838582515716553, + "logps/chosen": -254.8179168701172, + "logps/rejected": -227.5720977783203, + "loss": 0.6887, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.009489252232015133, + "rewards/margins": 0.08286987245082855, + "rewards/rejected": -0.07338062673807144, + "step": 3750 + }, + { + "epoch": 0.62, + "learning_rate": 1.9373364105840637e-06, + "logits/chosen": -1.8656480312347412, + "logits/rejected": -1.699899673461914, + "logps/chosen": -280.9601135253906, + "logps/rejected": -224.30410766601562, + "loss": 0.6888, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0034142795484513044, + "rewards/margins": 0.12753145396709442, + "rewards/rejected": -0.1241171583533287, + "step": 3760 + }, + { + "epoch": 0.62, + "learning_rate": 1.9234345962829096e-06, + "logits/chosen": -1.955436110496521, + "logits/rejected": -1.9946479797363281, + "logps/chosen": -197.0948944091797, + "logps/rejected": -212.3218231201172, + "loss": 0.6907, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.022992201149463654, + "rewards/margins": 0.06577096879482269, + "rewards/rejected": -0.04277877137064934, + "step": 3770 + }, + { + "epoch": 0.62, + "learning_rate": 1.9095515865711334e-06, + "logits/chosen": -2.0003161430358887, + "logits/rejected": -1.9256742000579834, + "logps/chosen": -282.3370666503906, + "logps/rejected": -265.92340087890625, + "loss": 0.6881, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03596245497465134, + "rewards/margins": 0.0868336409330368, + "rewards/rejected": -0.05087118595838547, + "step": 3780 + }, + { + "epoch": 0.62, + "learning_rate": 1.8956878342409002e-06, + "logits/chosen": -1.7658329010009766, + "logits/rejected": -1.6050666570663452, + "logps/chosen": -235.8589324951172, + "logps/rejected": -290.2850646972656, + "loss": 0.6947, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.010485892184078693, + "rewards/margins": 0.10974961519241333, + "rewards/rejected": -0.09926371276378632, + "step": 3790 + }, + { + "epoch": 0.62, + "learning_rate": 1.8818437914562976e-06, + "logits/chosen": -2.1148884296417236, + "logits/rejected": -2.0245347023010254, + "logps/chosen": -237.17919921875, + "logps/rejected": -244.34423828125, + "loss": 0.6907, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.015136534348130226, + "rewards/margins": 0.10181474685668945, + "rewards/rejected": -0.08667820692062378, + "step": 3800 + }, + { + "epoch": 0.62, + "learning_rate": 1.868019909738589e-06, + "logits/chosen": -1.9636512994766235, + "logits/rejected": -1.793367624282837, + "logps/chosen": -253.23764038085938, + "logps/rejected": -272.4161682128906, + "loss": 0.687, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05569325014948845, + "rewards/margins": 0.12819358706474304, + "rewards/rejected": -0.0725003331899643, + "step": 3810 + }, + { + "epoch": 0.62, + "learning_rate": 1.8542166399514893e-06, + "logits/chosen": -1.7474151849746704, + "logits/rejected": -1.9631903171539307, + "logps/chosen": -228.8423614501953, + "logps/rejected": -264.17047119140625, + "loss": 0.6913, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.015829307958483696, + "rewards/margins": 0.05424109101295471, + "rewards/rejected": -0.03841177746653557, + "step": 3820 + }, + { + "epoch": 0.63, + "learning_rate": 1.8404344322864578e-06, + "logits/chosen": -2.0446016788482666, + "logits/rejected": -1.8142505884170532, + "logps/chosen": -212.7781524658203, + "logps/rejected": -236.95425415039062, + "loss": 0.6884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05546920746564865, + "rewards/margins": 0.11937643587589264, + "rewards/rejected": -0.06390722841024399, + "step": 3830 + }, + { + "epoch": 0.63, + "learning_rate": 1.8266737362480147e-06, + "logits/chosen": -1.9768800735473633, + "logits/rejected": -1.6116721630096436, + "logps/chosen": -286.94097900390625, + "logps/rejected": -227.5266571044922, + "loss": 0.6914, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.005762220825999975, + "rewards/margins": 0.03792198747396469, + "rewards/rejected": -0.03215976804494858, + "step": 3840 + }, + { + "epoch": 0.63, + "learning_rate": 1.8129350006390848e-06, + "logits/chosen": -1.774665117263794, + "logits/rejected": -1.8227713108062744, + "logps/chosen": -199.89584350585938, + "logps/rejected": -285.16326904296875, + "loss": 0.6872, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.06966588646173477, + "rewards/margins": 0.14950847625732422, + "rewards/rejected": -0.07984259724617004, + "step": 3850 + }, + { + "epoch": 0.63, + "learning_rate": 1.799218673546354e-06, + "logits/chosen": -1.9744856357574463, + "logits/rejected": -2.1060657501220703, + "logps/chosen": -209.1972198486328, + "logps/rejected": -230.60897827148438, + "loss": 0.689, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04481729492545128, + "rewards/margins": 0.10723283141851425, + "rewards/rejected": -0.062415529042482376, + "step": 3860 + }, + { + "epoch": 0.63, + "learning_rate": 1.7855252023256596e-06, + "logits/chosen": -1.8472774028778076, + "logits/rejected": -1.9882090091705322, + "logps/chosen": -276.21746826171875, + "logps/rejected": -317.9681396484375, + "loss": 0.6901, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0013205833965912461, + "rewards/margins": 0.08621053397655487, + "rewards/rejected": -0.08753112703561783, + "step": 3870 + }, + { + "epoch": 0.63, + "learning_rate": 1.7718550335873985e-06, + "logits/chosen": -1.8858211040496826, + "logits/rejected": -1.6977182626724243, + "logps/chosen": -247.96212768554688, + "logps/rejected": -312.2625732421875, + "loss": 0.6902, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04083698242902756, + "rewards/margins": 0.09789662063121796, + "rewards/rejected": -0.0570596344769001, + "step": 3880 + }, + { + "epoch": 0.64, + "learning_rate": 1.7582086131819587e-06, + "logits/chosen": -2.024104595184326, + "logits/rejected": -1.9461679458618164, + "logps/chosen": -280.4755554199219, + "logps/rejected": -276.5445556640625, + "loss": 0.6892, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0638902336359024, + "rewards/margins": 0.12354433536529541, + "rewards/rejected": -0.0596541091799736, + "step": 3890 + }, + { + "epoch": 0.64, + "learning_rate": 1.7445863861851844e-06, + "logits/chosen": -2.025374412536621, + "logits/rejected": -1.9536526203155518, + "logps/chosen": -226.6920623779297, + "logps/rejected": -251.90682983398438, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08101359754800797, + "rewards/margins": 0.08197014778852463, + "rewards/rejected": -0.0009565517539158463, + "step": 3900 + }, + { + "epoch": 0.64, + "learning_rate": 1.7309887968838508e-06, + "logits/chosen": -1.7727241516113281, + "logits/rejected": -1.9834120273590088, + "logps/chosen": -166.94589233398438, + "logps/rejected": -217.8294677734375, + "loss": 0.6924, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03976789116859436, + "rewards/margins": 0.080923892557621, + "rewards/rejected": -0.04115601256489754, + "step": 3910 + }, + { + "epoch": 0.64, + "learning_rate": 1.71741628876118e-06, + "logits/chosen": -2.10400652885437, + "logits/rejected": -2.028578758239746, + "logps/chosen": -282.19219970703125, + "logps/rejected": -281.91693115234375, + "loss": 0.6898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.06409434974193573, + "rewards/margins": 0.09701544791460037, + "rewards/rejected": -0.032921094447374344, + "step": 3920 + }, + { + "epoch": 0.64, + "learning_rate": 1.7038693044823784e-06, + "logits/chosen": -2.0744760036468506, + "logits/rejected": -1.735467553138733, + "logps/chosen": -253.09121704101562, + "logps/rejected": -245.7253875732422, + "loss": 0.6888, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.07376577705144882, + "rewards/margins": 0.11053715646266937, + "rewards/rejected": -0.03677137568593025, + "step": 3930 + }, + { + "epoch": 0.64, + "learning_rate": 1.6903482858801927e-06, + "logits/chosen": -1.9665076732635498, + "logits/rejected": -1.8858578205108643, + "logps/chosen": -237.39633178710938, + "logps/rejected": -267.989013671875, + "loss": 0.6941, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.05215789005160332, + "rewards/margins": 0.004937427584081888, + "rewards/rejected": 0.04722046107053757, + "step": 3940 + }, + { + "epoch": 0.65, + "learning_rate": 1.6768536739405028e-06, + "logits/chosen": -1.9320926666259766, + "logits/rejected": -1.9269946813583374, + "logps/chosen": -198.1126708984375, + "logps/rejected": -235.67037963867188, + "loss": 0.6903, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09189413487911224, + "rewards/margins": 0.08291897922754288, + "rewards/rejected": 0.00897514820098877, + "step": 3950 + }, + { + "epoch": 0.65, + "learning_rate": 1.6633859087879439e-06, + "logits/chosen": -2.0057902336120605, + "logits/rejected": -1.639290452003479, + "logps/chosen": -257.0266418457031, + "logps/rejected": -235.34738159179688, + "loss": 0.6889, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05043790861964226, + "rewards/margins": 0.08079390227794647, + "rewards/rejected": -0.030355989933013916, + "step": 3960 + }, + { + "epoch": 0.65, + "learning_rate": 1.6499454296715436e-06, + "logits/chosen": -1.8263343572616577, + "logits/rejected": -1.681847333908081, + "logps/chosen": -168.26449584960938, + "logps/rejected": -178.63772583007812, + "loss": 0.6897, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.06204873323440552, + "rewards/margins": 0.0909951850771904, + "rewards/rejected": -0.028946470469236374, + "step": 3970 + }, + { + "epoch": 0.65, + "learning_rate": 1.636532674950399e-06, + "logits/chosen": -1.8303172588348389, + "logits/rejected": -1.884616494178772, + "logps/chosen": -244.7496337890625, + "logps/rejected": -301.1932067871094, + "loss": 0.6906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03978592902421951, + "rewards/margins": 0.08861847966909409, + "rewards/rejected": -0.04883255064487457, + "step": 3980 + }, + { + "epoch": 0.65, + "learning_rate": 1.623148082079385e-06, + "logits/chosen": -2.0112624168395996, + "logits/rejected": -2.0004377365112305, + "logps/chosen": -224.08560180664062, + "logps/rejected": -213.24072265625, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03501930460333824, + "rewards/margins": 0.10594482719898224, + "rewards/rejected": -0.0709255188703537, + "step": 3990 + }, + { + "epoch": 0.65, + "learning_rate": 1.6097920875948789e-06, + "logits/chosen": -1.6724388599395752, + "logits/rejected": -1.6080735921859741, + "logps/chosen": -258.5229797363281, + "logps/rejected": -285.4081115722656, + "loss": 0.6856, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05480584502220154, + "rewards/margins": 0.1415126919746399, + "rewards/rejected": -0.08670683950185776, + "step": 4000 + }, + { + "epoch": 0.66, + "learning_rate": 1.5964651271005243e-06, + "logits/chosen": -2.108524799346924, + "logits/rejected": -1.9123685359954834, + "logps/chosen": -198.67739868164062, + "logps/rejected": -197.94039916992188, + "loss": 0.69, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05730252340435982, + "rewards/margins": 0.0865309089422226, + "rewards/rejected": -0.02922838367521763, + "step": 4010 + }, + { + "epoch": 0.66, + "learning_rate": 1.5831676352530311e-06, + "logits/chosen": -1.8528759479522705, + "logits/rejected": -1.8431529998779297, + "logps/chosen": -269.096923828125, + "logps/rejected": -311.3492431640625, + "loss": 0.6909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01895049959421158, + "rewards/margins": 0.08751226961612701, + "rewards/rejected": -0.06856177002191544, + "step": 4020 + }, + { + "epoch": 0.66, + "learning_rate": 1.5699000457479896e-06, + "logits/chosen": -1.9469417333602905, + "logits/rejected": -1.7575393915176392, + "logps/chosen": -260.447998046875, + "logps/rejected": -256.695068359375, + "loss": 0.6861, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05366157740354538, + "rewards/margins": 0.12260446697473526, + "rewards/rejected": -0.06894288957118988, + "step": 4030 + }, + { + "epoch": 0.66, + "learning_rate": 1.556662791305732e-06, + "logits/chosen": -1.9396159648895264, + "logits/rejected": -1.7526096105575562, + "logps/chosen": -251.2663116455078, + "logps/rejected": -254.6268768310547, + "loss": 0.6854, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.06592310965061188, + "rewards/margins": 0.1591605246067047, + "rewards/rejected": -0.09323740750551224, + "step": 4040 + }, + { + "epoch": 0.66, + "learning_rate": 1.5434563036572164e-06, + "logits/chosen": -1.8305898904800415, + "logits/rejected": -1.6222941875457764, + "logps/chosen": -253.9580841064453, + "logps/rejected": -242.6853790283203, + "loss": 0.6911, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03931085765361786, + "rewards/margins": 0.057317234575748444, + "rewards/rejected": -0.018006378784775734, + "step": 4050 + }, + { + "epoch": 0.66, + "learning_rate": 1.5302810135299464e-06, + "logits/chosen": -1.8167709112167358, + "logits/rejected": -1.8124256134033203, + "logps/chosen": -251.0693359375, + "logps/rejected": -275.0294494628906, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.053562603890895844, + "rewards/margins": 0.0707794800400734, + "rewards/rejected": -0.017216864973306656, + "step": 4060 + }, + { + "epoch": 0.67, + "learning_rate": 1.5171373506339254e-06, + "logits/chosen": -1.907436728477478, + "logits/rejected": -1.6711196899414062, + "logps/chosen": -211.8785858154297, + "logps/rejected": -176.47817993164062, + "loss": 0.6917, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.003779857186600566, + "rewards/margins": 0.08432294428348541, + "rewards/rejected": -0.08054308593273163, + "step": 4070 + }, + { + "epoch": 0.67, + "learning_rate": 1.5040257436476372e-06, + "logits/chosen": -2.0499775409698486, + "logits/rejected": -1.92025625705719, + "logps/chosen": -235.1408233642578, + "logps/rejected": -265.2506103515625, + "loss": 0.6879, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.006761627737432718, + "rewards/margins": 0.09623605012893677, + "rewards/rejected": -0.08947442471981049, + "step": 4080 + }, + { + "epoch": 0.67, + "learning_rate": 1.490946620204068e-06, + "logits/chosen": -2.0341100692749023, + "logits/rejected": -2.053640127182007, + "logps/chosen": -245.68838500976562, + "logps/rejected": -260.537353515625, + "loss": 0.6875, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.06585260480642319, + "rewards/margins": 0.09338479489088058, + "rewards/rejected": -0.027532195672392845, + "step": 4090 + }, + { + "epoch": 0.67, + "learning_rate": 1.47790040687676e-06, + "logits/chosen": -1.837704062461853, + "logits/rejected": -1.6969772577285767, + "logps/chosen": -261.976318359375, + "logps/rejected": -209.097900390625, + "loss": 0.6893, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03728323429822922, + "rewards/margins": 0.07640411704778671, + "rewards/rejected": -0.039120886474847794, + "step": 4100 + }, + { + "epoch": 0.67, + "learning_rate": 1.4648875291658943e-06, + "logits/chosen": -2.1464531421661377, + "logits/rejected": -1.961042046546936, + "logps/chosen": -303.98590087890625, + "logps/rejected": -275.2481384277344, + "loss": 0.6913, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.033067576587200165, + "rewards/margins": 0.05102933198213577, + "rewards/rejected": -0.01796174980700016, + "step": 4110 + }, + { + "epoch": 0.67, + "learning_rate": 1.451908411484418e-06, + "logits/chosen": -1.9316068887710571, + "logits/rejected": -1.6800320148468018, + "logps/chosen": -219.444580078125, + "logps/rejected": -208.5972442626953, + "loss": 0.6934, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.029658477753400803, + "rewards/margins": 0.030775953084230423, + "rewards/rejected": -0.0011174723040312529, + "step": 4120 + }, + { + "epoch": 0.68, + "learning_rate": 1.4389634771442007e-06, + "logits/chosen": -1.9291706085205078, + "logits/rejected": -1.8093160390853882, + "logps/chosen": -261.5254211425781, + "logps/rejected": -253.5171356201172, + "loss": 0.6883, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04316166043281555, + "rewards/margins": 0.10762909799814224, + "rewards/rejected": -0.06446744501590729, + "step": 4130 + }, + { + "epoch": 0.68, + "learning_rate": 1.4260531483422264e-06, + "logits/chosen": -1.8333079814910889, + "logits/rejected": -1.8727391958236694, + "logps/chosen": -310.33087158203125, + "logps/rejected": -284.795166015625, + "loss": 0.6862, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03705599904060364, + "rewards/margins": 0.12344212830066681, + "rewards/rejected": -0.08638612180948257, + "step": 4140 + }, + { + "epoch": 0.68, + "learning_rate": 1.4131778461468242e-06, + "logits/chosen": -2.0362675189971924, + "logits/rejected": -1.9904148578643799, + "logps/chosen": -274.14752197265625, + "logps/rejected": -292.0305480957031, + "loss": 0.6872, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04761797934770584, + "rewards/margins": 0.10303517431020737, + "rewards/rejected": -0.05541719123721123, + "step": 4150 + }, + { + "epoch": 0.68, + "learning_rate": 1.4003379904839403e-06, + "logits/chosen": -2.075604200363159, + "logits/rejected": -1.9746042490005493, + "logps/chosen": -280.7740783691406, + "logps/rejected": -306.5553283691406, + "loss": 0.6859, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.06282542645931244, + "rewards/margins": 0.13339146971702576, + "rewards/rejected": -0.07056603580713272, + "step": 4160 + }, + { + "epoch": 0.68, + "learning_rate": 1.3875340001234306e-06, + "logits/chosen": -2.048374891281128, + "logits/rejected": -1.755875825881958, + "logps/chosen": -246.71475219726562, + "logps/rejected": -242.1110076904297, + "loss": 0.6874, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04026619344949722, + "rewards/margins": 0.11913969367742538, + "rewards/rejected": -0.07887350022792816, + "step": 4170 + }, + { + "epoch": 0.68, + "learning_rate": 1.374766292665417e-06, + "logits/chosen": -2.0502915382385254, + "logits/rejected": -1.7364475727081299, + "logps/chosen": -308.61187744140625, + "logps/rejected": -312.65509033203125, + "loss": 0.6892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.025928910821676254, + "rewards/margins": 0.11827240139245987, + "rewards/rejected": -0.09234348684549332, + "step": 4180 + }, + { + "epoch": 0.69, + "learning_rate": 1.3620352845266568e-06, + "logits/chosen": -2.0131843090057373, + "logits/rejected": -1.5834753513336182, + "logps/chosen": -259.1450500488281, + "logps/rejected": -223.02297973632812, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.023545850068330765, + "rewards/margins": 0.09495096653699875, + "rewards/rejected": -0.07140512764453888, + "step": 4190 + }, + { + "epoch": 0.69, + "learning_rate": 1.3493413909269638e-06, + "logits/chosen": -2.017613649368286, + "logits/rejected": -1.9325535297393799, + "logps/chosen": -268.7721862792969, + "logps/rejected": -273.37445068359375, + "loss": 0.6881, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.056198857724666595, + "rewards/margins": 0.10630736500024796, + "rewards/rejected": -0.05010849982500076, + "step": 4200 + }, + { + "epoch": 0.69, + "learning_rate": 1.3366850258756703e-06, + "logits/chosen": -2.037518262863159, + "logits/rejected": -1.8576653003692627, + "logps/chosen": -270.2255554199219, + "logps/rejected": -216.90579223632812, + "loss": 0.6885, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.025181492790579796, + "rewards/margins": 0.09487873315811157, + "rewards/rejected": -0.06969723105430603, + "step": 4210 + }, + { + "epoch": 0.69, + "learning_rate": 1.3240666021581192e-06, + "logits/chosen": -2.004150867462158, + "logits/rejected": -1.942948341369629, + "logps/chosen": -283.01507568359375, + "logps/rejected": -292.94976806640625, + "loss": 0.6972, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.008712954819202423, + "rewards/margins": 0.05070185661315918, + "rewards/rejected": -0.04198889806866646, + "step": 4220 + }, + { + "epoch": 0.69, + "learning_rate": 1.3114865313221997e-06, + "logits/chosen": -1.9680296182632446, + "logits/rejected": -1.7952978610992432, + "logps/chosen": -291.58160400390625, + "logps/rejected": -264.63623046875, + "loss": 0.6914, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0417017862200737, + "rewards/margins": 0.0717376247048378, + "rewards/rejected": -0.03003583289682865, + "step": 4230 + }, + { + "epoch": 0.69, + "learning_rate": 1.2989452236649342e-06, + "logits/chosen": -2.0225701332092285, + "logits/rejected": -1.7830438613891602, + "logps/chosen": -254.3218231201172, + "logps/rejected": -242.79116821289062, + "loss": 0.6913, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.021719761192798615, + "rewards/margins": 0.0733468234539032, + "rewards/rejected": -0.051627062261104584, + "step": 4240 + }, + { + "epoch": 0.7, + "learning_rate": 1.2864430882190854e-06, + "logits/chosen": -2.206603765487671, + "logits/rejected": -2.1329753398895264, + "logps/chosen": -313.025146484375, + "logps/rejected": -315.17535400390625, + "loss": 0.6906, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.05296817421913147, + "rewards/margins": 0.0819120854139328, + "rewards/rejected": -0.02894391119480133, + "step": 4250 + }, + { + "epoch": 0.7, + "learning_rate": 1.2739805327398207e-06, + "logits/chosen": -1.9877465963363647, + "logits/rejected": -1.6225574016571045, + "logps/chosen": -269.83526611328125, + "logps/rejected": -245.19546508789062, + "loss": 0.688, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.030856329947710037, + "rewards/margins": 0.13017356395721436, + "rewards/rejected": -0.09931723773479462, + "step": 4260 + }, + { + "epoch": 0.7, + "learning_rate": 1.2615579636914171e-06, + "logits/chosen": -1.8992605209350586, + "logits/rejected": -1.777557611465454, + "logps/chosen": -239.50033569335938, + "logps/rejected": -220.56173706054688, + "loss": 0.6882, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.07178390771150589, + "rewards/margins": 0.10688172280788422, + "rewards/rejected": -0.035097815096378326, + "step": 4270 + }, + { + "epoch": 0.7, + "learning_rate": 1.2491757862339974e-06, + "logits/chosen": -1.9508241415023804, + "logits/rejected": -1.8350235223770142, + "logps/chosen": -242.15267944335938, + "logps/rejected": -210.6447296142578, + "loss": 0.6918, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.03772275149822235, + "rewards/margins": 0.06242946535348892, + "rewards/rejected": -0.02470671571791172, + "step": 4280 + }, + { + "epoch": 0.7, + "learning_rate": 1.2368344042103162e-06, + "logits/chosen": -1.7067248821258545, + "logits/rejected": -1.7072360515594482, + "logps/chosen": -183.6341094970703, + "logps/rejected": -221.71676635742188, + "loss": 0.6902, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03602017089724541, + "rewards/margins": 0.08357967436313629, + "rewards/rejected": -0.04755949229001999, + "step": 4290 + }, + { + "epoch": 0.7, + "learning_rate": 1.224534220132598e-06, + "logits/chosen": -2.036830425262451, + "logits/rejected": -1.9161618947982788, + "logps/chosen": -298.37457275390625, + "logps/rejected": -292.7165222167969, + "loss": 0.6906, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.039535969495773315, + "rewards/margins": 0.08433219790458679, + "rewards/rejected": -0.04479622468352318, + "step": 4300 + }, + { + "epoch": 0.7, + "learning_rate": 1.2122756351693982e-06, + "logits/chosen": -2.141428232192993, + "logits/rejected": -1.9392160177230835, + "logps/chosen": -313.14056396484375, + "logps/rejected": -271.04791259765625, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.035480231046676636, + "rewards/margins": 0.03099055215716362, + "rewards/rejected": 0.004489685408771038, + "step": 4310 + }, + { + "epoch": 0.71, + "learning_rate": 1.2000590491325242e-06, + "logits/chosen": -1.9684911966323853, + "logits/rejected": -1.7699092626571655, + "logps/chosen": -286.86981201171875, + "logps/rejected": -304.16192626953125, + "loss": 0.69, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02910950407385826, + "rewards/margins": 0.08558689057826996, + "rewards/rejected": -0.0564773753285408, + "step": 4320 + }, + { + "epoch": 0.71, + "learning_rate": 1.187884860463998e-06, + "logits/chosen": -1.7463651895523071, + "logits/rejected": -1.7770828008651733, + "logps/chosen": -266.72149658203125, + "logps/rejected": -270.5861511230469, + "loss": 0.6885, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.04683711379766464, + "rewards/margins": 0.10662221908569336, + "rewards/rejected": -0.05978509783744812, + "step": 4330 + }, + { + "epoch": 0.71, + "learning_rate": 1.1757534662230547e-06, + "logits/chosen": -2.009153366088867, + "logits/rejected": -1.8481454849243164, + "logps/chosen": -201.46534729003906, + "logps/rejected": -206.9662322998047, + "loss": 0.6871, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04587510973215103, + "rewards/margins": 0.10486087948083878, + "rewards/rejected": -0.058985769748687744, + "step": 4340 + }, + { + "epoch": 0.71, + "learning_rate": 1.1636652620731972e-06, + "logits/chosen": -2.0605030059814453, + "logits/rejected": -1.8221979141235352, + "logps/chosen": -312.77459716796875, + "logps/rejected": -326.3934631347656, + "loss": 0.6887, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.056065984070301056, + "rewards/margins": 0.12480566650629044, + "rewards/rejected": -0.06873968988656998, + "step": 4350 + }, + { + "epoch": 0.71, + "learning_rate": 1.1516206422692909e-06, + "logits/chosen": -1.9227218627929688, + "logits/rejected": -1.9631532430648804, + "logps/chosen": -189.7957305908203, + "logps/rejected": -257.11785888671875, + "loss": 0.6874, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05524547025561333, + "rewards/margins": 0.11956918239593506, + "rewards/rejected": -0.06432371586561203, + "step": 4360 + }, + { + "epoch": 0.71, + "learning_rate": 1.1396199996447025e-06, + "logits/chosen": -1.868121862411499, + "logits/rejected": -1.8856451511383057, + "logps/chosen": -252.13156127929688, + "logps/rejected": -299.2048034667969, + "loss": 0.6895, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04457041248679161, + "rewards/margins": 0.08434700965881348, + "rewards/rejected": -0.039776600897312164, + "step": 4370 + }, + { + "epoch": 0.72, + "learning_rate": 1.1276637255984938e-06, + "logits/chosen": -2.008445978164673, + "logits/rejected": -1.7021849155426025, + "logps/chosen": -273.2459411621094, + "logps/rejected": -277.62884521484375, + "loss": 0.6902, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.010119467973709106, + "rewards/margins": 0.08600667119026184, + "rewards/rejected": -0.07588718831539154, + "step": 4380 + }, + { + "epoch": 0.72, + "learning_rate": 1.1157522100826495e-06, + "logits/chosen": -2.0732522010803223, + "logits/rejected": -2.0065195560455322, + "logps/chosen": -241.1151885986328, + "logps/rejected": -270.804443359375, + "loss": 0.6915, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.036236830055713654, + "rewards/margins": 0.06304161995649338, + "rewards/rejected": -0.02680479921400547, + "step": 4390 + }, + { + "epoch": 0.72, + "learning_rate": 1.1038858415893627e-06, + "logits/chosen": -2.099731206893921, + "logits/rejected": -2.0057384967803955, + "logps/chosen": -307.8930969238281, + "logps/rejected": -325.12908935546875, + "loss": 0.6891, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.047270067036151886, + "rewards/margins": 0.12810155749320984, + "rewards/rejected": -0.08083148300647736, + "step": 4400 + }, + { + "epoch": 0.72, + "learning_rate": 1.0920650071383634e-06, + "logits/chosen": -2.0078065395355225, + "logits/rejected": -1.5530612468719482, + "logps/chosen": -284.6280212402344, + "logps/rejected": -247.44143676757812, + "loss": 0.689, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05955351144075394, + "rewards/margins": 0.09325577318668365, + "rewards/rejected": -0.03370226174592972, + "step": 4410 + }, + { + "epoch": 0.72, + "learning_rate": 1.0802900922642962e-06, + "logits/chosen": -1.8685283660888672, + "logits/rejected": -1.926815390586853, + "logps/chosen": -177.49530029296875, + "logps/rejected": -200.04544067382812, + "loss": 0.6883, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.06064902991056442, + "rewards/margins": 0.09300986677408218, + "rewards/rejected": -0.032360829412937164, + "step": 4420 + }, + { + "epoch": 0.72, + "learning_rate": 1.0685614810041447e-06, + "logits/chosen": -1.66841721534729, + "logits/rejected": -1.7949146032333374, + "logps/chosen": -184.6566619873047, + "logps/rejected": -243.0388641357422, + "loss": 0.6919, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.022721420973539352, + "rewards/margins": 0.04902507737278938, + "rewards/rejected": -0.02630365453660488, + "step": 4430 + }, + { + "epoch": 0.73, + "learning_rate": 1.056879555884711e-06, + "logits/chosen": -2.146012544631958, + "logits/rejected": -1.8666210174560547, + "logps/chosen": -227.6302032470703, + "logps/rejected": -204.6166229248047, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.046085383743047714, + "rewards/margins": 0.08928387612104416, + "rewards/rejected": -0.043198492377996445, + "step": 4440 + }, + { + "epoch": 0.73, + "learning_rate": 1.0452446979101322e-06, + "logits/chosen": -1.8724308013916016, + "logits/rejected": -1.6827560663223267, + "logps/chosen": -296.57208251953125, + "logps/rejected": -268.01165771484375, + "loss": 0.6884, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.06381593644618988, + "rewards/margins": 0.10133771598339081, + "rewards/rejected": -0.03752177208662033, + "step": 4450 + }, + { + "epoch": 0.73, + "learning_rate": 1.03365728654946e-06, + "logits/chosen": -1.9703824520111084, + "logits/rejected": -2.0277533531188965, + "logps/chosen": -233.4096221923828, + "logps/rejected": -299.0975646972656, + "loss": 0.6869, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0499785915017128, + "rewards/margins": 0.13295868039131165, + "rewards/rejected": -0.08298008143901825, + "step": 4460 + }, + { + "epoch": 0.73, + "learning_rate": 1.0221176997242812e-06, + "logits/chosen": -1.788368821144104, + "logits/rejected": -1.5777761936187744, + "logps/chosen": -233.46505737304688, + "logps/rejected": -213.0185089111328, + "loss": 0.6883, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02467377670109272, + "rewards/margins": 0.1397322118282318, + "rewards/rejected": -0.11505842208862305, + "step": 4470 + }, + { + "epoch": 0.73, + "learning_rate": 1.0106263137963935e-06, + "logits/chosen": -2.0318593978881836, + "logits/rejected": -1.7418485879898071, + "logps/chosen": -250.01473999023438, + "logps/rejected": -255.5223846435547, + "loss": 0.6882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05033870413899422, + "rewards/margins": 0.13712117075920105, + "rewards/rejected": -0.08678247779607773, + "step": 4480 + }, + { + "epoch": 0.73, + "learning_rate": 9.991835035555309e-07, + "logits/chosen": -1.9825347661972046, + "logits/rejected": -1.8630472421646118, + "logps/chosen": -275.9469909667969, + "logps/rejected": -260.3446960449219, + "loss": 0.6917, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.05290894955396652, + "rewards/margins": 0.07545296847820282, + "rewards/rejected": -0.022544022649526596, + "step": 4490 + }, + { + "epoch": 0.74, + "learning_rate": 9.877896422071373e-07, + "logits/chosen": -2.147432804107666, + "logits/rejected": -1.7488676309585571, + "logps/chosen": -297.3431396484375, + "logps/rejected": -309.7465515136719, + "loss": 0.689, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.024388272315263748, + "rewards/margins": 0.08879055827856064, + "rewards/rejected": -0.06440228968858719, + "step": 4500 + }, + { + "epoch": 0.74, + "learning_rate": 9.764451013601977e-07, + "logits/chosen": -2.0858919620513916, + "logits/rejected": -1.7984225749969482, + "logps/chosen": -284.49017333984375, + "logps/rejected": -284.25042724609375, + "loss": 0.6863, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08087717741727829, + "rewards/margins": 0.11444105207920074, + "rewards/rejected": -0.03356386721134186, + "step": 4510 + }, + { + "epoch": 0.74, + "learning_rate": 9.651502510151159e-07, + "logits/chosen": -1.8363707065582275, + "logits/rejected": -1.6957581043243408, + "logps/chosen": -213.1376953125, + "logps/rejected": -223.9606170654297, + "loss": 0.6912, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.026563560590147972, + "rewards/margins": 0.06659894436597824, + "rewards/rejected": -0.04003538191318512, + "step": 4520 + }, + { + "epoch": 0.74, + "learning_rate": 9.539054595516475e-07, + "logits/chosen": -1.8742939233779907, + "logits/rejected": -1.7824186086654663, + "logps/chosen": -245.001220703125, + "logps/rejected": -233.607666015625, + "loss": 0.6883, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05337308719754219, + "rewards/margins": 0.11830373853445053, + "rewards/rejected": -0.06493064016103745, + "step": 4530 + }, + { + "epoch": 0.74, + "learning_rate": 9.42711093716886e-07, + "logits/chosen": -2.090010404586792, + "logits/rejected": -1.8893673419952393, + "logps/chosen": -229.95657348632812, + "logps/rejected": -217.9515380859375, + "loss": 0.6871, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05845467373728752, + "rewards/margins": 0.09776453673839569, + "rewards/rejected": -0.03930987790226936, + "step": 4540 + }, + { + "epoch": 0.74, + "learning_rate": 9.315675186133025e-07, + "logits/chosen": -2.0794684886932373, + "logits/rejected": -1.7525269985198975, + "logps/chosen": -262.1893005371094, + "logps/rejected": -255.99899291992188, + "loss": 0.6866, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.062481749802827835, + "rewards/margins": 0.12840762734413147, + "rewards/rejected": -0.06592587381601334, + "step": 4550 + }, + { + "epoch": 0.75, + "learning_rate": 9.204750976868343e-07, + "logits/chosen": -1.979670524597168, + "logits/rejected": -1.8853089809417725, + "logps/chosen": -267.8191223144531, + "logps/rejected": -283.6781921386719, + "loss": 0.6895, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05032741278409958, + "rewards/margins": 0.12455575168132782, + "rewards/rejected": -0.07422833889722824, + "step": 4560 + }, + { + "epoch": 0.75, + "learning_rate": 9.094341927150344e-07, + "logits/chosen": -1.9258708953857422, + "logits/rejected": -1.8195915222167969, + "logps/chosen": -257.40203857421875, + "logps/rejected": -242.8057403564453, + "loss": 0.6896, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.034415312111377716, + "rewards/margins": 0.06482852250337601, + "rewards/rejected": -0.03041321039199829, + "step": 4570 + }, + { + "epoch": 0.75, + "learning_rate": 8.984451637952701e-07, + "logits/chosen": -1.9825639724731445, + "logits/rejected": -1.8399512767791748, + "logps/chosen": -346.85504150390625, + "logps/rejected": -313.1272277832031, + "loss": 0.6901, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.08686750382184982, + "rewards/margins": 0.09531555324792862, + "rewards/rejected": -0.008448043838143349, + "step": 4580 + }, + { + "epoch": 0.75, + "learning_rate": 8.875083693329811e-07, + "logits/chosen": -2.065396547317505, + "logits/rejected": -2.030501365661621, + "logps/chosen": -310.3236999511719, + "logps/rejected": -317.00634765625, + "loss": 0.6903, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.047193437814712524, + "rewards/margins": 0.07261225581169128, + "rewards/rejected": -0.025418821722269058, + "step": 4590 + }, + { + "epoch": 0.75, + "learning_rate": 8.766241660299859e-07, + "logits/chosen": -1.9235721826553345, + "logits/rejected": -1.9196590185165405, + "logps/chosen": -277.69384765625, + "logps/rejected": -294.94842529296875, + "loss": 0.6873, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.048479460179805756, + "rewards/margins": 0.11079399287700653, + "rewards/rejected": -0.06231454759836197, + "step": 4600 + }, + { + "epoch": 0.75, + "learning_rate": 8.657929088728548e-07, + "logits/chosen": -2.0473766326904297, + "logits/rejected": -1.6952641010284424, + "logps/chosen": -268.0438537597656, + "logps/rejected": -233.0344696044922, + "loss": 0.6877, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.056067366153001785, + "rewards/margins": 0.1220368891954422, + "rewards/rejected": -0.06596951186656952, + "step": 4610 + }, + { + "epoch": 0.76, + "learning_rate": 8.550149511213232e-07, + "logits/chosen": -2.0314929485321045, + "logits/rejected": -1.9261184930801392, + "logps/chosen": -258.6137390136719, + "logps/rejected": -241.1985321044922, + "loss": 0.6902, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.010534616187214851, + "rewards/margins": 0.08134926110506058, + "rewards/rejected": -0.07081464678049088, + "step": 4620 + }, + { + "epoch": 0.76, + "learning_rate": 8.442906442967793e-07, + "logits/chosen": -1.9804624319076538, + "logits/rejected": -1.7029688358306885, + "logps/chosen": -242.30068969726562, + "logps/rejected": -217.71353149414062, + "loss": 0.6914, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04330366104841232, + "rewards/margins": 0.05849918723106384, + "rewards/rejected": -0.01519552432000637, + "step": 4630 + }, + { + "epoch": 0.76, + "learning_rate": 8.336203381707925e-07, + "logits/chosen": -1.8597434759140015, + "logits/rejected": -1.5952028036117554, + "logps/chosen": -239.4828338623047, + "logps/rejected": -191.03257751464844, + "loss": 0.6859, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.04028228297829628, + "rewards/margins": 0.12073497474193573, + "rewards/rejected": -0.08045269548892975, + "step": 4640 + }, + { + "epoch": 0.76, + "learning_rate": 8.230043807537055e-07, + "logits/chosen": -1.9212249517440796, + "logits/rejected": -1.9095710515975952, + "logps/chosen": -221.8144073486328, + "logps/rejected": -233.2452392578125, + "loss": 0.689, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.018605869263410568, + "rewards/margins": 0.0842328891158104, + "rewards/rejected": -0.06562703102827072, + "step": 4650 + }, + { + "epoch": 0.76, + "learning_rate": 8.124431182832917e-07, + "logits/chosen": -1.9624087810516357, + "logits/rejected": -2.0035080909729004, + "logps/chosen": -205.16561889648438, + "logps/rejected": -221.8397674560547, + "loss": 0.6909, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04547245427966118, + "rewards/margins": 0.08553670346736908, + "rewards/rejected": -0.0400642454624176, + "step": 4660 + }, + { + "epoch": 0.76, + "learning_rate": 8.019368952134538e-07, + "logits/chosen": -2.0490593910217285, + "logits/rejected": -1.4868111610412598, + "logps/chosen": -280.35711669921875, + "logps/rejected": -227.6632843017578, + "loss": 0.6872, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04855798929929733, + "rewards/margins": 0.12709589302539825, + "rewards/rejected": -0.07853789627552032, + "step": 4670 + }, + { + "epoch": 0.77, + "learning_rate": 7.914860542029937e-07, + "logits/chosen": -1.9347574710845947, + "logits/rejected": -1.8532896041870117, + "logps/chosen": -222.35140991210938, + "logps/rejected": -261.5762023925781, + "loss": 0.6879, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.06442270427942276, + "rewards/margins": 0.13227374851703644, + "rewards/rejected": -0.06785104423761368, + "step": 4680 + }, + { + "epoch": 0.77, + "learning_rate": 7.810909361044381e-07, + "logits/chosen": -1.9474098682403564, + "logits/rejected": -1.9329410791397095, + "logps/chosen": -282.7720642089844, + "logps/rejected": -273.2689208984375, + "loss": 0.6917, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.03368307650089264, + "rewards/margins": 0.04440515488386154, + "rewards/rejected": -0.010722076520323753, + "step": 4690 + }, + { + "epoch": 0.77, + "learning_rate": 7.707518799529184e-07, + "logits/chosen": -1.8605735301971436, + "logits/rejected": -1.769443154335022, + "logps/chosen": -211.7412109375, + "logps/rejected": -240.9458465576172, + "loss": 0.6875, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05152294784784317, + "rewards/margins": 0.13337931036949158, + "rewards/rejected": -0.08185636252164841, + "step": 4700 + }, + { + "epoch": 0.77, + "learning_rate": 7.604692229551123e-07, + "logits/chosen": -2.144954204559326, + "logits/rejected": -1.9347127676010132, + "logps/chosen": -297.1256408691406, + "logps/rejected": -251.3172149658203, + "loss": 0.6905, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02543928660452366, + "rewards/margins": 0.06580036133527756, + "rewards/rejected": -0.040361080318689346, + "step": 4710 + }, + { + "epoch": 0.77, + "learning_rate": 7.502433004782519e-07, + "logits/chosen": -1.996259093284607, + "logits/rejected": -1.5893630981445312, + "logps/chosen": -249.7122802734375, + "logps/rejected": -222.06362915039062, + "loss": 0.6894, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.029524225741624832, + "rewards/margins": 0.10011275857686996, + "rewards/rejected": -0.07058852910995483, + "step": 4720 + }, + { + "epoch": 0.77, + "learning_rate": 7.400744460391801e-07, + "logits/chosen": -2.0372085571289062, + "logits/rejected": -1.886000633239746, + "logps/chosen": -288.56341552734375, + "logps/rejected": -282.03387451171875, + "loss": 0.6881, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0595458559691906, + "rewards/margins": 0.10794460773468018, + "rewards/rejected": -0.04839874804019928, + "step": 4730 + }, + { + "epoch": 0.78, + "learning_rate": 7.299629912934733e-07, + "logits/chosen": -2.0486679077148438, + "logits/rejected": -1.7858707904815674, + "logps/chosen": -271.2407531738281, + "logps/rejected": -250.60879516601562, + "loss": 0.6866, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05823897570371628, + "rewards/margins": 0.11514576524496078, + "rewards/rejected": -0.0569068007171154, + "step": 4740 + }, + { + "epoch": 0.78, + "learning_rate": 7.199092660246295e-07, + "logits/chosen": -1.969683051109314, + "logits/rejected": -1.9144268035888672, + "logps/chosen": -236.26821899414062, + "logps/rejected": -292.5906982421875, + "loss": 0.6882, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.052231885492801666, + "rewards/margins": 0.11933587491512299, + "rewards/rejected": -0.06710399687290192, + "step": 4750 + }, + { + "epoch": 0.78, + "learning_rate": 7.099135981333063e-07, + "logits/chosen": -1.9586591720581055, + "logits/rejected": -1.8132894039154053, + "logps/chosen": -288.1571350097656, + "logps/rejected": -274.9515075683594, + "loss": 0.6905, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.022615481168031693, + "rewards/margins": 0.07729291170835495, + "rewards/rejected": -0.054677434265613556, + "step": 4760 + }, + { + "epoch": 0.78, + "learning_rate": 6.999763136266297e-07, + "logits/chosen": -1.7302477359771729, + "logits/rejected": -1.4962124824523926, + "logps/chosen": -178.3135986328125, + "logps/rejected": -193.91867065429688, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.040229640901088715, + "rewards/margins": 0.10458607971668243, + "rewards/rejected": -0.06435644626617432, + "step": 4770 + }, + { + "epoch": 0.78, + "learning_rate": 6.900977366075607e-07, + "logits/chosen": -1.9973119497299194, + "logits/rejected": -2.0461840629577637, + "logps/chosen": -236.4487762451172, + "logps/rejected": -307.76812744140625, + "loss": 0.6924, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04095907881855965, + "rewards/margins": 0.09361156821250916, + "rewards/rejected": -0.05265247821807861, + "step": 4780 + }, + { + "epoch": 0.78, + "learning_rate": 6.802781892643243e-07, + "logits/chosen": -1.8436357975006104, + "logits/rejected": -1.916351556777954, + "logps/chosen": -265.3631286621094, + "logps/rejected": -283.3978576660156, + "loss": 0.6906, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.022146575152873993, + "rewards/margins": 0.07471368461847305, + "rewards/rejected": -0.05256710201501846, + "step": 4790 + }, + { + "epoch": 0.79, + "learning_rate": 6.705179918599045e-07, + "logits/chosen": -2.192960262298584, + "logits/rejected": -1.863831877708435, + "logps/chosen": -309.80963134765625, + "logps/rejected": -263.73663330078125, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.013725156895816326, + "rewards/margins": 0.06032697483897209, + "rewards/rejected": -0.04660182073712349, + "step": 4800 + }, + { + "epoch": 0.79, + "learning_rate": 6.608174627215941e-07, + "logits/chosen": -1.8951025009155273, + "logits/rejected": -1.9017969369888306, + "logps/chosen": -273.18524169921875, + "logps/rejected": -261.05767822265625, + "loss": 0.6899, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04155946522951126, + "rewards/margins": 0.09929979592561722, + "rewards/rejected": -0.05774033069610596, + "step": 4810 + }, + { + "epoch": 0.79, + "learning_rate": 6.511769182306149e-07, + "logits/chosen": -1.9886624813079834, + "logits/rejected": -1.64678955078125, + "logps/chosen": -319.06744384765625, + "logps/rejected": -275.1119384765625, + "loss": 0.6872, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03561226651072502, + "rewards/margins": 0.15290844440460205, + "rewards/rejected": -0.11729617416858673, + "step": 4820 + }, + { + "epoch": 0.79, + "learning_rate": 6.415966728118003e-07, + "logits/chosen": -1.8008005619049072, + "logits/rejected": -1.8340575695037842, + "logps/chosen": -233.0964813232422, + "logps/rejected": -234.482666015625, + "loss": 0.6881, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.046227846294641495, + "rewards/margins": 0.09424199163913727, + "rewards/rejected": -0.048014137893915176, + "step": 4830 + }, + { + "epoch": 0.79, + "learning_rate": 6.320770389233375e-07, + "logits/chosen": -1.9089634418487549, + "logits/rejected": -1.7035424709320068, + "logps/chosen": -223.0059051513672, + "logps/rejected": -211.8623809814453, + "loss": 0.6902, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.028857877478003502, + "rewards/margins": 0.08327598869800568, + "rewards/rejected": -0.054418109357357025, + "step": 4840 + }, + { + "epoch": 0.79, + "learning_rate": 6.226183270465785e-07, + "logits/chosen": -2.0115928649902344, + "logits/rejected": -1.8683221340179443, + "logps/chosen": -230.6569366455078, + "logps/rejected": -238.6436004638672, + "loss": 0.6891, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.06479893624782562, + "rewards/margins": 0.10844061523675919, + "rewards/rejected": -0.04364169389009476, + "step": 4850 + }, + { + "epoch": 0.79, + "learning_rate": 6.132208456759154e-07, + "logits/chosen": -1.8064136505126953, + "logits/rejected": -1.605400800704956, + "logps/chosen": -222.38235473632812, + "logps/rejected": -220.9353790283203, + "loss": 0.6908, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.015949774533510208, + "rewards/margins": 0.07902668416500092, + "rewards/rejected": -0.06307690590620041, + "step": 4860 + }, + { + "epoch": 0.8, + "learning_rate": 6.038849013087147e-07, + "logits/chosen": -1.9043411016464233, + "logits/rejected": -1.9105335474014282, + "logps/chosen": -276.54364013671875, + "logps/rejected": -300.62640380859375, + "loss": 0.6865, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0528806634247303, + "rewards/margins": 0.11862605810165405, + "rewards/rejected": -0.06574538350105286, + "step": 4870 + }, + { + "epoch": 0.8, + "learning_rate": 5.946107984353242e-07, + "logits/chosen": -2.087777614593506, + "logits/rejected": -1.9068291187286377, + "logps/chosen": -259.9031677246094, + "logps/rejected": -241.4491424560547, + "loss": 0.6911, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04787147045135498, + "rewards/margins": 0.068679079413414, + "rewards/rejected": -0.020807605236768723, + "step": 4880 + }, + { + "epoch": 0.8, + "learning_rate": 5.853988395291413e-07, + "logits/chosen": -1.919274091720581, + "logits/rejected": -2.0133798122406006, + "logps/chosen": -222.20346069335938, + "logps/rejected": -261.1437072753906, + "loss": 0.6898, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.04286234453320503, + "rewards/margins": 0.05956288054585457, + "rewards/rejected": -0.016700534150004387, + "step": 4890 + }, + { + "epoch": 0.8, + "learning_rate": 5.762493250367468e-07, + "logits/chosen": -2.0304346084594727, + "logits/rejected": -1.974848985671997, + "logps/chosen": -241.8739013671875, + "logps/rejected": -263.9255065917969, + "loss": 0.6921, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.06621870398521423, + "rewards/margins": 0.0673833042383194, + "rewards/rejected": -0.0011645968770608306, + "step": 4900 + }, + { + "epoch": 0.8, + "learning_rate": 5.671625533681091e-07, + "logits/chosen": -1.9979667663574219, + "logits/rejected": -2.039113759994507, + "logps/chosen": -197.3440399169922, + "logps/rejected": -257.90240478515625, + "loss": 0.6892, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.05688558891415596, + "rewards/margins": 0.09198100119829178, + "rewards/rejected": -0.03509540483355522, + "step": 4910 + }, + { + "epoch": 0.8, + "learning_rate": 5.581388208868469e-07, + "logits/chosen": -2.003237247467041, + "logits/rejected": -1.8534616231918335, + "logps/chosen": -282.1229248046875, + "logps/rejected": -295.9018249511719, + "loss": 0.6906, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.012591725215315819, + "rewards/margins": 0.07004506140947342, + "rewards/rejected": -0.05745333433151245, + "step": 4920 + }, + { + "epoch": 0.81, + "learning_rate": 5.49178421900567e-07, + "logits/chosen": -1.845060110092163, + "logits/rejected": -1.863368034362793, + "logps/chosen": -239.7712860107422, + "logps/rejected": -249.05026245117188, + "loss": 0.6875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02955322340130806, + "rewards/margins": 0.09490497410297394, + "rewards/rejected": -0.06535176187753677, + "step": 4930 + }, + { + "epoch": 0.81, + "learning_rate": 5.402816486512636e-07, + "logits/chosen": -1.8557863235473633, + "logits/rejected": -1.7680429220199585, + "logps/chosen": -208.1840057373047, + "logps/rejected": -233.869140625, + "loss": 0.6884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04516824707388878, + "rewards/margins": 0.11142860352993011, + "rewards/rejected": -0.06626035273075104, + "step": 4940 + }, + { + "epoch": 0.81, + "learning_rate": 5.314487913057884e-07, + "logits/chosen": -2.0888142585754395, + "logits/rejected": -1.9783337116241455, + "logps/chosen": -302.77032470703125, + "logps/rejected": -303.0293884277344, + "loss": 0.6906, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06654806435108185, + "rewards/margins": 0.07475318014621735, + "rewards/rejected": -0.008205116726458073, + "step": 4950 + }, + { + "epoch": 0.81, + "learning_rate": 5.226801379463848e-07, + "logits/chosen": -1.9599567651748657, + "logits/rejected": -1.7727677822113037, + "logps/chosen": -177.09768676757812, + "logps/rejected": -184.1103515625, + "loss": 0.6889, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.051064323633909225, + "rewards/margins": 0.08855545520782471, + "rewards/rejected": -0.03749113529920578, + "step": 4960 + }, + { + "epoch": 0.81, + "learning_rate": 5.139759745612951e-07, + "logits/chosen": -2.008896589279175, + "logits/rejected": -1.6999324560165405, + "logps/chosen": -258.9493103027344, + "logps/rejected": -218.26693725585938, + "loss": 0.6933, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04702289029955864, + "rewards/margins": 0.1070425882935524, + "rewards/rejected": -0.06001969426870346, + "step": 4970 + }, + { + "epoch": 0.81, + "learning_rate": 5.053365850354302e-07, + "logits/chosen": -2.040210247039795, + "logits/rejected": -1.8047596216201782, + "logps/chosen": -227.4982147216797, + "logps/rejected": -230.69216918945312, + "loss": 0.6883, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03772208094596863, + "rewards/margins": 0.10088513046503067, + "rewards/rejected": -0.06316304951906204, + "step": 4980 + }, + { + "epoch": 0.82, + "learning_rate": 4.967622511411116e-07, + "logits/chosen": -2.1462295055389404, + "logits/rejected": -1.773026466369629, + "logps/chosen": -251.27749633789062, + "logps/rejected": -237.3136749267578, + "loss": 0.6878, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.028829623013734818, + "rewards/margins": 0.12523439526557922, + "rewards/rejected": -0.09640476852655411, + "step": 4990 + }, + { + "epoch": 0.82, + "learning_rate": 4.882532525288825e-07, + "logits/chosen": -2.1470699310302734, + "logits/rejected": -1.8392826318740845, + "logps/chosen": -327.70208740234375, + "logps/rejected": -286.5236511230469, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.045814089477062225, + "rewards/margins": 0.08385896682739258, + "rewards/rejected": -0.038044869899749756, + "step": 5000 + }, + { + "epoch": 0.82, + "learning_rate": 4.798098667183851e-07, + "logits/chosen": -1.9265044927597046, + "logits/rejected": -1.6688350439071655, + "logps/chosen": -233.49560546875, + "logps/rejected": -206.34536743164062, + "loss": 0.6889, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05522836372256279, + "rewards/margins": 0.11638941615819931, + "rewards/rejected": -0.06116104871034622, + "step": 5010 + }, + { + "epoch": 0.82, + "learning_rate": 4.7143236908931105e-07, + "logits/chosen": -1.9530761241912842, + "logits/rejected": -1.7228977680206299, + "logps/chosen": -202.51834106445312, + "logps/rejected": -192.5845184326172, + "loss": 0.6875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03651784360408783, + "rewards/margins": 0.1526477038860321, + "rewards/rejected": -0.11612987518310547, + "step": 5020 + }, + { + "epoch": 0.82, + "learning_rate": 4.631210328724206e-07, + "logits/chosen": -1.859708547592163, + "logits/rejected": -1.7997753620147705, + "logps/chosen": -284.34039306640625, + "logps/rejected": -276.2288818359375, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03592761605978012, + "rewards/margins": 0.057627469301223755, + "rewards/rejected": -0.021699849516153336, + "step": 5030 + }, + { + "epoch": 0.82, + "learning_rate": 4.5487612914062785e-07, + "logits/chosen": -1.8619056940078735, + "logits/rejected": -1.807208776473999, + "logps/chosen": -257.4362487792969, + "logps/rejected": -277.68572998046875, + "loss": 0.6872, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.006367975380271673, + "rewards/margins": 0.13181285560131073, + "rewards/rejected": -0.12544488906860352, + "step": 5040 + }, + { + "epoch": 0.83, + "learning_rate": 4.466979268001634e-07, + "logits/chosen": -2.0291638374328613, + "logits/rejected": -1.809472680091858, + "logps/chosen": -232.92715454101562, + "logps/rejected": -234.1992645263672, + "loss": 0.6902, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.01809687539935112, + "rewards/margins": 0.09585966169834137, + "rewards/rejected": -0.07776279747486115, + "step": 5050 + }, + { + "epoch": 0.83, + "learning_rate": 4.385866925818019e-07, + "logits/chosen": -1.812727928161621, + "logits/rejected": -1.8956689834594727, + "logps/chosen": -239.1660919189453, + "logps/rejected": -244.65087890625, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03684180974960327, + "rewards/margins": 0.04873966425657272, + "rewards/rejected": -0.011897852644324303, + "step": 5060 + }, + { + "epoch": 0.83, + "learning_rate": 4.305426910321628e-07, + "logits/chosen": -1.9461021423339844, + "logits/rejected": -1.6821556091308594, + "logps/chosen": -299.94293212890625, + "logps/rejected": -241.63168334960938, + "loss": 0.693, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.019023098051548004, + "rewards/margins": 0.018745549023151398, + "rewards/rejected": 0.00027755461633205414, + "step": 5070 + }, + { + "epoch": 0.83, + "learning_rate": 4.225661845050846e-07, + "logits/chosen": -1.8442819118499756, + "logits/rejected": -1.6971681118011475, + "logps/chosen": -241.041015625, + "logps/rejected": -239.36318969726562, + "loss": 0.6871, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.055490244179964066, + "rewards/margins": 0.13150055706501007, + "rewards/rejected": -0.0760103315114975, + "step": 5080 + }, + { + "epoch": 0.83, + "learning_rate": 4.1465743315306477e-07, + "logits/chosen": -2.037449598312378, + "logits/rejected": -1.7886598110198975, + "logps/chosen": -310.5695495605469, + "logps/rejected": -259.7740783691406, + "loss": 0.6888, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.049449045211076736, + "rewards/margins": 0.1003706231713295, + "rewards/rejected": -0.05092158168554306, + "step": 5090 + }, + { + "epoch": 0.83, + "learning_rate": 4.0681669491877575e-07, + "logits/chosen": -1.7250486612319946, + "logits/rejected": -1.7483265399932861, + "logps/chosen": -250.4457244873047, + "logps/rejected": -234.5820770263672, + "loss": 0.6878, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.025174085050821304, + "rewards/margins": 0.10070687532424927, + "rewards/rejected": -0.07553279399871826, + "step": 5100 + }, + { + "epoch": 0.84, + "learning_rate": 3.9904422552665605e-07, + "logits/chosen": -2.030991792678833, + "logits/rejected": -1.7798175811767578, + "logps/chosen": -315.4599609375, + "logps/rejected": -279.9196472167969, + "loss": 0.6898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04851796105504036, + "rewards/margins": 0.09414204210042953, + "rewards/rejected": -0.045624084770679474, + "step": 5110 + }, + { + "epoch": 0.84, + "learning_rate": 3.913402784745626e-07, + "logits/chosen": -1.8425061702728271, + "logits/rejected": -1.8201786279678345, + "logps/chosen": -270.40625, + "logps/rejected": -304.279541015625, + "loss": 0.6888, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.006314513273537159, + "rewards/margins": 0.0949721559882164, + "rewards/rejected": -0.08865765482187271, + "step": 5120 + }, + { + "epoch": 0.84, + "learning_rate": 3.8370510502550916e-07, + "logits/chosen": -1.8791755437850952, + "logits/rejected": -1.8021551370620728, + "logps/chosen": -279.0675964355469, + "logps/rejected": -251.7834014892578, + "loss": 0.6909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.009561575017869473, + "rewards/margins": 0.05043962597846985, + "rewards/rejected": -0.0408780574798584, + "step": 5130 + }, + { + "epoch": 0.84, + "learning_rate": 3.761389541994706e-07, + "logits/chosen": -2.3054659366607666, + "logits/rejected": -2.0193867683410645, + "logps/chosen": -238.5725555419922, + "logps/rejected": -256.4541931152344, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05859548598527908, + "rewards/margins": 0.10275013744831085, + "rewards/rejected": -0.04415465146303177, + "step": 5140 + }, + { + "epoch": 0.84, + "learning_rate": 3.686420727652576e-07, + "logits/chosen": -1.849359154701233, + "logits/rejected": -1.6916815042495728, + "logps/chosen": -274.2336120605469, + "logps/rejected": -290.5913391113281, + "loss": 0.6878, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.003267051186412573, + "rewards/margins": 0.09480474889278412, + "rewards/rejected": -0.09153767675161362, + "step": 5150 + }, + { + "epoch": 0.84, + "learning_rate": 3.6121470523247074e-07, + "logits/chosen": -1.98979914188385, + "logits/rejected": -1.7235511541366577, + "logps/chosen": -253.6145782470703, + "logps/rejected": -239.12319946289062, + "loss": 0.6879, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.04128045588731766, + "rewards/margins": 0.08574996143579483, + "rewards/rejected": -0.044469498097896576, + "step": 5160 + }, + { + "epoch": 0.85, + "learning_rate": 3.538570938435279e-07, + "logits/chosen": -1.8884479999542236, + "logits/rejected": -1.8580424785614014, + "logps/chosen": -272.1982421875, + "logps/rejected": -271.7079772949219, + "loss": 0.6911, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.041528794914484024, + "rewards/margins": 0.07156471908092499, + "rewards/rejected": -0.030035916715860367, + "step": 5170 + }, + { + "epoch": 0.85, + "learning_rate": 3.4656947856576e-07, + "logits/chosen": -2.2203056812286377, + "logits/rejected": -1.9431819915771484, + "logps/chosen": -288.06524658203125, + "logps/rejected": -245.9959259033203, + "loss": 0.6884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02978016994893551, + "rewards/margins": 0.10247679054737091, + "rewards/rejected": -0.07269660383462906, + "step": 5180 + }, + { + "epoch": 0.85, + "learning_rate": 3.393520970835837e-07, + "logits/chosen": -2.1742258071899414, + "logits/rejected": -1.9565502405166626, + "logps/chosen": -303.97955322265625, + "logps/rejected": -212.51309204101562, + "loss": 0.6919, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.026600461453199387, + "rewards/margins": 0.04079204425215721, + "rewards/rejected": -0.014191585592925549, + "step": 5190 + }, + { + "epoch": 0.85, + "learning_rate": 3.3220518479075564e-07, + "logits/chosen": -2.045121908187866, + "logits/rejected": -1.9188991785049438, + "logps/chosen": -339.7648010253906, + "logps/rejected": -293.672119140625, + "loss": 0.6906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02908080257475376, + "rewards/margins": 0.06526035070419312, + "rewards/rejected": -0.036179546266794205, + "step": 5200 + }, + { + "epoch": 0.85, + "learning_rate": 3.2512897478268776e-07, + "logits/chosen": -2.0665719509124756, + "logits/rejected": -1.8896081447601318, + "logps/chosen": -226.0492706298828, + "logps/rejected": -202.93243408203125, + "loss": 0.6856, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05610230565071106, + "rewards/margins": 0.1276354044675827, + "rewards/rejected": -0.07153309881687164, + "step": 5210 + }, + { + "epoch": 0.85, + "learning_rate": 3.1812369784885027e-07, + "logits/chosen": -2.101672649383545, + "logits/rejected": -1.8331382274627686, + "logps/chosen": -250.4229278564453, + "logps/rejected": -215.63369750976562, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05187688395380974, + "rewards/margins": 0.08669890463352203, + "rewards/rejected": -0.034822020679712296, + "step": 5220 + }, + { + "epoch": 0.86, + "learning_rate": 3.111895824652411e-07, + "logits/chosen": -1.9888617992401123, + "logits/rejected": -2.012720823287964, + "logps/chosen": -290.1101379394531, + "logps/rejected": -318.8715515136719, + "loss": 0.6893, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.06653638184070587, + "rewards/margins": 0.110171839594841, + "rewards/rejected": -0.04363545402884483, + "step": 5230 + }, + { + "epoch": 0.86, + "learning_rate": 3.04326854786936e-07, + "logits/chosen": -1.9610564708709717, + "logits/rejected": -1.7328922748565674, + "logps/chosen": -269.8328857421875, + "logps/rejected": -296.21929931640625, + "loss": 0.687, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.051869869232177734, + "rewards/margins": 0.12129143625497818, + "rewards/rejected": -0.06942155957221985, + "step": 5240 + }, + { + "epoch": 0.86, + "learning_rate": 2.975357386407118e-07, + "logits/chosen": -2.041713237762451, + "logits/rejected": -1.934038519859314, + "logps/chosen": -217.55691528320312, + "logps/rejected": -213.2156982421875, + "loss": 0.6897, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.021868595853447914, + "rewards/margins": 0.0667169839143753, + "rewards/rejected": -0.04484838619828224, + "step": 5250 + }, + { + "epoch": 0.86, + "learning_rate": 2.90816455517747e-07, + "logits/chosen": -2.0877394676208496, + "logits/rejected": -1.8372814655303955, + "logps/chosen": -240.55545043945312, + "logps/rejected": -256.9902038574219, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04815537482500076, + "rewards/margins": 0.10561804473400116, + "rewards/rejected": -0.0574626699090004, + "step": 5260 + }, + { + "epoch": 0.86, + "learning_rate": 2.8416922456639603e-07, + "logits/chosen": -1.8568775653839111, + "logits/rejected": -1.8593075275421143, + "logps/chosen": -200.6631317138672, + "logps/rejected": -215.8887481689453, + "loss": 0.6862, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008217515423893929, + "rewards/margins": 0.1259576976299286, + "rewards/rejected": -0.11774017661809921, + "step": 5270 + }, + { + "epoch": 0.86, + "learning_rate": 2.775942625850453e-07, + "logits/chosen": -2.014117479324341, + "logits/rejected": -1.9761308431625366, + "logps/chosen": -183.64688110351562, + "logps/rejected": -238.2388153076172, + "loss": 0.689, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.05383627861738205, + "rewards/margins": 0.10767177492380142, + "rewards/rejected": -0.05383550003170967, + "step": 5280 + }, + { + "epoch": 0.87, + "learning_rate": 2.7109178401503874e-07, + "logits/chosen": -1.919205904006958, + "logits/rejected": -1.7993791103363037, + "logps/chosen": -262.3959655761719, + "logps/rejected": -275.47833251953125, + "loss": 0.6897, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03535730764269829, + "rewards/margins": 0.07986004650592804, + "rewards/rejected": -0.04450273886322975, + "step": 5290 + }, + { + "epoch": 0.87, + "learning_rate": 2.6466200093368563e-07, + "logits/chosen": -1.9238742589950562, + "logits/rejected": -1.814490556716919, + "logps/chosen": -189.10308837890625, + "logps/rejected": -225.6112823486328, + "loss": 0.6879, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02200132980942726, + "rewards/margins": 0.07067473977804184, + "rewards/rejected": -0.04867340996861458, + "step": 5300 + }, + { + "epoch": 0.87, + "learning_rate": 2.583051230473435e-07, + "logits/chosen": -2.1764142513275146, + "logits/rejected": -1.9817962646484375, + "logps/chosen": -246.9797821044922, + "logps/rejected": -265.94610595703125, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.027526771649718285, + "rewards/margins": 0.06022367626428604, + "rewards/rejected": -0.032696906477212906, + "step": 5310 + }, + { + "epoch": 0.87, + "learning_rate": 2.520213576845781e-07, + "logits/chosen": -1.6628122329711914, + "logits/rejected": -1.5482549667358398, + "logps/chosen": -229.26144409179688, + "logps/rejected": -271.391357421875, + "loss": 0.6842, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04098007455468178, + "rewards/margins": 0.17488007247447968, + "rewards/rejected": -0.1339000016450882, + "step": 5320 + }, + { + "epoch": 0.87, + "learning_rate": 2.4581090978940206e-07, + "logits/chosen": -1.9511449337005615, + "logits/rejected": -1.8067255020141602, + "logps/chosen": -194.4497833251953, + "logps/rejected": -253.6127166748047, + "loss": 0.687, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.05636359006166458, + "rewards/margins": 0.13965095579624176, + "rewards/rejected": -0.08328735828399658, + "step": 5330 + }, + { + "epoch": 0.87, + "learning_rate": 2.39673981914591e-07, + "logits/chosen": -1.9104583263397217, + "logits/rejected": -1.7664823532104492, + "logps/chosen": -257.84674072265625, + "logps/rejected": -240.2957305908203, + "loss": 0.6885, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.059882063418626785, + "rewards/margins": 0.10031101852655411, + "rewards/rejected": -0.040428951382637024, + "step": 5340 + }, + { + "epoch": 0.88, + "learning_rate": 2.336107742150756e-07, + "logits/chosen": -2.1911325454711914, + "logits/rejected": -1.9430911540985107, + "logps/chosen": -270.4957275390625, + "logps/rejected": -238.34909057617188, + "loss": 0.6901, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05298991873860359, + "rewards/margins": 0.08840446919202805, + "rewards/rejected": -0.03541455417871475, + "step": 5350 + }, + { + "epoch": 0.88, + "learning_rate": 2.276214844414157e-07, + "logits/chosen": -1.9237644672393799, + "logits/rejected": -1.6972525119781494, + "logps/chosen": -263.86962890625, + "logps/rejected": -260.51220703125, + "loss": 0.687, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01592729613184929, + "rewards/margins": 0.12395380437374115, + "rewards/rejected": -0.10802651941776276, + "step": 5360 + }, + { + "epoch": 0.88, + "learning_rate": 2.217063079333487e-07, + "logits/chosen": -2.075559377670288, + "logits/rejected": -1.8776044845581055, + "logps/chosen": -332.67425537109375, + "logps/rejected": -279.82208251953125, + "loss": 0.6869, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.020371347665786743, + "rewards/margins": 0.12244923412799835, + "rewards/rejected": -0.10207787901163101, + "step": 5370 + }, + { + "epoch": 0.88, + "learning_rate": 2.1586543761342005e-07, + "logits/chosen": -2.258777379989624, + "logits/rejected": -1.920732855796814, + "logps/chosen": -340.9679260253906, + "logps/rejected": -254.0303192138672, + "loss": 0.6901, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.033154916018247604, + "rewards/margins": 0.09219371527433395, + "rewards/rejected": -0.05903879925608635, + "step": 5380 + }, + { + "epoch": 0.88, + "learning_rate": 2.1009906398069157e-07, + "logits/chosen": -1.9442269802093506, + "logits/rejected": -1.6081279516220093, + "logps/chosen": -278.1702575683594, + "logps/rejected": -221.9802703857422, + "loss": 0.6884, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03574693948030472, + "rewards/margins": 0.09812335669994354, + "rewards/rejected": -0.062376417219638824, + "step": 5390 + }, + { + "epoch": 0.88, + "learning_rate": 2.0440737510452585e-07, + "logits/chosen": -2.068906784057617, + "logits/rejected": -1.81875741481781, + "logps/chosen": -270.6360168457031, + "logps/rejected": -262.6910400390625, + "loss": 0.6881, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0627523809671402, + "rewards/margins": 0.12414367496967316, + "rewards/rejected": -0.061391301453113556, + "step": 5400 + }, + { + "epoch": 0.88, + "learning_rate": 1.9879055661845475e-07, + "logits/chosen": -1.9923099279403687, + "logits/rejected": -1.9699596166610718, + "logps/chosen": -280.54058837890625, + "logps/rejected": -330.0, + "loss": 0.6903, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.009225473739206791, + "rewards/margins": 0.10030166804790497, + "rewards/rejected": -0.09107618778944016, + "step": 5410 + }, + { + "epoch": 0.89, + "learning_rate": 1.9324879171412431e-07, + "logits/chosen": -1.9430763721466064, + "logits/rejected": -1.8377370834350586, + "logps/chosen": -217.35708618164062, + "logps/rejected": -275.66424560546875, + "loss": 0.691, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03293789178133011, + "rewards/margins": 0.07960274815559387, + "rewards/rejected": -0.046664852648973465, + "step": 5420 + }, + { + "epoch": 0.89, + "learning_rate": 1.8778226113531944e-07, + "logits/chosen": -2.0188913345336914, + "logits/rejected": -1.8984565734863281, + "logps/chosen": -228.4208984375, + "logps/rejected": -253.54293823242188, + "loss": 0.691, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04313891381025314, + "rewards/margins": 0.08758164942264557, + "rewards/rejected": -0.04444272443652153, + "step": 5430 + }, + { + "epoch": 0.89, + "learning_rate": 1.823911431720693e-07, + "logits/chosen": -2.0657808780670166, + "logits/rejected": -1.7327394485473633, + "logps/chosen": -209.64944458007812, + "logps/rejected": -202.9642791748047, + "loss": 0.689, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04443899542093277, + "rewards/margins": 0.08911273628473282, + "rewards/rejected": -0.04467373341321945, + "step": 5440 + }, + { + "epoch": 0.89, + "learning_rate": 1.7707561365483378e-07, + "logits/chosen": -1.8135130405426025, + "logits/rejected": -1.7339309453964233, + "logps/chosen": -192.99722290039062, + "logps/rejected": -213.9237518310547, + "loss": 0.6883, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.027097662910819054, + "rewards/margins": 0.09586968272924423, + "rewards/rejected": -0.06877203285694122, + "step": 5450 + }, + { + "epoch": 0.89, + "learning_rate": 1.718358459487665e-07, + "logits/chosen": -2.102224349975586, + "logits/rejected": -1.7921440601348877, + "logps/chosen": -327.16180419921875, + "logps/rejected": -258.704345703125, + "loss": 0.6899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.017036590725183487, + "rewards/margins": 0.07666231691837311, + "rewards/rejected": -0.05962573364377022, + "step": 5460 + }, + { + "epoch": 0.89, + "learning_rate": 1.666720109480613e-07, + "logits/chosen": -1.996227502822876, + "logits/rejected": -2.0578176975250244, + "logps/chosen": -252.5278778076172, + "logps/rejected": -311.3238525390625, + "loss": 0.6892, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03457929193973541, + "rewards/margins": 0.10979892313480377, + "rewards/rejected": -0.07521963119506836, + "step": 5470 + }, + { + "epoch": 0.9, + "learning_rate": 1.6158427707037989e-07, + "logits/chosen": -1.9513092041015625, + "logits/rejected": -1.7241798639297485, + "logps/chosen": -259.555419921875, + "logps/rejected": -234.96755981445312, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05417270213365555, + "rewards/margins": 0.08730131387710571, + "rewards/rejected": -0.033128611743450165, + "step": 5480 + }, + { + "epoch": 0.9, + "learning_rate": 1.565728102513567e-07, + "logits/chosen": -1.8478155136108398, + "logits/rejected": -1.74398934841156, + "logps/chosen": -311.42095947265625, + "logps/rejected": -339.234375, + "loss": 0.6908, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.03714600205421448, + "rewards/margins": 0.06856032460927963, + "rewards/rejected": -0.031414322555065155, + "step": 5490 + }, + { + "epoch": 0.9, + "learning_rate": 1.5163777393918944e-07, + "logits/chosen": -1.9362646341323853, + "logits/rejected": -1.7603845596313477, + "logps/chosen": -261.5321350097656, + "logps/rejected": -237.23910522460938, + "loss": 0.6871, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04447184503078461, + "rewards/margins": 0.13978040218353271, + "rewards/rejected": -0.09530854225158691, + "step": 5500 + }, + { + "epoch": 0.9, + "learning_rate": 1.4677932908930604e-07, + "logits/chosen": -1.8903309106826782, + "logits/rejected": -1.9083162546157837, + "logps/chosen": -184.5116424560547, + "logps/rejected": -193.9232940673828, + "loss": 0.6913, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.020337218418717384, + "rewards/margins": 0.07409347593784332, + "rewards/rejected": -0.053756266832351685, + "step": 5510 + }, + { + "epoch": 0.9, + "learning_rate": 1.4199763415911587e-07, + "logits/chosen": -1.952972412109375, + "logits/rejected": -1.8644098043441772, + "logps/chosen": -279.64581298828125, + "logps/rejected": -282.6411437988281, + "loss": 0.6884, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04779963940382004, + "rewards/margins": 0.11681845039129257, + "rewards/rejected": -0.06901881843805313, + "step": 5520 + }, + { + "epoch": 0.9, + "learning_rate": 1.3729284510284325e-07, + "logits/chosen": -1.9547008275985718, + "logits/rejected": -1.8549258708953857, + "logps/chosen": -266.11004638671875, + "logps/rejected": -239.77572631835938, + "loss": 0.69, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05709709972143173, + "rewards/margins": 0.12098660320043564, + "rewards/rejected": -0.0638895183801651, + "step": 5530 + }, + { + "epoch": 0.91, + "learning_rate": 1.3266511536643738e-07, + "logits/chosen": -1.9262663125991821, + "logits/rejected": -1.871561050415039, + "logps/chosen": -276.8809509277344, + "logps/rejected": -277.42633056640625, + "loss": 0.6905, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.028692984953522682, + "rewards/margins": 0.06393797695636749, + "rewards/rejected": -0.03524499386548996, + "step": 5540 + }, + { + "epoch": 0.91, + "learning_rate": 1.2811459588257135e-07, + "logits/chosen": -1.811094880104065, + "logits/rejected": -1.9233958721160889, + "logps/chosen": -267.83551025390625, + "logps/rejected": -275.330810546875, + "loss": 0.6896, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.054552413523197174, + "rewards/margins": 0.1084919422864914, + "rewards/rejected": -0.05393952131271362, + "step": 5550 + }, + { + "epoch": 0.91, + "learning_rate": 1.2364143506571835e-07, + "logits/chosen": -2.159637212753296, + "logits/rejected": -1.8764457702636719, + "logps/chosen": -313.66375732421875, + "logps/rejected": -272.637939453125, + "loss": 0.691, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04719634726643562, + "rewards/margins": 0.06093304231762886, + "rewards/rejected": -0.013736692257225513, + "step": 5560 + }, + { + "epoch": 0.91, + "learning_rate": 1.192457788073101e-07, + "logits/chosen": -2.145559072494507, + "logits/rejected": -1.7370021343231201, + "logps/chosen": -286.15557861328125, + "logps/rejected": -221.5148468017578, + "loss": 0.688, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.027440408244729042, + "rewards/margins": 0.10607733577489853, + "rewards/rejected": -0.07863692194223404, + "step": 5570 + }, + { + "epoch": 0.91, + "learning_rate": 1.1492777047097919e-07, + "logits/chosen": -2.0990872383117676, + "logits/rejected": -2.048004627227783, + "logps/chosen": -270.29962158203125, + "logps/rejected": -280.3026428222656, + "loss": 0.6913, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03865872696042061, + "rewards/margins": 0.08446788787841797, + "rewards/rejected": -0.04580916091799736, + "step": 5580 + }, + { + "epoch": 0.91, + "learning_rate": 1.1068755088788491e-07, + "logits/chosen": -2.088320255279541, + "logits/rejected": -2.105867862701416, + "logps/chosen": -206.8843536376953, + "logps/rejected": -268.5412292480469, + "loss": 0.691, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04851872846484184, + "rewards/margins": 0.051219724118709564, + "rewards/rejected": -0.0027009951882064342, + "step": 5590 + }, + { + "epoch": 0.92, + "learning_rate": 1.065252583521173e-07, + "logits/chosen": -2.024479389190674, + "logits/rejected": -1.853237509727478, + "logps/chosen": -290.9424743652344, + "logps/rejected": -276.1693420410156, + "loss": 0.6896, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.07392782717943192, + "rewards/margins": 0.11121167987585068, + "rewards/rejected": -0.03728386014699936, + "step": 5600 + }, + { + "epoch": 0.92, + "learning_rate": 1.024410286161881e-07, + "logits/chosen": -1.6346032619476318, + "logits/rejected": -1.7279155254364014, + "logps/chosen": -203.22694396972656, + "logps/rejected": -209.01083374023438, + "loss": 0.6929, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.03074551746249199, + "rewards/margins": 0.004895345773547888, + "rewards/rejected": 0.025850171223282814, + "step": 5610 + }, + { + "epoch": 0.92, + "learning_rate": 9.843499488660479e-08, + "logits/chosen": -1.965131163597107, + "logits/rejected": -1.7745568752288818, + "logps/chosen": -258.39788818359375, + "logps/rejected": -230.2206268310547, + "loss": 0.6911, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04950153827667236, + "rewards/margins": 0.07621227204799652, + "rewards/rejected": -0.02671072445809841, + "step": 5620 + }, + { + "epoch": 0.92, + "learning_rate": 9.450728781952245e-08, + "logits/chosen": -2.0119552612304688, + "logits/rejected": -1.8218374252319336, + "logps/chosen": -249.98843383789062, + "logps/rejected": -218.7266387939453, + "loss": 0.6915, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.054436445236206055, + "rewards/margins": 0.07215365767478943, + "rewards/rejected": -0.017717216163873672, + "step": 5630 + }, + { + "epoch": 0.92, + "learning_rate": 9.065803551648517e-08, + "logits/chosen": -1.8693866729736328, + "logits/rejected": -1.5909593105316162, + "logps/chosen": -190.80616760253906, + "logps/rejected": -200.70567321777344, + "loss": 0.6873, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03776348754763603, + "rewards/margins": 0.12551133334636688, + "rewards/rejected": -0.08774784952402115, + "step": 5640 + }, + { + "epoch": 0.92, + "learning_rate": 8.688736352024885e-08, + "logits/chosen": -1.9673439264297485, + "logits/rejected": -1.885069489479065, + "logps/chosen": -255.69290161132812, + "logps/rejected": -250.2675323486328, + "loss": 0.689, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.057724498212337494, + "rewards/margins": 0.11295797675848007, + "rewards/rejected": -0.055233485996723175, + "step": 5650 + }, + { + "epoch": 0.93, + "learning_rate": 8.319539481068229e-08, + "logits/chosen": -2.005737781524658, + "logits/rejected": -1.5745179653167725, + "logps/chosen": -281.39117431640625, + "logps/rejected": -270.2967834472656, + "loss": 0.6896, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.05007070302963257, + "rewards/margins": 0.15671563148498535, + "rewards/rejected": -0.10664491355419159, + "step": 5660 + }, + { + "epoch": 0.93, + "learning_rate": 7.958224980076146e-08, + "logits/chosen": -2.0450103282928467, + "logits/rejected": -1.664944052696228, + "logps/chosen": -263.5628967285156, + "logps/rejected": -221.3567657470703, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.026941200718283653, + "rewards/margins": 0.08035246282815933, + "rewards/rejected": -0.05341125279664993, + "step": 5670 + }, + { + "epoch": 0.93, + "learning_rate": 7.604804633263851e-08, + "logits/chosen": -1.851030707359314, + "logits/rejected": -1.7000089883804321, + "logps/chosen": -295.1942443847656, + "logps/rejected": -269.30328369140625, + "loss": 0.6915, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.02838749811053276, + "rewards/margins": 0.07305712252855301, + "rewards/rejected": -0.04466962814331055, + "step": 5680 + }, + { + "epoch": 0.93, + "learning_rate": 7.25928996738004e-08, + "logits/chosen": -1.9483797550201416, + "logits/rejected": -1.5993316173553467, + "logps/chosen": -248.3431396484375, + "logps/rejected": -254.10610961914062, + "loss": 0.6902, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.07831916958093643, + "rewards/margins": 0.12273217737674713, + "rewards/rejected": -0.04441302642226219, + "step": 5690 + }, + { + "epoch": 0.93, + "learning_rate": 6.921692251330825e-08, + "logits/chosen": -2.066793918609619, + "logits/rejected": -1.9162273406982422, + "logps/chosen": -306.1136169433594, + "logps/rejected": -295.83258056640625, + "loss": 0.6921, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.028992395848035812, + "rewards/margins": 0.038350366055965424, + "rewards/rejected": -0.009357966482639313, + "step": 5700 + }, + { + "epoch": 0.93, + "learning_rate": 6.592022495812311e-08, + "logits/chosen": -1.6695019006729126, + "logits/rejected": -1.631906270980835, + "logps/chosen": -162.53184509277344, + "logps/rejected": -181.72531127929688, + "loss": 0.6909, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04603511095046997, + "rewards/margins": 0.07537040114402771, + "rewards/rejected": -0.02933528646826744, + "step": 5710 + }, + { + "epoch": 0.94, + "learning_rate": 6.270291452951355e-08, + "logits/chosen": -1.826674461364746, + "logits/rejected": -1.6826915740966797, + "logps/chosen": -257.7422790527344, + "logps/rejected": -250.5347137451172, + "loss": 0.6908, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.046172045171260834, + "rewards/margins": 0.08908624947071075, + "rewards/rejected": -0.04291420429944992, + "step": 5720 + }, + { + "epoch": 0.94, + "learning_rate": 5.9565096159550364e-08, + "logits/chosen": -1.8904144763946533, + "logits/rejected": -1.9185192584991455, + "logps/chosen": -250.4985809326172, + "logps/rejected": -239.54794311523438, + "loss": 0.6931, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03669193759560585, + "rewards/margins": 0.0697455033659935, + "rewards/rejected": -0.03305356949567795, + "step": 5730 + }, + { + "epoch": 0.94, + "learning_rate": 5.6506872187682414e-08, + "logits/chosen": -2.111309766769409, + "logits/rejected": -1.9023300409317017, + "logps/chosen": -219.80123901367188, + "logps/rejected": -181.26162719726562, + "loss": 0.6935, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.030547108501195908, + "rewards/margins": 0.040379274636507034, + "rewards/rejected": -0.009832166135311127, + "step": 5740 + }, + { + "epoch": 0.94, + "learning_rate": 5.352834235740067e-08, + "logits/chosen": -2.0818755626678467, + "logits/rejected": -1.940712571144104, + "logps/chosen": -211.98379516601562, + "logps/rejected": -258.3806457519531, + "loss": 0.6905, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05625417083501816, + "rewards/margins": 0.07480119168758392, + "rewards/rejected": -0.018547018989920616, + "step": 5750 + }, + { + "epoch": 0.94, + "learning_rate": 5.0629603812984704e-08, + "logits/chosen": -1.7864925861358643, + "logits/rejected": -1.6660085916519165, + "logps/chosen": -304.483642578125, + "logps/rejected": -293.7355041503906, + "loss": 0.688, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03714631870388985, + "rewards/margins": 0.10452187061309814, + "rewards/rejected": -0.067375548183918, + "step": 5760 + }, + { + "epoch": 0.94, + "learning_rate": 4.781075109633243e-08, + "logits/chosen": -2.1498658657073975, + "logits/rejected": -1.8761188983917236, + "logps/chosen": -257.6712951660156, + "logps/rejected": -246.99710083007812, + "loss": 0.6902, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.05641911178827286, + "rewards/margins": 0.07427899539470673, + "rewards/rejected": -0.017859887331724167, + "step": 5770 + }, + { + "epoch": 0.95, + "learning_rate": 4.507187614387953e-08, + "logits/chosen": -1.8091678619384766, + "logits/rejected": -1.7338510751724243, + "logps/chosen": -240.86245727539062, + "logps/rejected": -283.48089599609375, + "loss": 0.6894, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0462474450469017, + "rewards/margins": 0.12645326554775238, + "rewards/rejected": -0.08020582050085068, + "step": 5780 + }, + { + "epoch": 0.95, + "learning_rate": 4.241306828359881e-08, + "logits/chosen": -1.9466440677642822, + "logits/rejected": -1.9122734069824219, + "logps/chosen": -246.61044311523438, + "logps/rejected": -268.69085693359375, + "loss": 0.6869, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04404435306787491, + "rewards/margins": 0.1299835741519928, + "rewards/rejected": -0.08593922853469849, + "step": 5790 + }, + { + "epoch": 0.95, + "learning_rate": 3.98344142320875e-08, + "logits/chosen": -1.9511282444000244, + "logits/rejected": -2.042219638824463, + "logps/chosen": -236.32052612304688, + "logps/rejected": -299.90576171875, + "loss": 0.6882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04284355789422989, + "rewards/margins": 0.11995480954647064, + "rewards/rejected": -0.07711124420166016, + "step": 5800 + }, + { + "epoch": 0.95, + "learning_rate": 3.73359980917401e-08, + "logits/chosen": -2.0698049068450928, + "logits/rejected": -1.7069746255874634, + "logps/chosen": -283.86920166015625, + "logps/rejected": -220.75167846679688, + "loss": 0.6895, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.06882615387439728, + "rewards/margins": 0.10552725940942764, + "rewards/rejected": -0.036701105535030365, + "step": 5810 + }, + { + "epoch": 0.95, + "learning_rate": 3.491790134800305e-08, + "logits/chosen": -1.9605919122695923, + "logits/rejected": -1.813962697982788, + "logps/chosen": -182.68936157226562, + "logps/rejected": -175.5609130859375, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05317642539739609, + "rewards/margins": 0.07873348891735077, + "rewards/rejected": -0.02555706538259983, + "step": 5820 + }, + { + "epoch": 0.95, + "learning_rate": 3.258020286671909e-08, + "logits/chosen": -2.055704355239868, + "logits/rejected": -1.8350849151611328, + "logps/chosen": -280.88153076171875, + "logps/rejected": -266.3826904296875, + "loss": 0.6892, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.06665761768817902, + "rewards/margins": 0.09654451906681061, + "rewards/rejected": -0.029886901378631592, + "step": 5830 + }, + { + "epoch": 0.96, + "learning_rate": 3.032297889155378e-08, + "logits/chosen": -2.0449135303497314, + "logits/rejected": -1.8899867534637451, + "logps/chosen": -322.34368896484375, + "logps/rejected": -330.448974609375, + "loss": 0.6957, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01674279198050499, + "rewards/margins": 0.053814418613910675, + "rewards/rejected": -0.037071626633405685, + "step": 5840 + }, + { + "epoch": 0.96, + "learning_rate": 2.8146303041510226e-08, + "logits/chosen": -1.9525072574615479, + "logits/rejected": -1.902122139930725, + "logps/chosen": -222.65469360351562, + "logps/rejected": -226.985107421875, + "loss": 0.6896, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0346917100250721, + "rewards/margins": 0.07150985300540924, + "rewards/rejected": -0.03681814298033714, + "step": 5850 + }, + { + "epoch": 0.96, + "learning_rate": 2.6050246308526315e-08, + "logits/chosen": -2.0796725749969482, + "logits/rejected": -1.8502895832061768, + "logps/chosen": -271.46954345703125, + "logps/rejected": -235.7878875732422, + "loss": 0.6879, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.06456605345010757, + "rewards/margins": 0.11507971584796906, + "rewards/rejected": -0.050513654947280884, + "step": 5860 + }, + { + "epoch": 0.96, + "learning_rate": 2.4034877055161e-08, + "logits/chosen": -2.2782702445983887, + "logits/rejected": -2.094332695007324, + "logps/chosen": -274.4179992675781, + "logps/rejected": -251.5694122314453, + "loss": 0.6905, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04957159608602524, + "rewards/margins": 0.07598142325878143, + "rewards/rejected": -0.026409829035401344, + "step": 5870 + }, + { + "epoch": 0.96, + "learning_rate": 2.210026101236329e-08, + "logits/chosen": -1.8374567031860352, + "logits/rejected": -1.6697406768798828, + "logps/chosen": -252.0631103515625, + "logps/rejected": -227.1796875, + "loss": 0.6912, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04540370777249336, + "rewards/margins": 0.08617019653320312, + "rewards/rejected": -0.04076649993658066, + "step": 5880 + }, + { + "epoch": 0.96, + "learning_rate": 2.0246461277329267e-08, + "logits/chosen": -2.0761470794677734, + "logits/rejected": -1.8557264804840088, + "logps/chosen": -246.3938751220703, + "logps/rejected": -236.2279510498047, + "loss": 0.6898, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.033041972666978836, + "rewards/margins": 0.0941489189863205, + "rewards/rejected": -0.061106957495212555, + "step": 5890 + }, + { + "epoch": 0.97, + "learning_rate": 1.8473538311443717e-08, + "logits/chosen": -1.8573882579803467, + "logits/rejected": -1.8783859014511108, + "logps/chosen": -270.70159912109375, + "logps/rejected": -272.07415771484375, + "loss": 0.6909, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.055087268352508545, + "rewards/margins": 0.06440868973731995, + "rewards/rejected": -0.009321419522166252, + "step": 5900 + }, + { + "epoch": 0.97, + "learning_rate": 1.678154993830866e-08, + "logits/chosen": -1.950534462928772, + "logits/rejected": -1.7592417001724243, + "logps/chosen": -246.81344604492188, + "logps/rejected": -242.6831512451172, + "loss": 0.6881, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03639548644423485, + "rewards/margins": 0.10464715957641602, + "rewards/rejected": -0.06825166195631027, + "step": 5910 + }, + { + "epoch": 0.97, + "learning_rate": 1.51705513418568e-08, + "logits/chosen": -2.1327576637268066, + "logits/rejected": -1.8729976415634155, + "logps/chosen": -318.35333251953125, + "logps/rejected": -269.36328125, + "loss": 0.6913, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03055185079574585, + "rewards/margins": 0.0663667842745781, + "rewards/rejected": -0.03581493720412254, + "step": 5920 + }, + { + "epoch": 0.97, + "learning_rate": 1.3640595064552675e-08, + "logits/chosen": -2.1345858573913574, + "logits/rejected": -1.6405420303344727, + "logps/chosen": -261.3988952636719, + "logps/rejected": -200.62185668945312, + "loss": 0.6891, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.047378938645124435, + "rewards/margins": 0.11070507764816284, + "rewards/rejected": -0.06332613527774811, + "step": 5930 + }, + { + "epoch": 0.97, + "learning_rate": 1.2191731005677943e-08, + "logits/chosen": -1.8059284687042236, + "logits/rejected": -2.0084118843078613, + "logps/chosen": -258.87725830078125, + "logps/rejected": -336.47088623046875, + "loss": 0.6888, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.048919785767793655, + "rewards/margins": 0.06940393894910812, + "rewards/rejected": -0.02048414945602417, + "step": 5940 + }, + { + "epoch": 0.97, + "learning_rate": 1.082400641970488e-08, + "logits/chosen": -1.8918638229370117, + "logits/rejected": -1.7905925512313843, + "logps/chosen": -201.6887969970703, + "logps/rejected": -202.47903442382812, + "loss": 0.6931, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03641490265727043, + "rewards/margins": 0.04148652032017708, + "rewards/rejected": -0.005071618128567934, + "step": 5950 + }, + { + "epoch": 0.97, + "learning_rate": 9.53746591475374e-09, + "logits/chosen": -1.947353720664978, + "logits/rejected": -1.7577879428863525, + "logps/chosen": -266.77313232421875, + "logps/rejected": -300.2434997558594, + "loss": 0.6866, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.043915875256061554, + "rewards/margins": 0.11781007051467896, + "rewards/rejected": -0.0738941878080368, + "step": 5960 + }, + { + "epoch": 0.98, + "learning_rate": 8.332151451140025e-09, + "logits/chosen": -2.0943922996520996, + "logits/rejected": -1.8888832330703735, + "logps/chosen": -289.454345703125, + "logps/rejected": -256.38104248046875, + "loss": 0.6884, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04076813533902168, + "rewards/margins": 0.1011524647474289, + "rewards/rejected": -0.06038434058427811, + "step": 5970 + }, + { + "epoch": 0.98, + "learning_rate": 7.208102340003631e-09, + "logits/chosen": -1.9470908641815186, + "logits/rejected": -1.7573446035385132, + "logps/chosen": -221.0652313232422, + "logps/rejected": -191.21139526367188, + "loss": 0.6899, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03391250595450401, + "rewards/margins": 0.07079769670963287, + "rewards/rejected": -0.03688519448041916, + "step": 5980 + }, + { + "epoch": 0.98, + "learning_rate": 6.165355242029048e-09, + "logits/chosen": -2.153614044189453, + "logits/rejected": -1.941288709640503, + "logps/chosen": -258.988525390625, + "logps/rejected": -259.1878967285156, + "loss": 0.6886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04650810360908508, + "rewards/margins": 0.08868498355150223, + "rewards/rejected": -0.042176879942417145, + "step": 5990 + }, + { + "epoch": 0.98, + "learning_rate": 5.2039441662471435e-09, + "logits/chosen": -1.9134023189544678, + "logits/rejected": -1.7545169591903687, + "logps/chosen": -267.35931396484375, + "logps/rejected": -257.5721130371094, + "loss": 0.6885, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.06051031872630119, + "rewards/margins": 0.09169574081897736, + "rewards/rejected": -0.031185422092676163, + "step": 6000 + }, + { + "epoch": 0.98, + "learning_rate": 4.323900468928554e-09, + "logits/chosen": -2.0810718536376953, + "logits/rejected": -2.0528416633605957, + "logps/chosen": -231.15145874023438, + "logps/rejected": -253.4620361328125, + "loss": 0.6911, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.019774356856942177, + "rewards/margins": 0.04601629823446274, + "rewards/rejected": -0.026241939514875412, + "step": 6010 + }, + { + "epoch": 0.98, + "learning_rate": 3.525252852558947e-09, + "logits/chosen": -1.948301911354065, + "logits/rejected": -1.8943039178848267, + "logps/chosen": -237.9792022705078, + "logps/rejected": -289.8432922363281, + "loss": 0.6868, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04880828037858009, + "rewards/margins": 0.11353936046361923, + "rewards/rejected": -0.06473108381032944, + "step": 6020 + }, + { + "epoch": 0.99, + "learning_rate": 2.8080273649039313e-09, + "logits/chosen": -2.057129144668579, + "logits/rejected": -1.947456955909729, + "logps/chosen": -300.96051025390625, + "logps/rejected": -328.15399169921875, + "loss": 0.6898, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.04834997281432152, + "rewards/margins": 0.09951449930667877, + "rewards/rejected": -0.051164526492357254, + "step": 6030 + }, + { + "epoch": 0.99, + "learning_rate": 2.172247398158911e-09, + "logits/chosen": -1.921481728553772, + "logits/rejected": -1.6599433422088623, + "logps/chosen": -292.12762451171875, + "logps/rejected": -288.71429443359375, + "loss": 0.6881, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.04767230153083801, + "rewards/margins": 0.11761369556188583, + "rewards/rejected": -0.06994140148162842, + "step": 6040 + }, + { + "epoch": 0.99, + "learning_rate": 1.6179336881874675e-09, + "logits/chosen": -2.0784411430358887, + "logits/rejected": -1.900909662246704, + "logps/chosen": -304.08673095703125, + "logps/rejected": -268.337158203125, + "loss": 0.6883, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02304939553141594, + "rewards/margins": 0.0845625251531601, + "rewards/rejected": -0.06151314452290535, + "step": 6050 + }, + { + "epoch": 0.99, + "learning_rate": 1.145104313843015e-09, + "logits/chosen": -1.913749098777771, + "logits/rejected": -1.8159157037734985, + "logps/chosen": -182.29115295410156, + "logps/rejected": -211.58267211914062, + "loss": 0.6893, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.051398564130067825, + "rewards/margins": 0.10971683263778687, + "rewards/rejected": -0.05831826478242874, + "step": 6060 + }, + { + "epoch": 0.99, + "learning_rate": 7.537746963809378e-10, + "logits/chosen": -2.112492084503174, + "logits/rejected": -1.8533748388290405, + "logps/chosen": -338.296142578125, + "logps/rejected": -277.8135986328125, + "loss": 0.6878, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0699067935347557, + "rewards/margins": 0.09599098563194275, + "rewards/rejected": -0.026084203273057938, + "step": 6070 + }, + { + "epoch": 0.99, + "learning_rate": 4.4395759895482503e-10, + "logits/chosen": -2.188448429107666, + "logits/rejected": -1.7715675830841064, + "logps/chosen": -257.80194091796875, + "logps/rejected": -226.2391815185547, + "loss": 0.6876, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05495365709066391, + "rewards/margins": 0.09918127954006195, + "rewards/rejected": -0.044227611273527145, + "step": 6080 + }, + { + "epoch": 1.0, + "learning_rate": 2.156631262001385e-10, + "logits/chosen": -1.9688606262207031, + "logits/rejected": -1.9781688451766968, + "logps/chosen": -172.91860961914062, + "logps/rejected": -222.76614379882812, + "loss": 0.6886, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05476533621549606, + "rewards/margins": 0.13317716121673584, + "rewards/rejected": -0.07841181755065918, + "step": 6090 + }, + { + "epoch": 1.0, + "learning_rate": 6.889872390503094e-11, + "logits/chosen": -2.1836891174316406, + "logits/rejected": -1.8291714191436768, + "logps/chosen": -237.5067901611328, + "logps/rejected": -206.28579711914062, + "loss": 0.689, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.03900928422808647, + "rewards/margins": 0.0767001062631607, + "rewards/rejected": -0.03769083321094513, + "step": 6100 + }, + { + "epoch": 1.0, + "learning_rate": 3.669178767484738e-12, + "logits/chosen": -2.1066837310791016, + "logits/rejected": -1.6799333095550537, + "logps/chosen": -254.78628540039062, + "logps/rejected": -223.8677978515625, + "loss": 0.6878, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05836108326911926, + "rewards/margins": 0.1094939112663269, + "rewards/rejected": -0.051132820546627045, + "step": 6110 + }, + { + "epoch": 1.0, + "step": 6113, + "total_flos": 0.0, + "train_loss": 0.6896687981814424, + "train_runtime": 25698.2921, + "train_samples_per_second": 0.952, + "train_steps_per_second": 0.238 + } + ], + "logging_steps": 10, + "max_steps": 6113, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}