diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14119 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.982857142857143, + "global_step": 261, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "logps_train/chosen": -124.03406524658203, + "logps_train/ref_chosen": -124.5, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -103.7885971069336, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.021984513849020004, + "rewards_train/margins": 0.03294864948838949, + "rewards_train/rejected": -0.010964135639369488, + "step": 0 + }, + { + "epoch": 0, + "logps_train/chosen": -189.20074462890625, + "logps_train/ref_chosen": -189.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -143.92283630371094, + "rewards_train/accuracies": 0.46875, + "rewards_train/chosen": 0.01742708310484886, + "rewards_train/margins": -0.006499961018562317, + "rewards_train/rejected": 0.02392704412341118, + "step": 0 + }, + { + "epoch": 0, + "logps_train/chosen": -108.28680419921875, + "logps_train/ref_chosen": -108.0, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -86.16234588623047, + "rewards_train/accuracies": 0.46875, + "rewards_train/chosen": -0.0033381874673068523, + "rewards_train/margins": -0.008782926481217146, + "rewards_train/rejected": 0.005444739013910294, + "step": 0 + }, + { + "epoch": 0, + "logps_train/chosen": -159.1072235107422, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -137.1709747314453, + "rewards_train/accuracies": 0.4375, + "rewards_train/chosen": -0.1611127108335495, + "rewards_train/margins": -0.18815578520298004, + "rewards_train/rejected": 0.027043074369430542, + "step": 0 + }, + { + "epoch": 0.01, + "learning_rate": 6.25e-06, + "loss": 0.7329, + "step": 1 + }, + { + "epoch": 0.01, + "logps_train/chosen": -152.52149963378906, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -146.0, + "logps_train/rejected": -145.44671630859375, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.3556646704673767, + "rewards_train/margins": -0.34302489552646875, + "rewards_train/rejected": -0.012639774940907955, + "step": 1 + }, + { + "epoch": 0.01, + "logps_train/chosen": -162.546875, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -115.53173065185547, + "rewards_train/accuracies": 0.46875, + "rewards_train/chosen": -0.0027348892763257027, + "rewards_train/margins": -0.006935393437743187, + "rewards_train/rejected": 0.004200504161417484, + "step": 1 + }, + { + "epoch": 0.01, + "logps_train/chosen": -166.0098876953125, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -137.22930908203125, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.009777255356311798, + "rewards_train/margins": 0.0004109693691134453, + "rewards_train/rejected": -0.010188224725425243, + "step": 1 + }, + { + "epoch": 0.01, + "logps_train/chosen": -136.130859375, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -112.5, + "logps_train/rejected": -112.58856201171875, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.009960669092833996, + "rewards_train/margins": 0.021551736630499363, + "rewards_train/rejected": -0.011591067537665367, + "step": 1 + }, + { + "epoch": 0.02, + "learning_rate": 1.25e-05, + "loss": 0.7722, + "step": 2 + }, + { + "epoch": 0.02, + "logps_train/chosen": -124.05975341796875, + "logps_train/ref_chosen": -124.0, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -109.16248321533203, + "rewards_train/accuracies": 0.34375, + "rewards_train/chosen": 0.018145941197872162, + "rewards_train/margins": -0.004668369889259338, + "rewards_train/rejected": 0.0228143110871315, + "step": 2 + }, + { + "epoch": 0.02, + "logps_train/chosen": -165.04600524902344, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -138.3419952392578, + "rewards_train/accuracies": 0.46875, + "rewards_train/chosen": 0.011854463256895542, + "rewards_train/margins": -0.011222056113183498, + "rewards_train/rejected": 0.02307651937007904, + "step": 2 + }, + { + "epoch": 0.02, + "logps_train/chosen": -142.4432373046875, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -124.0, + "logps_train/rejected": -123.91960144042969, + "rewards_train/accuracies": 0.53125, + "rewards_train/chosen": 0.008557641878724098, + "rewards_train/margins": 0.01167575130239129, + "rewards_train/rejected": -0.0031181094236671925, + "step": 2 + }, + { + "epoch": 0.02, + "logps_train/chosen": -165.10171508789062, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -143.83792114257812, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.02361709624528885, + "rewards_train/margins": 0.017565627116709948, + "rewards_train/rejected": 0.006051469128578901, + "step": 2 + }, + { + "epoch": 0.03, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.6922, + "step": 3 + }, + { + "epoch": 0.03, + "logps_train/chosen": -170.82505798339844, + "logps_train/ref_chosen": -171.0, + "logps_train/ref_rejected": -152.0, + "logps_train/rejected": -151.93087768554688, + "rewards_train/accuracies": 0.53125, + "rewards_train/chosen": 0.013978248462080956, + "rewards_train/margins": -0.005433313548564911, + "rewards_train/rejected": 0.019411562010645866, + "step": 3 + }, + { + "epoch": 0.03, + "logps_train/chosen": -139.39083862304688, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -129.07003784179688, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.020486466586589813, + "rewards_train/margins": 0.003638278692960739, + "rewards_train/rejected": 0.016848187893629074, + "step": 3 + }, + { + "epoch": 0.03, + "logps_train/chosen": -176.51419067382812, + "logps_train/ref_chosen": -177.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -132.66586303710938, + "rewards_train/accuracies": 0.4375, + "rewards_train/chosen": 0.009908124804496765, + "rewards_train/margins": -0.014521919190883636, + "rewards_train/rejected": 0.0244300439953804, + "step": 3 + }, + { + "epoch": 0.03, + "logps_train/chosen": -156.88722229003906, + "logps_train/ref_chosen": -157.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -119.69668579101562, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.019480425864458084, + "rewards_train/margins": -0.0017685145139694214, + "rewards_train/rejected": 0.021248940378427505, + "step": 3 + }, + { + "epoch": 0.05, + "learning_rate": 2.5e-05, + "loss": 0.6956, + "step": 4 + }, + { + "epoch": 0.05, + "logps_train/chosen": -150.92123413085938, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -118.0, + "logps_train/rejected": -117.88097381591797, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.034377966076135635, + "rewards_train/margins": 0.017336303368210793, + "rewards_train/rejected": 0.017041662707924843, + "step": 4 + }, + { + "epoch": 0.05, + "logps_train/chosen": -180.184814453125, + "logps_train/ref_chosen": -181.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -128.09719848632812, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.0534917414188385, + "rewards_train/margins": 0.01155165582895279, + "rewards_train/rejected": 0.04194008558988571, + "step": 4 + }, + { + "epoch": 0.05, + "logps_train/chosen": -143.22642517089844, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -111.45314025878906, + "rewards_train/accuracies": 0.46875, + "rewards_train/chosen": 0.04542411118745804, + "rewards_train/margins": 0.008365228772163391, + "rewards_train/rejected": 0.03705888241529465, + "step": 4 + }, + { + "epoch": 0.05, + "logps_train/chosen": -149.19285583496094, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -132.03118896484375, + "rewards_train/accuracies": 0.40625, + "rewards_train/chosen": 0.05766720697283745, + "rewards_train/margins": 0.0012159161269664764, + "rewards_train/rejected": 0.05645129084587097, + "step": 4 + }, + { + "epoch": 0.06, + "learning_rate": 3.125e-05, + "loss": 0.6893, + "step": 5 + }, + { + "epoch": 0.06, + "logps_train/chosen": -147.8840789794922, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -122.57456970214844, + "rewards_train/accuracies": 0.40625, + "rewards_train/chosen": 0.05192326009273529, + "rewards_train/margins": -0.010029420256614685, + "rewards_train/rejected": 0.061952680349349976, + "step": 5 + }, + { + "epoch": 0.06, + "logps_train/chosen": -170.50418090820312, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -136.78695678710938, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.11383962631225586, + "rewards_train/margins": 0.027300000190734863, + "rewards_train/rejected": 0.086539626121521, + "step": 5 + }, + { + "epoch": 0.06, + "logps_train/chosen": -155.12435913085938, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -116.24176025390625, + "rewards_train/accuracies": 0.53125, + "rewards_train/chosen": 0.08434212952852249, + "rewards_train/margins": 0.012327082455158234, + "rewards_train/rejected": 0.07201504707336426, + "step": 5 + }, + { + "epoch": 0.06, + "logps_train/chosen": -140.93975830078125, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -96.0169677734375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.09296239912509918, + "rewards_train/margins": 0.042803697288036346, + "rewards_train/rejected": 0.050158701837062836, + "step": 5 + }, + { + "epoch": 0.07, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.6848, + "step": 6 + }, + { + "epoch": 0.07, + "logps_train/chosen": -140.3257293701172, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -110.98545837402344, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.1716260313987732, + "rewards_train/margins": 0.07232072949409485, + "rewards_train/rejected": 0.09930530190467834, + "step": 6 + }, + { + "epoch": 0.07, + "logps_train/chosen": -154.02883911132812, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -129.6768798828125, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.20046155154705048, + "rewards_train/margins": 0.06691007316112518, + "rewards_train/rejected": 0.1335514783859253, + "step": 6 + }, + { + "epoch": 0.07, + "logps_train/chosen": -148.28170776367188, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -124.03239440917969, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.17491726577281952, + "rewards_train/margins": 0.04614986479282379, + "rewards_train/rejected": 0.12876740097999573, + "step": 6 + }, + { + "epoch": 0.07, + "logps_train/chosen": -147.0029296875, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -110.0, + "logps_train/rejected": -109.11970520019531, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.14702154695987701, + "rewards_train/margins": 0.04444076120853424, + "rewards_train/rejected": 0.10258078575134277, + "step": 6 + }, + { + "epoch": 0.08, + "learning_rate": 4.375e-05, + "loss": 0.667, + "step": 7 + }, + { + "epoch": 0.08, + "logps_train/chosen": -132.00901794433594, + "logps_train/ref_chosen": -134.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -109.18241882324219, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.226222425699234, + "rewards_train/margins": 0.0596490353345871, + "rewards_train/rejected": 0.1665733903646469, + "step": 7 + }, + { + "epoch": 0.08, + "logps_train/chosen": -176.80740356445312, + "logps_train/ref_chosen": -180.0, + "logps_train/ref_rejected": -171.0, + "logps_train/rejected": -168.07736206054688, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.302267462015152, + "rewards_train/margins": 0.03226765990257263, + "rewards_train/rejected": 0.26999980211257935, + "step": 7 + }, + { + "epoch": 0.08, + "logps_train/chosen": -161.03793334960938, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -107.19043731689453, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.2758948802947998, + "rewards_train/margins": 0.10362982749938965, + "rewards_train/rejected": 0.17226505279541016, + "step": 7 + }, + { + "epoch": 0.08, + "logps_train/chosen": -165.82919311523438, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -145.0, + "logps_train/rejected": -142.60577392578125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.31333446502685547, + "rewards_train/margins": 0.09588496387004852, + "rewards_train/rejected": 0.21744950115680695, + "step": 7 + }, + { + "epoch": 0.09, + "learning_rate": 5e-05, + "loss": 0.6623, + "step": 8 + }, + { + "epoch": 0.09, + "logps_train/chosen": -156.38870239257812, + "logps_train/ref_chosen": -161.0, + "logps_train/ref_rejected": -149.0, + "logps_train/rejected": -145.6343231201172, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.423044353723526, + "rewards_train/margins": 0.11528569459915161, + "rewards_train/rejected": 0.3077586591243744, + "step": 8 + }, + { + "epoch": 0.09, + "logps_train/chosen": -166.0334014892578, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -127.0, + "logps_train/rejected": -124.54536437988281, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.41580069065093994, + "rewards_train/margins": 0.15119647979736328, + "rewards_train/rejected": 0.26460421085357666, + "step": 8 + }, + { + "epoch": 0.09, + "logps_train/chosen": -111.77043151855469, + "logps_train/ref_chosen": -115.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -76.39224243164062, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.2940497100353241, + "rewards_train/margins": 0.1374729573726654, + "rewards_train/rejected": 0.1565767526626587, + "step": 8 + }, + { + "epoch": 0.09, + "logps_train/chosen": -148.72055053710938, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -110.0, + "logps_train/rejected": -107.92207336425781, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.42247551679611206, + "rewards_train/margins": 0.2193703055381775, + "rewards_train/rejected": 0.20310521125793457, + "step": 8 + }, + { + "epoch": 0.1, + "learning_rate": 4.99980726386944e-05, + "loss": 0.6258, + "step": 9 + }, + { + "epoch": 0.1, + "logps_train/chosen": -171.09103393554688, + "logps_train/ref_chosen": -177.0, + "logps_train/ref_rejected": -115.0, + "logps_train/rejected": -112.07714080810547, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.5837191343307495, + "rewards_train/margins": 0.2688255310058594, + "rewards_train/rejected": 0.31489360332489014, + "step": 9 + }, + { + "epoch": 0.1, + "logps_train/chosen": -134.94949340820312, + "logps_train/ref_chosen": -139.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -132.9979705810547, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.4236791431903839, + "rewards_train/margins": 0.02884700894355774, + "rewards_train/rejected": 0.39483213424682617, + "step": 9 + }, + { + "epoch": 0.1, + "logps_train/chosen": -154.36537170410156, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -127.57532501220703, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.5209816694259644, + "rewards_train/margins": 0.10341629385948181, + "rewards_train/rejected": 0.41756537556648254, + "step": 9 + }, + { + "epoch": 0.1, + "logps_train/chosen": -149.526611328125, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -134.63084411621094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.48503243923187256, + "rewards_train/margins": 0.11374184489250183, + "rewards_train/rejected": 0.3712905943393707, + "step": 9 + }, + { + "epoch": 0.11, + "learning_rate": 4.9992290851955325e-05, + "loss": 0.6503, + "step": 10 + }, + { + "epoch": 0.11, + "logps_train/chosen": -153.533935546875, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -126.86518859863281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.5384516716003418, + "rewards_train/margins": 0.16266614198684692, + "rewards_train/rejected": 0.3757855296134949, + "step": 10 + }, + { + "epoch": 0.11, + "logps_train/chosen": -167.98931884765625, + "logps_train/ref_chosen": -175.0, + "logps_train/ref_rejected": -149.0, + "logps_train/rejected": -144.03042602539062, + "rewards_train/accuracies": 0.53125, + "rewards_train/chosen": 0.6893491744995117, + "rewards_train/margins": 0.16035926342010498, + "rewards_train/rejected": 0.5289899110794067, + "step": 10 + }, + { + "epoch": 0.11, + "logps_train/chosen": -155.04034423828125, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -130.8607177734375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.6690115332603455, + "rewards_train/margins": 0.2056686282157898, + "rewards_train/rejected": 0.46334290504455566, + "step": 10 + }, + { + "epoch": 0.11, + "logps_train/chosen": -130.7728271484375, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -109.23792266845703, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.6114521622657776, + "rewards_train/margins": 0.13563531637191772, + "rewards_train/rejected": 0.47581684589385986, + "step": 10 + }, + { + "epoch": 0.13, + "learning_rate": 4.998265553127013e-05, + "loss": 0.6345, + "step": 11 + }, + { + "epoch": 0.13, + "logps_train/chosen": -160.4620819091797, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -125.24628448486328, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.8416832685470581, + "rewards_train/margins": 0.36943644285202026, + "rewards_train/rejected": 0.47224682569503784, + "step": 11 + }, + { + "epoch": 0.13, + "logps_train/chosen": -145.121826171875, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -132.90176391601562, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.646020233631134, + "rewards_train/margins": 0.17330646514892578, + "rewards_train/rejected": 0.47271376848220825, + "step": 11 + }, + { + "epoch": 0.13, + "logps_train/chosen": -151.66098022460938, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -153.0, + "logps_train/rejected": -147.03488159179688, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.7320475578308105, + "rewards_train/margins": 0.1371973752975464, + "rewards_train/rejected": 0.5948501825332642, + "step": 11 + }, + { + "epoch": 0.13, + "logps_train/chosen": -142.59877014160156, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -114.5, + "logps_train/rejected": -109.92486572265625, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.6653189659118652, + "rewards_train/margins": 0.20673072338104248, + "rewards_train/rejected": 0.45858824253082275, + "step": 11 + }, + { + "epoch": 0.14, + "learning_rate": 4.996916816229837e-05, + "loss": 0.6218, + "step": 12 + }, + { + "epoch": 0.14, + "logps_train/chosen": -129.76406860351562, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -106.5, + "logps_train/rejected": -104.38932800292969, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.7065520882606506, + "rewards_train/margins": 0.4883071482181549, + "rewards_train/rejected": 0.21824494004249573, + "step": 12 + }, + { + "epoch": 0.14, + "logps_train/chosen": -149.8843994140625, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -126.18021392822266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.9509168863296509, + "rewards_train/margins": 0.3822464942932129, + "rewards_train/rejected": 0.568670392036438, + "step": 12 + }, + { + "epoch": 0.14, + "logps_train/chosen": -141.76194763183594, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -141.0, + "logps_train/rejected": -134.5399169921875, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.6832780838012695, + "rewards_train/margins": 0.04976832866668701, + "rewards_train/rejected": 0.6335097551345825, + "step": 12 + }, + { + "epoch": 0.14, + "logps_train/chosen": -135.6645965576172, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -115.1833724975586, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.6000440716743469, + "rewards_train/margins": 0.09338122606277466, + "rewards_train/rejected": 0.5066628456115723, + "step": 12 + }, + { + "epoch": 0.15, + "learning_rate": 4.995183082464269e-05, + "loss": 0.6331, + "step": 13 + }, + { + "epoch": 0.15, + "logps_train/chosen": -156.60064697265625, + "logps_train/ref_chosen": -167.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -118.9285888671875, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": 1.034368634223938, + "rewards_train/margins": 0.5694149434566498, + "rewards_train/rejected": 0.4649536907672882, + "step": 13 + }, + { + "epoch": 0.15, + "logps_train/chosen": -173.77572631835938, + "logps_train/ref_chosen": -184.0, + "logps_train/ref_rejected": -163.0, + "logps_train/rejected": -156.2445526123047, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.9569660425186157, + "rewards_train/margins": 0.2895270586013794, + "rewards_train/rejected": 0.6674389839172363, + "step": 13 + }, + { + "epoch": 0.15, + "logps_train/chosen": -151.65478515625, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -121.5, + "logps_train/rejected": -115.70185852050781, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 1.0789567232131958, + "rewards_train/margins": 0.48078322410583496, + "rewards_train/rejected": 0.5981734991073608, + "step": 13 + }, + { + "epoch": 0.15, + "logps_train/chosen": -110.07494354248047, + "logps_train/ref_chosen": -118.5, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -97.74917602539062, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.8415290117263794, + "rewards_train/margins": 0.2806065082550049, + "rewards_train/rejected": 0.5609225034713745, + "step": 13 + }, + { + "epoch": 0.16, + "learning_rate": 4.9930646191528175e-05, + "loss": 0.5759, + "step": 14 + }, + { + "epoch": 0.16, + "logps_train/chosen": -177.0446319580078, + "logps_train/ref_chosen": -187.0, + "logps_train/ref_rejected": -176.0, + "logps_train/rejected": -168.82325744628906, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 1.0113569498062134, + "rewards_train/margins": 0.356975793838501, + "rewards_train/rejected": 0.6543811559677124, + "step": 14 + }, + { + "epoch": 0.16, + "logps_train/chosen": -151.45489501953125, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -143.0, + "logps_train/rejected": -136.52796936035156, + "rewards_train/accuracies": 0.46875, + "rewards_train/chosen": 0.8385694026947021, + "rewards_train/margins": 0.13999921083450317, + "rewards_train/rejected": 0.698570191860199, + "step": 14 + }, + { + "epoch": 0.16, + "logps_train/chosen": -135.15379333496094, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -125.3636703491211, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.8960466980934143, + "rewards_train/margins": 0.2092685103416443, + "rewards_train/rejected": 0.68677818775177, + "step": 14 + }, + { + "epoch": 0.16, + "logps_train/chosen": -146.58248901367188, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -127.0, + "logps_train/rejected": -120.76148223876953, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.9701685905456543, + "rewards_train/margins": 0.3346949815750122, + "rewards_train/rejected": 0.6354736089706421, + "step": 14 + }, + { + "epoch": 0.17, + "learning_rate": 4.9905617529390203e-05, + "loss": 0.6141, + "step": 15 + }, + { + "epoch": 0.17, + "logps_train/chosen": -105.37283325195312, + "logps_train/ref_chosen": -112.5, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -107.2294921875, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.717697262763977, + "rewards_train/margins": 0.21457242965698242, + "rewards_train/rejected": 0.5031248331069946, + "step": 15 + }, + { + "epoch": 0.17, + "logps_train/chosen": -140.62977600097656, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -97.55780792236328, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.7388781309127808, + "rewards_train/margins": 0.35042065382003784, + "rewards_train/rejected": 0.3884574770927429, + "step": 15 + }, + { + "epoch": 0.17, + "logps_train/chosen": -150.58285522460938, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -99.97172546386719, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.8648346662521362, + "rewards_train/margins": 0.408088356256485, + "rewards_train/rejected": 0.45674630999565125, + "step": 15 + }, + { + "epoch": 0.17, + "logps_train/chosen": -141.79176330566406, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -125.314453125, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.8631088733673096, + "rewards_train/margins": 0.32648766040802, + "rewards_train/rejected": 0.5366212129592896, + "step": 15 + }, + { + "epoch": 0.18, + "learning_rate": 4.987674869737077e-05, + "loss": 0.5859, + "step": 16 + }, + { + "epoch": 0.18, + "logps_train/chosen": -136.44541931152344, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -105.30747985839844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.7658092975616455, + "rewards_train/margins": 0.42277851700782776, + "rewards_train/rejected": 0.34303078055381775, + "step": 16 + }, + { + "epoch": 0.18, + "logps_train/chosen": -142.18093872070312, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -122.5, + "logps_train/rejected": -115.7771987915039, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.7697968482971191, + "rewards_train/margins": 0.11177456378936768, + "rewards_train/rejected": 0.6580222845077515, + "step": 16 + }, + { + "epoch": 0.18, + "logps_train/chosen": -154.24090576171875, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -126.07774353027344, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 1.0580410957336426, + "rewards_train/margins": 0.3286939859390259, + "rewards_train/rejected": 0.7293471097946167, + "step": 16 + }, + { + "epoch": 0.18, + "logps_train/chosen": -152.1510467529297, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -104.72694396972656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.752070426940918, + "rewards_train/margins": 0.3003508448600769, + "rewards_train/rejected": 0.45171958208084106, + "step": 16 + }, + { + "epoch": 0.19, + "learning_rate": 4.984404414672346e-05, + "loss": 0.6126, + "step": 17 + }, + { + "epoch": 0.19, + "logps_train/chosen": -142.31809997558594, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -126.29586791992188, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.7054945230484009, + "rewards_train/margins": 0.10695618391036987, + "rewards_train/rejected": 0.598538339138031, + "step": 17 + }, + { + "epoch": 0.19, + "logps_train/chosen": -89.5583724975586, + "logps_train/ref_chosen": -94.5, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -88.00601959228516, + "rewards_train/accuracies": 0.46875, + "rewards_train/chosen": 0.49502912163734436, + "rewards_train/margins": 0.10490813851356506, + "rewards_train/rejected": 0.3901209831237793, + "step": 17 + }, + { + "epoch": 0.19, + "logps_train/chosen": -141.4034881591797, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -95.42372131347656, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.9821124076843262, + "rewards_train/margins": 0.6551483273506165, + "rewards_train/rejected": 0.3269640803337097, + "step": 17 + }, + { + "epoch": 0.19, + "logps_train/chosen": -126.50776672363281, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -115.35845947265625, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.8917524814605713, + "rewards_train/margins": 0.34312617778778076, + "rewards_train/rejected": 0.5486263036727905, + "step": 17 + }, + { + "epoch": 0.21, + "learning_rate": 4.980750892012711e-05, + "loss": 0.616, + "step": 18 + }, + { + "epoch": 0.21, + "logps_train/chosen": -161.98281860351562, + "logps_train/ref_chosen": -171.0, + "logps_train/ref_rejected": -135.0, + "logps_train/rejected": -129.00392150878906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.9355075359344482, + "rewards_train/margins": 0.31050950288772583, + "rewards_train/rejected": 0.6249980330467224, + "step": 18 + }, + { + "epoch": 0.21, + "logps_train/chosen": -109.2407455444336, + "logps_train/ref_chosen": -115.5, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -87.75328063964844, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.634067952632904, + "rewards_train/margins": 0.21584081649780273, + "rewards_train/rejected": 0.4182271361351013, + "step": 18 + }, + { + "epoch": 0.21, + "logps_train/chosen": -125.10116577148438, + "logps_train/ref_chosen": -133.0, + "logps_train/ref_rejected": -113.5, + "logps_train/rejected": -109.39839172363281, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.7695701122283936, + "rewards_train/margins": 0.3268895447254181, + "rewards_train/rejected": 0.44268056750297546, + "step": 18 + }, + { + "epoch": 0.21, + "logps_train/chosen": -154.5267791748047, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -111.5, + "logps_train/rejected": -105.82466125488281, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.8263745307922363, + "rewards_train/margins": 0.2686058282852173, + "rewards_train/rejected": 0.557768702507019, + "step": 18 + }, + { + "epoch": 0.22, + "learning_rate": 4.976714865090827e-05, + "loss": 0.6399, + "step": 19 + }, + { + "epoch": 0.22, + "logps_train/chosen": -170.05242919921875, + "logps_train/ref_chosen": -180.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -133.59002685546875, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 1.0042303800582886, + "rewards_train/margins": 0.6750620007514954, + "rewards_train/rejected": 0.3291683793067932, + "step": 19 + }, + { + "epoch": 0.22, + "logps_train/chosen": -138.31671142578125, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -90.01516723632812, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.5841480493545532, + "rewards_train/margins": 0.30216851830482483, + "rewards_train/rejected": 0.2819795310497284, + "step": 19 + }, + { + "epoch": 0.22, + "logps_train/chosen": -127.0712890625, + "logps_train/ref_chosen": -134.0, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -87.40908813476562, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.6961421966552734, + "rewards_train/margins": 0.25760751962661743, + "rewards_train/rejected": 0.438534677028656, + "step": 19 + }, + { + "epoch": 0.22, + "logps_train/chosen": -152.77476501464844, + "logps_train/ref_chosen": -161.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -118.90397644042969, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.7926408648490906, + "rewards_train/margins": 0.37669146060943604, + "rewards_train/rejected": 0.41594940423965454, + "step": 19 + }, + { + "epoch": 0.23, + "learning_rate": 4.972296956217265e-05, + "loss": 0.5838, + "step": 20 + }, + { + "epoch": 0.23, + "logps_train/chosen": -148.97178649902344, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -102.91510009765625, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.6525766849517822, + "rewards_train/margins": 0.4128361642360687, + "rewards_train/rejected": 0.2397405207157135, + "step": 20 + }, + { + "epoch": 0.23, + "logps_train/chosen": -188.7393035888672, + "logps_train/ref_chosen": -201.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -136.55224609375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 1.208785057067871, + "rewards_train/margins": 1.0652238130569458, + "rewards_train/rejected": 0.1435612440109253, + "step": 20 + }, + { + "epoch": 0.23, + "logps_train/chosen": -136.43887329101562, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -106.124755859375, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.8320896625518799, + "rewards_train/margins": 0.5457369983196259, + "rewards_train/rejected": 0.28635266423225403, + "step": 20 + }, + { + "epoch": 0.23, + "logps_train/chosen": -137.87374877929688, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -103.64527893066406, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.6388455629348755, + "rewards_train/margins": 0.2835492193698883, + "rewards_train/rejected": 0.3552963435649872, + "step": 20 + }, + { + "epoch": 0.24, + "learning_rate": 4.967497846584552e-05, + "loss": 0.5307, + "step": 21 + }, + { + "epoch": 0.24, + "logps_train/chosen": -127.54042053222656, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -93.16413116455078, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.4815409779548645, + "rewards_train/margins": 0.36279795318841934, + "rewards_train/rejected": 0.11874302476644516, + "step": 21 + }, + { + "epoch": 0.24, + "logps_train/chosen": -145.63665771484375, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -127.99638366699219, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.3527398109436035, + "rewards_train/margins": 0.19827693700790405, + "rewards_train/rejected": 0.15446287393569946, + "step": 21 + }, + { + "epoch": 0.24, + "logps_train/chosen": -143.23275756835938, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -99.59611511230469, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.8170070648193359, + "rewards_train/margins": 0.7010326832532883, + "rewards_train/rejected": 0.11597438156604767, + "step": 21 + }, + { + "epoch": 0.24, + "logps_train/chosen": -134.1798095703125, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -103.10169982910156, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.8344590663909912, + "rewards_train/margins": 0.4115239083766937, + "rewards_train/rejected": 0.4229351580142975, + "step": 21 + }, + { + "epoch": 0.25, + "learning_rate": 4.962318276162148e-05, + "loss": 0.5787, + "step": 22 + }, + { + "epoch": 0.25, + "logps_train/chosen": -169.95230102539062, + "logps_train/ref_chosen": -180.0, + "logps_train/ref_rejected": -152.0, + "logps_train/rejected": -147.85079956054688, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 1.0285433530807495, + "rewards_train/margins": 0.5781982243061066, + "rewards_train/rejected": 0.45034512877464294, + "step": 22 + }, + { + "epoch": 0.25, + "logps_train/chosen": -159.36582946777344, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -119.0, + "logps_train/rejected": -116.03822326660156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.9459124803543091, + "rewards_train/margins": 0.6589144468307495, + "rewards_train/rejected": 0.28699803352355957, + "step": 22 + }, + { + "epoch": 0.25, + "logps_train/chosen": -140.4551544189453, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -113.26671600341797, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.6787030696868896, + "rewards_train/margins": 0.34983301162719727, + "rewards_train/rejected": 0.3288700580596924, + "step": 22 + }, + { + "epoch": 0.25, + "logps_train/chosen": -182.3084259033203, + "logps_train/ref_chosen": -191.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -122.35601806640625, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.8974780440330505, + "rewards_train/margins": 0.5072007179260254, + "rewards_train/rejected": 0.39027732610702515, + "step": 22 + }, + { + "epoch": 0.26, + "learning_rate": 4.9567590435823383e-05, + "loss": 0.5552, + "step": 23 + }, + { + "epoch": 0.26, + "logps_train/chosen": -123.2986068725586, + "logps_train/ref_chosen": -128.0, + "logps_train/ref_rejected": -114.5, + "logps_train/rejected": -111.90111541748047, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.4740452170372009, + "rewards_train/margins": 0.20946910977363586, + "rewards_train/rejected": 0.26457610726356506, + "step": 23 + }, + { + "epoch": 0.26, + "logps_train/chosen": -143.50405883789062, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -114.5, + "logps_train/rejected": -110.6120376586914, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.8933444023132324, + "rewards_train/margins": 0.49614936113357544, + "rewards_train/rejected": 0.397195041179657, + "step": 23 + }, + { + "epoch": 0.26, + "logps_train/chosen": -155.1194610595703, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -121.85575866699219, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 1.0275070667266846, + "rewards_train/margins": 0.6742154359817505, + "rewards_train/rejected": 0.3532916307449341, + "step": 23 + }, + { + "epoch": 0.26, + "logps_train/chosen": -140.42703247070312, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -124.0, + "logps_train/rejected": -122.62283325195312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.627119779586792, + "rewards_train/margins": 0.5090075731277466, + "rewards_train/rejected": 0.11811220645904541, + "step": 23 + }, + { + "epoch": 0.27, + "learning_rate": 4.950821006017107e-05, + "loss": 0.5787, + "step": 24 + }, + { + "epoch": 0.27, + "logps_train/chosen": -136.32363891601562, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -101.22764587402344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.6326753497123718, + "rewards_train/margins": 0.4577834904193878, + "rewards_train/rejected": 0.174891859292984, + "step": 24 + }, + { + "epoch": 0.27, + "logps_train/chosen": -157.54037475585938, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -126.21077728271484, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.776432454586029, + "rewards_train/margins": 0.4170653820037842, + "rewards_train/rejected": 0.3593670725822449, + "step": 24 + }, + { + "epoch": 0.27, + "logps_train/chosen": -107.36483001708984, + "logps_train/ref_chosen": -112.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -82.24800109863281, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.5118322372436523, + "rewards_train/margins": 0.4391593262553215, + "rewards_train/rejected": 0.07267291098833084, + "step": 24 + }, + { + "epoch": 0.27, + "logps_train/chosen": -137.41708374023438, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -112.7099609375, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.5886627435684204, + "rewards_train/margins": 0.302530437707901, + "rewards_train/rejected": 0.2861323058605194, + "step": 24 + }, + { + "epoch": 0.29, + "learning_rate": 4.944505079045958e-05, + "loss": 0.6016, + "step": 25 + }, + { + "epoch": 0.29, + "logps_train/chosen": -157.17239379882812, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -100.15357971191406, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.8933084011077881, + "rewards_train/margins": 0.4814213514328003, + "rewards_train/rejected": 0.4118870496749878, + "step": 25 + }, + { + "epoch": 0.29, + "logps_train/chosen": -128.82952880859375, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -114.12107849121094, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.7554748058319092, + "rewards_train/margins": 0.49004340171813965, + "rewards_train/rejected": 0.26543140411376953, + "step": 25 + }, + { + "epoch": 0.29, + "logps_train/chosen": -174.43142700195312, + "logps_train/ref_chosen": -181.0, + "logps_train/ref_rejected": -161.0, + "logps_train/rejected": -156.95753479003906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.6013880968093872, + "rewards_train/margins": 0.15554097294807434, + "rewards_train/rejected": 0.44584712386131287, + "step": 25 + }, + { + "epoch": 0.29, + "logps_train/chosen": -179.6756591796875, + "logps_train/ref_chosen": -190.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -136.37588500976562, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": 0.9904413819313049, + "rewards_train/margins": 0.6243195533752441, + "rewards_train/rejected": 0.3661218285560608, + "step": 25 + }, + { + "epoch": 0.3, + "learning_rate": 4.9378122365147536e-05, + "loss": 0.5856, + "step": 26 + }, + { + "epoch": 0.3, + "logps_train/chosen": -160.62625122070312, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -113.5, + "logps_train/rejected": -112.54586029052734, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.568233847618103, + "rewards_train/margins": 0.46276117116212845, + "rewards_train/rejected": 0.10547267645597458, + "step": 26 + }, + { + "epoch": 0.3, + "logps_train/chosen": -180.06460571289062, + "logps_train/ref_chosen": -188.0, + "logps_train/ref_rejected": -170.0, + "logps_train/rejected": -165.44969177246094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.8118870854377747, + "rewards_train/margins": 0.4134972393512726, + "rewards_train/rejected": 0.3983898460865021, + "step": 26 + }, + { + "epoch": 0.3, + "logps_train/chosen": -165.16346740722656, + "logps_train/ref_chosen": -173.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -125.14801025390625, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.733361005783081, + "rewards_train/margins": 0.4440605640411377, + "rewards_train/rejected": 0.28930044174194336, + "step": 26 + }, + { + "epoch": 0.3, + "logps_train/chosen": -149.80445861816406, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -172.0, + "logps_train/rejected": -168.4322967529297, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.5533437728881836, + "rewards_train/margins": 0.24852675199508667, + "rewards_train/rejected": 0.3048170208930969, + "step": 26 + }, + { + "epoch": 0.31, + "learning_rate": 4.9307435103855507e-05, + "loss": 0.6072, + "step": 27 + }, + { + "epoch": 0.31, + "logps_train/chosen": -140.88067626953125, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -111.68878173828125, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.4423023760318756, + "rewards_train/margins": 0.2377922534942627, + "rewards_train/rejected": 0.20451012253761292, + "step": 27 + }, + { + "epoch": 0.31, + "logps_train/chosen": -127.85845947265625, + "logps_train/ref_chosen": -131.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -104.07398986816406, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.3409123420715332, + "rewards_train/margins": 0.4412309601902962, + "rewards_train/rejected": -0.10031861811876297, + "step": 27 + }, + { + "epoch": 0.31, + "logps_train/chosen": -124.84896087646484, + "logps_train/ref_chosen": -129.0, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -100.35652160644531, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.36427372694015503, + "rewards_train/margins": 0.3226316273212433, + "rewards_train/rejected": 0.04164209961891174, + "step": 27 + }, + { + "epoch": 0.31, + "logps_train/chosen": -147.45672607421875, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -115.0, + "logps_train/rejected": -114.71719360351562, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.6011910438537598, + "rewards_train/margins": 0.5735943987965584, + "rewards_train/rejected": 0.027596645057201385, + "step": 27 + }, + { + "epoch": 0.32, + "learning_rate": 4.923299990577488e-05, + "loss": 0.617, + "step": 28 + }, + { + "epoch": 0.32, + "logps_train/chosen": -169.756103515625, + "logps_train/ref_chosen": -174.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -124.39460754394531, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.40368735790252686, + "rewards_train/margins": 0.5618972331285477, + "rewards_train/rejected": -0.1582098752260208, + "step": 28 + }, + { + "epoch": 0.32, + "logps_train/chosen": -123.26806640625, + "logps_train/ref_chosen": -127.0, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -93.16129302978516, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.394532173871994, + "rewards_train/margins": 0.6385668218135834, + "rewards_train/rejected": -0.24403464794158936, + "step": 28 + }, + { + "epoch": 0.32, + "logps_train/chosen": -162.7403106689453, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -110.5, + "logps_train/rejected": -112.55574035644531, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": 0.8793867826461792, + "rewards_train/margins": 1.0994132608175278, + "rewards_train/rejected": -0.22002647817134857, + "step": 28 + }, + { + "epoch": 0.32, + "logps_train/chosen": -108.07917785644531, + "logps_train/ref_chosen": -112.5, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -100.18183135986328, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.4343189299106598, + "rewards_train/margins": 0.35391509532928467, + "rewards_train/rejected": 0.08040383458137512, + "step": 28 + }, + { + "epoch": 0.33, + "learning_rate": 4.9154828247987275e-05, + "loss": 0.5549, + "step": 29 + }, + { + "epoch": 0.33, + "logps_train/chosen": -152.16064453125, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -132.60389709472656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.7116701602935791, + "rewards_train/margins": 0.6406141370534897, + "rewards_train/rejected": 0.07105602324008942, + "step": 29 + }, + { + "epoch": 0.33, + "logps_train/chosen": -150.0108642578125, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -124.35476684570312, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.49451807141304016, + "rewards_train/margins": 0.45050277933478355, + "rewards_train/rejected": 0.04401529207825661, + "step": 29 + }, + { + "epoch": 0.33, + "logps_train/chosen": -139.51673889160156, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -123.01943969726562, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.6350452899932861, + "rewards_train/margins": 0.6675560399889946, + "rewards_train/rejected": -0.032510749995708466, + "step": 29 + }, + { + "epoch": 0.33, + "logps_train/chosen": -151.99301147460938, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -137.28713989257812, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.34091004729270935, + "rewards_train/margins": 0.29519854485988617, + "rewards_train/rejected": 0.04571150243282318, + "step": 29 + }, + { + "epoch": 0.34, + "learning_rate": 4.907293218369499e-05, + "loss": 0.5685, + "step": 30 + }, + { + "epoch": 0.34, + "logps_train/chosen": -171.75759887695312, + "logps_train/ref_chosen": -179.0, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -101.5311508178711, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.736739993095398, + "rewards_train/margins": 0.8000111803412437, + "rewards_train/rejected": -0.0632711872458458, + "step": 30 + }, + { + "epoch": 0.34, + "logps_train/chosen": -150.02069091796875, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -147.0, + "logps_train/rejected": -146.40304565429688, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.6478447914123535, + "rewards_train/margins": 0.6080716103315353, + "rewards_train/rejected": 0.039773181080818176, + "step": 30 + }, + { + "epoch": 0.34, + "logps_train/chosen": -138.49688720703125, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -118.42485809326172, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.20158207416534424, + "rewards_train/margins": 0.424081951379776, + "rewards_train/rejected": -0.22249987721443176, + "step": 30 + }, + { + "epoch": 0.34, + "logps_train/chosen": -153.21910095214844, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -131.95118713378906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.5112931728363037, + "rewards_train/margins": 0.5308257788419724, + "rewards_train/rejected": -0.01953260600566864, + "step": 30 + }, + { + "epoch": 0.35, + "learning_rate": 4.898732434036244e-05, + "loss": 0.5647, + "step": 31 + }, + { + "epoch": 0.35, + "logps_train/chosen": -110.5118408203125, + "logps_train/ref_chosen": -111.0, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -105.46853637695312, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.026843346655368805, + "rewards_train/margins": 0.23717240244150162, + "rewards_train/rejected": -0.2103290557861328, + "step": 31 + }, + { + "epoch": 0.35, + "logps_train/chosen": -170.99050903320312, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -158.0, + "logps_train/rejected": -157.1632080078125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.5195033550262451, + "rewards_train/margins": 0.4441733881831169, + "rewards_train/rejected": 0.0753299668431282, + "step": 31 + }, + { + "epoch": 0.35, + "logps_train/chosen": -159.28607177734375, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -138.23997497558594, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.43447864055633545, + "rewards_train/margins": 0.6162890195846558, + "rewards_train/rejected": -0.1818103790283203, + "step": 31 + }, + { + "epoch": 0.35, + "logps_train/chosen": -134.58811950683594, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -132.0167236328125, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.36775100231170654, + "rewards_train/margins": 0.45528803020715714, + "rewards_train/rejected": -0.08753702789545059, + "step": 31 + }, + { + "epoch": 0.37, + "learning_rate": 4.889801791776921e-05, + "loss": 0.6137, + "step": 32 + }, + { + "epoch": 0.37, + "logps_train/chosen": -135.58267211914062, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -115.02655029296875, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.2093113362789154, + "rewards_train/margins": 0.3869669735431671, + "rewards_train/rejected": -0.1776556372642517, + "step": 32 + }, + { + "epoch": 0.37, + "logps_train/chosen": -169.59982299804688, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -144.22299194335938, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.26462656259536743, + "rewards_train/margins": 0.3123156949877739, + "rewards_train/rejected": -0.047689132392406464, + "step": 32 + }, + { + "epoch": 0.37, + "logps_train/chosen": -146.15602111816406, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -115.34562683105469, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.4857647120952606, + "rewards_train/margins": 0.7841946184635162, + "rewards_train/rejected": -0.2984299063682556, + "step": 32 + }, + { + "epoch": 0.37, + "logps_train/chosen": -139.1509552001953, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -147.0, + "logps_train/rejected": -151.7754364013672, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.40150654315948486, + "rewards_train/margins": 0.8612766265869141, + "rewards_train/rejected": -0.4597700834274292, + "step": 32 + }, + { + "epoch": 0.38, + "learning_rate": 4.880502668597475e-05, + "loss": 0.6015, + "step": 33 + }, + { + "epoch": 0.38, + "logps_train/chosen": -164.56207275390625, + "logps_train/ref_chosen": -171.0, + "logps_train/ref_rejected": -149.0, + "logps_train/rejected": -149.7305145263672, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.6187927722930908, + "rewards_train/margins": 0.6844227313995361, + "rewards_train/rejected": -0.06562995910644531, + "step": 33 + }, + { + "epoch": 0.38, + "logps_train/chosen": -157.14566040039062, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -107.5, + "logps_train/rejected": -111.04812622070312, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": 0.4366801977157593, + "rewards_train/margins": 0.8013320863246918, + "rewards_train/rejected": -0.3646518886089325, + "step": 33 + }, + { + "epoch": 0.38, + "logps_train/chosen": -159.72222900390625, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -166.0, + "logps_train/rejected": -166.43585205078125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.3263116478919983, + "rewards_train/margins": 0.37236351892352104, + "rewards_train/rejected": -0.04605187103152275, + "step": 33 + }, + { + "epoch": 0.38, + "logps_train/chosen": -149.86741638183594, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -103.66566467285156, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.577516496181488, + "rewards_train/margins": 1.0174227356910706, + "rewards_train/rejected": -0.4399062395095825, + "step": 33 + }, + { + "epoch": 0.39, + "learning_rate": 4.870836498319523e-05, + "loss": 0.5305, + "step": 34 + }, + { + "epoch": 0.39, + "logps_train/chosen": -161.78330993652344, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -130.53228759765625, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.26034095883369446, + "rewards_train/margins": 0.532904863357544, + "rewards_train/rejected": -0.2725639045238495, + "step": 34 + }, + { + "epoch": 0.39, + "logps_train/chosen": -167.86843872070312, + "logps_train/ref_chosen": -173.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -139.84494018554688, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": 0.5324914455413818, + "rewards_train/margins": 0.7851487994194031, + "rewards_train/rejected": -0.25265735387802124, + "step": 34 + }, + { + "epoch": 0.39, + "logps_train/chosen": -147.22430419921875, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -123.09727478027344, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.3912162780761719, + "rewards_train/margins": 0.3929597958922386, + "rewards_train/rejected": -0.001743517816066742, + "step": 34 + }, + { + "epoch": 0.39, + "logps_train/chosen": -147.94500732421875, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -137.91192626953125, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.15667131543159485, + "rewards_train/margins": 0.4005984365940094, + "rewards_train/rejected": -0.24392712116241455, + "step": 34 + }, + { + "epoch": 0.4, + "learning_rate": 4.86080477135927e-05, + "loss": 0.5872, + "step": 35 + }, + { + "epoch": 0.4, + "logps_train/chosen": -149.14828491210938, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -131.72792053222656, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.3583168387413025, + "rewards_train/margins": 0.661382257938385, + "rewards_train/rejected": -0.3030654191970825, + "step": 35 + }, + { + "epoch": 0.4, + "logps_train/chosen": -173.38485717773438, + "logps_train/ref_chosen": -179.0, + "logps_train/ref_rejected": -127.5, + "logps_train/rejected": -130.51498413085938, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.5212795734405518, + "rewards_train/margins": 0.8276603519916534, + "rewards_train/rejected": -0.3063807785511017, + "step": 35 + }, + { + "epoch": 0.4, + "logps_train/chosen": -153.11077880859375, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -110.5, + "logps_train/rejected": -115.9320068359375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.3146057426929474, + "rewards_train/margins": 0.8578799068927765, + "rewards_train/rejected": -0.5432741641998291, + "step": 35 + }, + { + "epoch": 0.4, + "logps_train/chosen": -172.79013061523438, + "logps_train/ref_chosen": -175.0, + "logps_train/ref_rejected": -155.0, + "logps_train/rejected": -157.91104125976562, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.24540139734745026, + "rewards_train/margins": 0.5402161329984665, + "rewards_train/rejected": -0.29481473565101624, + "step": 35 + }, + { + "epoch": 0.41, + "learning_rate": 4.850409034497704e-05, + "loss": 0.5427, + "step": 36 + }, + { + "epoch": 0.41, + "logps_train/chosen": -147.08950805664062, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -127.0, + "logps_train/rejected": -132.83718872070312, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.3379243314266205, + "rewards_train/margins": 0.9198374450206757, + "rewards_train/rejected": -0.5819131135940552, + "step": 36 + }, + { + "epoch": 0.41, + "logps_train/chosen": -161.61907958984375, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -119.5, + "logps_train/rejected": -127.36285400390625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.3235408067703247, + "rewards_train/margins": 1.1210931539535522, + "rewards_train/rejected": -0.7975523471832275, + "step": 36 + }, + { + "epoch": 0.41, + "logps_train/chosen": -131.70205688476562, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -119.81352996826172, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.042293842881917953, + "rewards_train/margins": 0.5976700149476528, + "rewards_train/rejected": -0.5553761720657349, + "step": 36 + }, + { + "epoch": 0.41, + "logps_train/chosen": -125.20751953125, + "logps_train/ref_chosen": -126.0, + "logps_train/ref_rejected": -93.5, + "logps_train/rejected": -101.25257873535156, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": 0.11919025331735611, + "rewards_train/margins": 0.896144725382328, + "rewards_train/rejected": -0.7769544720649719, + "step": 36 + }, + { + "epoch": 0.42, + "learning_rate": 4.839650890642104e-05, + "loss": 0.5169, + "step": 37 + }, + { + "epoch": 0.42, + "logps_train/chosen": -121.39013671875, + "logps_train/ref_chosen": -120.0, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -102.62464141845703, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.1606450080871582, + "rewards_train/margins": 0.42508548498153687, + "rewards_train/rejected": -0.5857304930686951, + "step": 37 + }, + { + "epoch": 0.42, + "logps_train/chosen": -125.78103637695312, + "logps_train/ref_chosen": -126.5, + "logps_train/ref_rejected": -108.0, + "logps_train/rejected": -111.85375213623047, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.0980682522058487, + "rewards_train/margins": 0.46781833469867706, + "rewards_train/rejected": -0.36975008249282837, + "step": 37 + }, + { + "epoch": 0.42, + "logps_train/chosen": -145.85195922851562, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -128.85755920410156, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.18382151424884796, + "rewards_train/margins": 0.7501192837953568, + "rewards_train/rejected": -0.5662977695465088, + "step": 37 + }, + { + "epoch": 0.42, + "logps_train/chosen": -120.22520446777344, + "logps_train/ref_chosen": -122.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -94.60220336914062, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": 0.18724526464939117, + "rewards_train/margins": 0.8372600227594376, + "rewards_train/rejected": -0.6500147581100464, + "step": 37 + }, + { + "epoch": 0.43, + "learning_rate": 4.828531998578885e-05, + "loss": 0.5599, + "step": 38 + }, + { + "epoch": 0.43, + "logps_train/chosen": -182.186279296875, + "logps_train/ref_chosen": -186.0, + "logps_train/ref_rejected": -153.0, + "logps_train/rejected": -157.68670654296875, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.349829763174057, + "rewards_train/margins": 0.827485203742981, + "rewards_train/rejected": -0.47765544056892395, + "step": 38 + }, + { + "epoch": 0.43, + "logps_train/chosen": -148.2696075439453, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -132.3610382080078, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.011784523725509644, + "rewards_train/margins": 0.44945111870765686, + "rewards_train/rejected": -0.4376665949821472, + "step": 38 + }, + { + "epoch": 0.43, + "logps_train/chosen": -148.4195556640625, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -126.39881134033203, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.09119848161935806, + "rewards_train/margins": 0.628833569586277, + "rewards_train/rejected": -0.537635087966919, + "step": 38 + }, + { + "epoch": 0.43, + "logps_train/chosen": -154.6384735107422, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -131.3975067138672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4014357924461365, + "rewards_train/margins": 0.9915284514427185, + "rewards_train/rejected": -0.590092658996582, + "step": 38 + }, + { + "epoch": 0.45, + "learning_rate": 4.8170540727178326e-05, + "loss": 0.5291, + "step": 39 + }, + { + "epoch": 0.45, + "logps_train/chosen": -151.14736938476562, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -128.07752990722656, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.21706223487854004, + "rewards_train/margins": 0.8728617429733276, + "rewards_train/rejected": -0.6557995080947876, + "step": 39 + }, + { + "epoch": 0.45, + "logps_train/chosen": -175.7981719970703, + "logps_train/ref_chosen": -178.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -152.55126953125, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.25172606110572815, + "rewards_train/margins": 1.094938188791275, + "rewards_train/rejected": -0.8432121276855469, + "step": 39 + }, + { + "epoch": 0.45, + "logps_train/chosen": -142.94915771484375, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -126.44970703125, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.24932152032852173, + "rewards_train/margins": 0.8067926168441772, + "rewards_train/rejected": -0.5574710965156555, + "step": 39 + }, + { + "epoch": 0.45, + "logps_train/chosen": -175.91030883789062, + "logps_train/ref_chosen": -179.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -129.6051025390625, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.29153841733932495, + "rewards_train/margins": 0.695236325263977, + "rewards_train/rejected": -0.4036979079246521, + "step": 39 + }, + { + "epoch": 0.46, + "learning_rate": 4.805218882827761e-05, + "loss": 0.5147, + "step": 40 + }, + { + "epoch": 0.46, + "logps_train/chosen": -139.74484252929688, + "logps_train/ref_chosen": -139.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -110.22254180908203, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.10924912989139557, + "rewards_train/margins": 0.5274574607610703, + "rewards_train/rejected": -0.6367065906524658, + "step": 40 + }, + { + "epoch": 0.46, + "logps_train/chosen": -160.38890075683594, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -146.0457000732422, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": -0.15876291692256927, + "rewards_train/margins": 0.4254945069551468, + "rewards_train/rejected": -0.5842574238777161, + "step": 40 + }, + { + "epoch": 0.46, + "logps_train/chosen": -153.27549743652344, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -135.46401977539062, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.12430575489997864, + "rewards_train/margins": 0.5440479218959808, + "rewards_train/rejected": -0.4197421669960022, + "step": 40 + }, + { + "epoch": 0.46, + "logps_train/chosen": -176.70468139648438, + "logps_train/ref_chosen": -178.0, + "logps_train/ref_rejected": -119.0, + "logps_train/rejected": -124.0498275756836, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": 0.16253966093063354, + "rewards_train/margins": 0.6728933453559875, + "rewards_train/rejected": -0.510353684425354, + "step": 40 + }, + { + "epoch": 0.47, + "learning_rate": 4.793028253763633e-05, + "loss": 0.5968, + "step": 41 + }, + { + "epoch": 0.47, + "logps_train/chosen": -171.22650146484375, + "logps_train/ref_chosen": -174.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -146.68588256835938, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.2716864347457886, + "rewards_train/margins": 0.5631259977817535, + "rewards_train/rejected": -0.29143956303596497, + "step": 41 + }, + { + "epoch": 0.47, + "logps_train/chosen": -125.95775604248047, + "logps_train/ref_chosen": -124.5, + "logps_train/ref_rejected": -99.5, + "logps_train/rejected": -107.48272705078125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.14445701241493225, + "rewards_train/margins": 0.6441479623317719, + "rewards_train/rejected": -0.7886049747467041, + "step": 41 + }, + { + "epoch": 0.47, + "logps_train/chosen": -126.57373046875, + "logps_train/ref_chosen": -128.0, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -87.17964935302734, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.19477389752864838, + "rewards_train/margins": 0.7732878178358078, + "rewards_train/rejected": -0.5785139203071594, + "step": 41 + }, + { + "epoch": 0.47, + "logps_train/chosen": -148.91766357421875, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -95.6549072265625, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.44352418184280396, + "rewards_train/margins": 0.24266964197158813, + "rewards_train/rejected": -0.6861938238143921, + "step": 41 + }, + { + "epoch": 0.48, + "learning_rate": 4.780484065185188e-05, + "loss": 0.6466, + "step": 42 + }, + { + "epoch": 0.48, + "logps_train/chosen": -115.9168930053711, + "logps_train/ref_chosen": -114.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -87.66254425048828, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.21981437504291534, + "rewards_train/margins": 0.599864050745964, + "rewards_train/rejected": -0.8196784257888794, + "step": 42 + }, + { + "epoch": 0.48, + "logps_train/chosen": -184.0168914794922, + "logps_train/ref_chosen": -186.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -145.45132446289062, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.2588578462600708, + "rewards_train/margins": 0.7897325158119202, + "rewards_train/rejected": -0.5308746695518494, + "step": 42 + }, + { + "epoch": 0.48, + "logps_train/chosen": -124.75608825683594, + "logps_train/ref_chosen": -125.5, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -101.11264038085938, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": 0.04958619177341461, + "rewards_train/margins": 0.9850691705942154, + "rewards_train/rejected": -0.9354829788208008, + "step": 42 + }, + { + "epoch": 0.48, + "logps_train/chosen": -163.1077880859375, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -129.63858032226562, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.006554238498210907, + "rewards_train/margins": 0.6211441680788994, + "rewards_train/rejected": -0.6145899295806885, + "step": 42 + }, + { + "epoch": 0.49, + "learning_rate": 4.767588251267121e-05, + "loss": 0.5648, + "step": 43 + }, + { + "epoch": 0.49, + "logps_train/chosen": -151.1636962890625, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -102.26740264892578, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.17907707393169403, + "rewards_train/margins": 1.0967354625463486, + "rewards_train/rejected": -0.9176583886146545, + "step": 43 + }, + { + "epoch": 0.49, + "logps_train/chosen": -154.32647705078125, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -118.2194595336914, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.08923967182636261, + "rewards_train/margins": 0.5269937962293625, + "rewards_train/rejected": -0.6162334680557251, + "step": 43 + }, + { + "epoch": 0.49, + "logps_train/chosen": -132.7676544189453, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -116.9314193725586, + "rewards_train/accuracies": 0.53125, + "rewards_train/chosen": -0.3089926540851593, + "rewards_train/margins": 0.21969613432884216, + "rewards_train/rejected": -0.5286887884140015, + "step": 43 + }, + { + "epoch": 0.49, + "logps_train/chosen": -158.78457641601562, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -104.1181640625, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": -0.05724158138036728, + "rewards_train/margins": 0.9021327868103981, + "rewards_train/rejected": -0.9593743681907654, + "step": 43 + }, + { + "epoch": 0.5, + "learning_rate": 4.754342800400852e-05, + "loss": 0.5647, + "step": 44 + }, + { + "epoch": 0.5, + "logps_train/chosen": -145.93472290039062, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -119.5, + "logps_train/rejected": -123.93000030517578, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.3344568610191345, + "rewards_train/margins": 0.7847806811332703, + "rewards_train/rejected": -0.45032382011413574, + "step": 44 + }, + { + "epoch": 0.5, + "logps_train/chosen": -140.81419372558594, + "logps_train/ref_chosen": -141.0, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -109.78305053710938, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.007155388593673706, + "rewards_train/margins": 0.5099476277828217, + "rewards_train/rejected": -0.502792239189148, + "step": 44 + }, + { + "epoch": 0.5, + "logps_train/chosen": -151.26019287109375, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -121.91029357910156, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.026507869362831116, + "rewards_train/margins": 0.7881536334753036, + "rewards_train/rejected": -0.8146615028381348, + "step": 44 + }, + { + "epoch": 0.5, + "logps_train/chosen": -157.45396423339844, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -120.49565887451172, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.14910393953323364, + "rewards_train/margins": 0.5894901752471924, + "rewards_train/rejected": -0.44038623571395874, + "step": 44 + }, + { + "epoch": 0.51, + "learning_rate": 4.7407497548879384e-05, + "loss": 0.5792, + "step": 45 + }, + { + "epoch": 0.51, + "logps_train/chosen": -113.65975952148438, + "logps_train/ref_chosen": -113.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -76.96720886230469, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.006112739443778992, + "rewards_train/margins": 0.6379715949296951, + "rewards_train/rejected": -0.6440843343734741, + "step": 45 + }, + { + "epoch": 0.51, + "logps_train/chosen": -154.4318389892578, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -128.56004333496094, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.1915566474199295, + "rewards_train/margins": 0.7697056084871292, + "rewards_train/rejected": -0.5781489610671997, + "step": 45 + }, + { + "epoch": 0.51, + "logps_train/chosen": -164.68267822265625, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -113.5, + "logps_train/rejected": -124.21025848388672, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": -0.11348249763250351, + "rewards_train/margins": 0.9802481904625893, + "rewards_train/rejected": -1.0937306880950928, + "step": 45 + }, + { + "epoch": 0.51, + "logps_train/chosen": -148.53305053710938, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -122.53600311279297, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.3209740221500397, + "rewards_train/margins": 0.7358295619487762, + "rewards_train/rejected": -0.4148555397987366, + "step": 45 + }, + { + "epoch": 0.53, + "learning_rate": 4.726811210625176e-05, + "loss": 0.5427, + "step": 46 + }, + { + "epoch": 0.53, + "logps_train/chosen": -169.46038818359375, + "logps_train/ref_chosen": -173.0, + "logps_train/ref_rejected": -112.5, + "logps_train/rejected": -117.86236572265625, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.38960567116737366, + "rewards_train/margins": 0.9380492866039276, + "rewards_train/rejected": -0.548443615436554, + "step": 46 + }, + { + "epoch": 0.53, + "logps_train/chosen": -109.06082916259766, + "logps_train/ref_chosen": -110.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -86.27485656738281, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": 0.10861498862504959, + "rewards_train/margins": 0.8819746449589729, + "rewards_train/rejected": -0.7733596563339233, + "step": 46 + }, + { + "epoch": 0.53, + "logps_train/chosen": -149.625732421875, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -111.72176361083984, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.3020744025707245, + "rewards_train/margins": 1.0752268731594086, + "rewards_train/rejected": -0.7731524705886841, + "step": 46 + }, + { + "epoch": 0.53, + "logps_train/chosen": -160.92486572265625, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -140.57608032226562, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.2807552218437195, + "rewards_train/margins": 0.5618008971214294, + "rewards_train/rejected": -0.28104567527770996, + "step": 46 + }, + { + "epoch": 0.54, + "learning_rate": 4.7125293167814345e-05, + "loss": 0.5141, + "step": 47 + }, + { + "epoch": 0.54, + "logps_train/chosen": -174.048828125, + "logps_train/ref_chosen": -177.0, + "logps_train/ref_rejected": -146.0, + "logps_train/rejected": -150.9656219482422, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.28046780824661255, + "rewards_train/margins": 0.7596479058265686, + "rewards_train/rejected": -0.47918009757995605, + "step": 47 + }, + { + "epoch": 0.54, + "logps_train/chosen": -134.04562377929688, + "logps_train/ref_chosen": -133.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -144.9838409423828, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.08473768830299377, + "rewards_train/margins": 0.6593497097492218, + "rewards_train/rejected": -0.7440873980522156, + "step": 47 + }, + { + "epoch": 0.54, + "logps_train/chosen": -153.0111083984375, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -124.59916687011719, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.3758174479007721, + "rewards_train/margins": 0.956462949514389, + "rewards_train/rejected": -1.3322803974151611, + "step": 47 + }, + { + "epoch": 0.54, + "logps_train/chosen": -139.15069580078125, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -112.43024444580078, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": 0.30602389574050903, + "rewards_train/margins": 0.9506352543830872, + "rewards_train/rejected": -0.6446113586425781, + "step": 47 + }, + { + "epoch": 0.55, + "learning_rate": 4.697906275466279e-05, + "loss": 0.5201, + "step": 48 + }, + { + "epoch": 0.55, + "logps_train/chosen": -129.7398681640625, + "logps_train/ref_chosen": -126.5, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -114.27020263671875, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.321838915348053, + "rewards_train/margins": 0.39028793573379517, + "rewards_train/rejected": -0.7121268510818481, + "step": 48 + }, + { + "epoch": 0.55, + "logps_train/chosen": -170.0889434814453, + "logps_train/ref_chosen": -174.0, + "logps_train/ref_rejected": -177.0, + "logps_train/rejected": -180.01104736328125, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.4118082821369171, + "rewards_train/margins": 0.7699443101882935, + "rewards_train/rejected": -0.35813602805137634, + "step": 48 + }, + { + "epoch": 0.55, + "logps_train/chosen": -166.91763305664062, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -129.64866638183594, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.2073570191860199, + "rewards_train/margins": 0.8548417389392853, + "rewards_train/rejected": -0.6474847197532654, + "step": 48 + }, + { + "epoch": 0.55, + "logps_train/chosen": -160.1295166015625, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -138.43392944335938, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.05831342190504074, + "rewards_train/margins": 0.5032433345913887, + "rewards_train/rejected": -0.5615567564964294, + "step": 48 + }, + { + "epoch": 0.56, + "learning_rate": 4.68294434139043e-05, + "loss": 0.6017, + "step": 49 + }, + { + "epoch": 0.56, + "logps_train/chosen": -149.0851593017578, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -115.5216064453125, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": 0.5192182064056396, + "rewards_train/margins": 1.6972576379776, + "rewards_train/rejected": -1.1780394315719604, + "step": 49 + }, + { + "epoch": 0.56, + "logps_train/chosen": -125.34487915039062, + "logps_train/ref_chosen": -126.5, + "logps_train/ref_rejected": -102.5, + "logps_train/rejected": -108.78245544433594, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.09363719820976257, + "rewards_train/margins": 0.7584063112735748, + "rewards_train/rejected": -0.6647691130638123, + "step": 49 + }, + { + "epoch": 0.56, + "logps_train/chosen": -141.67967224121094, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -124.4493408203125, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": 0.10556776821613312, + "rewards_train/margins": 0.5667611807584763, + "rewards_train/rejected": -0.46119341254234314, + "step": 49 + }, + { + "epoch": 0.56, + "logps_train/chosen": -124.61006164550781, + "logps_train/ref_chosen": -125.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -95.22357177734375, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.030595704913139343, + "rewards_train/margins": 0.7752193659543991, + "rewards_train/rejected": -0.7446236610412598, + "step": 49 + }, + { + "epoch": 0.57, + "learning_rate": 4.667645821518111e-05, + "loss": 0.4851, + "step": 50 + }, + { + "epoch": 0.57, + "logps_train/chosen": -176.50062561035156, + "logps_train/ref_chosen": -178.0, + "logps_train/ref_rejected": -152.0, + "logps_train/rejected": -155.60519409179688, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.17474240064620972, + "rewards_train/margins": 0.5643627047538757, + "rewards_train/rejected": -0.389620304107666, + "step": 50 + }, + { + "epoch": 0.57, + "logps_train/chosen": -165.77471923828125, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -145.21810913085938, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.4006544351577759, + "rewards_train/margins": 0.8921915590763092, + "rewards_train/rejected": -0.4915371239185333, + "step": 50 + }, + { + "epoch": 0.57, + "logps_train/chosen": -142.41026306152344, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -121.59886169433594, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.2132701277732849, + "rewards_train/margins": 0.9618284106254578, + "rewards_train/rejected": -0.7485582828521729, + "step": 50 + }, + { + "epoch": 0.57, + "logps_train/chosen": -150.48184204101562, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -128.40724182128906, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.04735386371612549, + "rewards_train/margins": 0.7860215902328491, + "rewards_train/rejected": -0.8333754539489746, + "step": 50 + }, + { + "epoch": 0.58, + "learning_rate": 4.65201307471134e-05, + "loss": 0.5391, + "step": 51 + }, + { + "epoch": 0.58, + "logps_train/chosen": -158.06637573242188, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -139.3981170654297, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": 0.11431728303432465, + "rewards_train/margins": 0.3441133350133896, + "rewards_train/rejected": -0.22979605197906494, + "step": 51 + }, + { + "epoch": 0.58, + "logps_train/chosen": -132.14430236816406, + "logps_train/ref_chosen": -131.0, + "logps_train/ref_rejected": -110.0, + "logps_train/rejected": -121.54273986816406, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.13733020424842834, + "rewards_train/margins": 1.0312997996807098, + "rewards_train/rejected": -1.1686300039291382, + "step": 51 + }, + { + "epoch": 0.58, + "logps_train/chosen": -145.66217041015625, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -144.03201293945312, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.26250535249710083, + "rewards_train/margins": 0.5208224654197693, + "rewards_train/rejected": -0.7833278179168701, + "step": 51 + }, + { + "epoch": 0.58, + "logps_train/chosen": -169.46548461914062, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -143.0, + "logps_train/rejected": -150.265869140625, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.2237648218870163, + "rewards_train/margins": 0.9571881741285324, + "rewards_train/rejected": -0.7334233522415161, + "step": 51 + }, + { + "epoch": 0.59, + "learning_rate": 4.6360485113662216e-05, + "loss": 0.6187, + "step": 52 + }, + { + "epoch": 0.59, + "logps_train/chosen": -163.33822631835938, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -145.1938018798828, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.24283689260482788, + "rewards_train/margins": 0.8202248811721802, + "rewards_train/rejected": -0.5773879885673523, + "step": 52 + }, + { + "epoch": 0.59, + "logps_train/chosen": -181.57000732421875, + "logps_train/ref_chosen": -183.0, + "logps_train/ref_rejected": -127.5, + "logps_train/rejected": -133.94961547851562, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.14456185698509216, + "rewards_train/margins": 0.8153055608272552, + "rewards_train/rejected": -0.6707437038421631, + "step": 52 + }, + { + "epoch": 0.59, + "logps_train/chosen": -160.7414093017578, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -131.62620544433594, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": 0.27234378457069397, + "rewards_train/margins": 1.1264202892780304, + "rewards_train/rejected": -0.8540765047073364, + "step": 52 + }, + { + "epoch": 0.59, + "logps_train/chosen": -129.25555419921875, + "logps_train/ref_chosen": -127.5, + "logps_train/ref_rejected": -124.0, + "logps_train/rejected": -129.04898071289062, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": -0.16788795590400696, + "rewards_train/margins": 0.3479476869106293, + "rewards_train/rejected": -0.5158356428146362, + "step": 52 + }, + { + "epoch": 0.61, + "learning_rate": 4.6197545930412874e-05, + "loss": 0.5332, + "step": 53 + }, + { + "epoch": 0.61, + "logps_train/chosen": -153.34988403320312, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -160.0, + "logps_train/rejected": -164.3514862060547, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": -0.12678614258766174, + "rewards_train/margins": 0.33057892322540283, + "rewards_train/rejected": -0.4573650658130646, + "step": 53 + }, + { + "epoch": 0.61, + "logps_train/chosen": -162.29542541503906, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -130.75132751464844, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.25033989548683167, + "rewards_train/margins": 1.0030124485492706, + "rewards_train/rejected": -0.752672553062439, + "step": 53 + }, + { + "epoch": 0.61, + "logps_train/chosen": -138.71487426757812, + "logps_train/ref_chosen": -139.0, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -101.21212768554688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.04726358875632286, + "rewards_train/margins": 1.0392774604260921, + "rewards_train/rejected": -0.9920138716697693, + "step": 53 + }, + { + "epoch": 0.61, + "logps_train/chosen": -159.54571533203125, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -129.88558959960938, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": -0.34285280108451843, + "rewards_train/margins": 0.4222678244113922, + "rewards_train/rejected": -0.7651206254959106, + "step": 53 + }, + { + "epoch": 0.62, + "learning_rate": 4.6031338320779534e-05, + "loss": 0.6483, + "step": 54 + }, + { + "epoch": 0.62, + "logps_train/chosen": -173.44290161132812, + "logps_train/ref_chosen": -175.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -133.28878784179688, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.12924429774284363, + "rewards_train/margins": 0.9989194571971893, + "rewards_train/rejected": -0.8696751594543457, + "step": 54 + }, + { + "epoch": 0.62, + "logps_train/chosen": -148.29855346679688, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -115.97592163085938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.009714528918266296, + "rewards_train/margins": 0.9764523059129715, + "rewards_train/rejected": -0.9861668348312378, + "step": 54 + }, + { + "epoch": 0.62, + "logps_train/chosen": -163.42010498046875, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -132.7950439453125, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": 0.16990332305431366, + "rewards_train/margins": 1.1158150881528854, + "rewards_train/rejected": -0.9459117650985718, + "step": 54 + }, + { + "epoch": 0.62, + "logps_train/chosen": -169.2532958984375, + "logps_train/ref_chosen": -167.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -126.75450134277344, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.2445676475763321, + "rewards_train/margins": 0.5689687579870224, + "rewards_train/rejected": -0.8135364055633545, + "step": 54 + }, + { + "epoch": 0.63, + "learning_rate": 4.586188791213143e-05, + "loss": 0.5139, + "step": 55 + }, + { + "epoch": 0.63, + "logps_train/chosen": -128.13194274902344, + "logps_train/ref_chosen": -124.5, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -120.18585205078125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3469837009906769, + "rewards_train/margins": 1.0525586307048798, + "rewards_train/rejected": -1.3995423316955566, + "step": 55 + }, + { + "epoch": 0.63, + "logps_train/chosen": -137.48464965820312, + "logps_train/ref_chosen": -134.0, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -109.7799072265625, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.32219475507736206, + "rewards_train/margins": 0.4786474108695984, + "rewards_train/rejected": -0.8008421659469604, + "step": 55 + }, + { + "epoch": 0.63, + "logps_train/chosen": -155.82305908203125, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -140.88726806640625, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.3568549156188965, + "rewards_train/margins": 1.018384873867035, + "rewards_train/rejected": -0.6615299582481384, + "step": 55 + }, + { + "epoch": 0.63, + "logps_train/chosen": -137.8286590576172, + "logps_train/ref_chosen": -135.0, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -101.40814208984375, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.3273978531360626, + "rewards_train/margins": 0.5737676322460175, + "rewards_train/rejected": -0.9011654853820801, + "step": 55 + }, + { + "epoch": 0.64, + "learning_rate": 4.568922083184144e-05, + "loss": 0.5308, + "step": 56 + }, + { + "epoch": 0.64, + "logps_train/chosen": -201.31527709960938, + "logps_train/ref_chosen": -204.0, + "logps_train/ref_rejected": -166.0, + "logps_train/rejected": -172.15939331054688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.23741737008094788, + "rewards_train/margins": 0.8567750751972198, + "rewards_train/rejected": -0.619357705116272, + "step": 56 + }, + { + "epoch": 0.64, + "logps_train/chosen": -173.6513671875, + "logps_train/ref_chosen": -174.0, + "logps_train/ref_rejected": -154.0, + "logps_train/rejected": -163.62249755859375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.11093807220458984, + "rewards_train/margins": 1.0794376134872437, + "rewards_train/rejected": -0.9684995412826538, + "step": 56 + }, + { + "epoch": 0.64, + "logps_train/chosen": -170.75677490234375, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -153.0, + "logps_train/rejected": -160.66265869140625, + "rewards_train/accuracies": 0.53125, + "rewards_train/chosen": -0.1634213924407959, + "rewards_train/margins": 0.5591435432434082, + "rewards_train/rejected": -0.7225649356842041, + "step": 56 + }, + { + "epoch": 0.64, + "logps_train/chosen": -176.91485595703125, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -158.0, + "logps_train/rejected": -165.61184692382812, + "rewards_train/accuracies": 0.53125, + "rewards_train/chosen": -0.07683837413787842, + "rewards_train/margins": 0.6816108822822571, + "rewards_train/rejected": -0.7584492564201355, + "step": 56 + }, + { + "epoch": 0.65, + "learning_rate": 4.5513363703257496e-05, + "loss": 0.5886, + "step": 57 + }, + { + "epoch": 0.65, + "logps_train/chosen": -147.32388305664062, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -118.71624755859375, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": -0.22408804297447205, + "rewards_train/margins": 0.9452424347400665, + "rewards_train/rejected": -1.1693304777145386, + "step": 57 + }, + { + "epoch": 0.65, + "logps_train/chosen": -149.79339599609375, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -138.41961669921875, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.2672303020954132, + "rewards_train/margins": 0.8092033565044403, + "rewards_train/rejected": -1.0764336585998535, + "step": 57 + }, + { + "epoch": 0.65, + "logps_train/chosen": -108.96585083007812, + "logps_train/ref_chosen": -104.5, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -111.37809753417969, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.4704621434211731, + "rewards_train/margins": 0.4902119040489197, + "rewards_train/rejected": -0.9606740474700928, + "step": 57 + }, + { + "epoch": 0.65, + "logps_train/chosen": -176.698486328125, + "logps_train/ref_chosen": -177.0, + "logps_train/ref_rejected": -146.0, + "logps_train/rejected": -154.18609619140625, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": 0.010303527116775513, + "rewards_train/margins": 0.8293034136295319, + "rewards_train/rejected": -0.8189998865127563, + "step": 57 + }, + { + "epoch": 0.66, + "learning_rate": 4.533434364159761e-05, + "loss": 0.5843, + "step": 58 + }, + { + "epoch": 0.66, + "logps_train/chosen": -136.48675537109375, + "logps_train/ref_chosen": -135.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -142.8067626953125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.18441638350486755, + "rewards_train/margins": 0.39918962121009827, + "rewards_train/rejected": -0.5836060047149658, + "step": 58 + }, + { + "epoch": 0.66, + "logps_train/chosen": -184.3641357421875, + "logps_train/ref_chosen": -183.0, + "logps_train/ref_rejected": -146.0, + "logps_train/rejected": -153.0880584716797, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.08172591775655746, + "rewards_train/margins": 0.6115528717637062, + "rewards_train/rejected": -0.6932787895202637, + "step": 58 + }, + { + "epoch": 0.66, + "logps_train/chosen": -175.23277282714844, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -117.5, + "logps_train/rejected": -124.69281005859375, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": 0.07252401113510132, + "rewards_train/margins": 0.7904370427131653, + "rewards_train/rejected": -0.717913031578064, + "step": 58 + }, + { + "epoch": 0.66, + "logps_train/chosen": -144.44598388671875, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -113.03366088867188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.09381596744060516, + "rewards_train/margins": 1.031522586941719, + "rewards_train/rejected": -1.1253385543823242, + "step": 58 + }, + { + "epoch": 0.67, + "learning_rate": 4.515218824976895e-05, + "loss": 0.5473, + "step": 59 + }, + { + "epoch": 0.67, + "logps_train/chosen": -135.3798065185547, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -105.5, + "logps_train/rejected": -120.29751586914062, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": -0.3549787104129791, + "rewards_train/margins": 1.1142815053462982, + "rewards_train/rejected": -1.4692602157592773, + "step": 59 + }, + { + "epoch": 0.67, + "logps_train/chosen": -140.24740600585938, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -105.5, + "logps_train/rejected": -117.92471313476562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.19534514844417572, + "rewards_train/margins": 1.0404373556375504, + "rewards_train/rejected": -1.235782504081726, + "step": 59 + }, + { + "epoch": 0.67, + "logps_train/chosen": -170.62718200683594, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -122.5, + "logps_train/rejected": -133.84637451171875, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": 0.15193051099777222, + "rewards_train/margins": 1.3069776892662048, + "rewards_train/rejected": -1.1550471782684326, + "step": 59 + }, + { + "epoch": 0.67, + "logps_train/chosen": -171.68701171875, + "logps_train/ref_chosen": -173.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -140.77157592773438, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.14379817247390747, + "rewards_train/margins": 0.9716156721115112, + "rewards_train/rejected": -0.8278174996376038, + "step": 59 + }, + { + "epoch": 0.69, + "learning_rate": 4.496692561411182e-05, + "loss": 0.4245, + "step": 60 + }, + { + "epoch": 0.69, + "logps_train/chosen": -145.77418518066406, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -127.12965393066406, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": -0.3304464817047119, + "rewards_train/margins": 0.8365238904953003, + "rewards_train/rejected": -1.1669703722000122, + "step": 60 + }, + { + "epoch": 0.69, + "logps_train/chosen": -173.94161987304688, + "logps_train/ref_chosen": -171.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -132.8108673095703, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": -0.3185763955116272, + "rewards_train/margins": 0.8773545622825623, + "rewards_train/rejected": -1.1959309577941895, + "step": 60 + }, + { + "epoch": 0.69, + "logps_train/chosen": -190.071044921875, + "logps_train/ref_chosen": -189.0, + "logps_train/ref_rejected": -159.0, + "logps_train/rejected": -168.48336791992188, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.10808217525482178, + "rewards_train/margins": 0.8708211779594421, + "rewards_train/rejected": -0.9789033532142639, + "step": 60 + }, + { + "epoch": 0.69, + "logps_train/chosen": -121.50714111328125, + "logps_train/ref_chosen": -116.5, + "logps_train/ref_rejected": -102.5, + "logps_train/rejected": -117.76773071289062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5074518322944641, + "rewards_train/margins": 1.0132667422294617, + "rewards_train/rejected": -1.5207185745239258, + "step": 60 + }, + { + "epoch": 0.7, + "learning_rate": 4.477858430006906e-05, + "loss": 0.5009, + "step": 61 + }, + { + "epoch": 0.7, + "logps_train/chosen": -148.4321746826172, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -105.5, + "logps_train/rejected": -122.11833953857422, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.6903851628303528, + "rewards_train/margins": 0.9600227475166321, + "rewards_train/rejected": -1.6504079103469849, + "step": 61 + }, + { + "epoch": 0.7, + "logps_train/chosen": -167.27960205078125, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -140.42367553710938, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.3914361596107483, + "rewards_train/margins": 0.738186776638031, + "rewards_train/rejected": -1.1296229362487793, + "step": 61 + }, + { + "epoch": 0.7, + "logps_train/chosen": -175.978271484375, + "logps_train/ref_chosen": -174.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -129.72894287109375, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": -0.16933763027191162, + "rewards_train/margins": 1.182854413986206, + "rewards_train/rejected": -1.3521920442581177, + "step": 61 + }, + { + "epoch": 0.7, + "logps_train/chosen": -153.08892822265625, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -137.57305908203125, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.2504955530166626, + "rewards_train/margins": 0.9743890762329102, + "rewards_train/rejected": -1.2248846292495728, + "step": 61 + }, + { + "epoch": 0.71, + "learning_rate": 4.458719334778153e-05, + "loss": 0.5142, + "step": 62 + }, + { + "epoch": 0.71, + "logps_train/chosen": -143.49911499023438, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -147.0, + "logps_train/rejected": -156.21580505371094, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": -0.044589295983314514, + "rewards_train/margins": 0.8543341010808945, + "rewards_train/rejected": -0.898923397064209, + "step": 62 + }, + { + "epoch": 0.71, + "logps_train/chosen": -156.5689239501953, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -125.41449737548828, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.5022292137145996, + "rewards_train/margins": 0.6355087757110596, + "rewards_train/rejected": -1.1377379894256592, + "step": 62 + }, + { + "epoch": 0.71, + "logps_train/chosen": -201.266357421875, + "logps_train/ref_chosen": -196.0, + "logps_train/ref_rejected": -159.0, + "logps_train/rejected": -173.8751220703125, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": -0.43591195344924927, + "rewards_train/margins": 1.0374639630317688, + "rewards_train/rejected": -1.473375916481018, + "step": 62 + }, + { + "epoch": 0.71, + "logps_train/chosen": -149.0179901123047, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -148.0, + "logps_train/rejected": -170.43136596679688, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -1.208733081817627, + "rewards_train/margins": 0.9972443580627441, + "rewards_train/rejected": -2.205977439880371, + "step": 62 + }, + { + "epoch": 0.72, + "learning_rate": 4.43927822676105e-05, + "loss": 0.5598, + "step": 63 + }, + { + "epoch": 0.72, + "logps_train/chosen": -126.84237670898438, + "logps_train/ref_chosen": -117.0, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -115.09578704833984, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.9945892691612244, + "rewards_train/margins": 0.3996577858924866, + "rewards_train/rejected": -1.394247055053711, + "step": 63 + }, + { + "epoch": 0.72, + "logps_train/chosen": -142.31930541992188, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -117.5, + "logps_train/rejected": -130.88372802734375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.607906699180603, + "rewards_train/margins": 0.7457003593444824, + "rewards_train/rejected": -1.3536070585250854, + "step": 63 + }, + { + "epoch": 0.72, + "logps_train/chosen": -124.56082916259766, + "logps_train/ref_chosen": -115.5, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -110.76518249511719, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.923465371131897, + "rewards_train/margins": 0.618140697479248, + "rewards_train/rejected": -1.541606068611145, + "step": 63 + }, + { + "epoch": 0.72, + "logps_train/chosen": -179.1411895751953, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -157.0, + "logps_train/rejected": -171.82464599609375, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.33872923254966736, + "rewards_train/margins": 1.1219574511051178, + "rewards_train/rejected": -1.4606866836547852, + "step": 63 + }, + { + "epoch": 0.73, + "learning_rate": 4.419538103558742e-05, + "loss": 0.5848, + "step": 64 + }, + { + "epoch": 0.73, + "logps_train/chosen": -168.5423583984375, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -135.0, + "logps_train/rejected": -148.78189086914062, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.5333375334739685, + "rewards_train/margins": 0.8630149960517883, + "rewards_train/rejected": -1.3963525295257568, + "step": 64 + }, + { + "epoch": 0.73, + "logps_train/chosen": -140.47198486328125, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -124.3873291015625, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -1.0393857955932617, + "rewards_train/margins": 0.7289131879806519, + "rewards_train/rejected": -1.7682989835739136, + "step": 64 + }, + { + "epoch": 0.73, + "logps_train/chosen": -174.19720458984375, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -125.15985870361328, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": -0.5634703040122986, + "rewards_train/margins": 1.0730233788490295, + "rewards_train/rejected": -1.6364936828613281, + "step": 64 + }, + { + "epoch": 0.73, + "logps_train/chosen": -160.61813354492188, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -143.4553680419922, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.6700178384780884, + "rewards_train/margins": 0.5729802846908569, + "rewards_train/rejected": -1.2429981231689453, + "step": 64 + }, + { + "epoch": 0.74, + "learning_rate": 4.3995020088792e-05, + "loss": 0.5701, + "step": 65 + }, + { + "epoch": 0.74, + "logps_train/chosen": -188.96270751953125, + "logps_train/ref_chosen": -177.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -148.6373291015625, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -1.192512035369873, + "rewards_train/margins": 0.6878228187561035, + "rewards_train/rejected": -1.8803348541259766, + "step": 65 + }, + { + "epoch": 0.74, + "logps_train/chosen": -173.8163299560547, + "logps_train/ref_chosen": -167.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -141.50582885742188, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.7177664041519165, + "rewards_train/margins": 0.6095980405807495, + "rewards_train/rejected": -1.327364444732666, + "step": 65 + }, + { + "epoch": 0.74, + "logps_train/chosen": -156.86846923828125, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -131.81948852539062, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": -0.514484167098999, + "rewards_train/margins": 1.3581879138946533, + "rewards_train/rejected": -1.8726720809936523, + "step": 65 + }, + { + "epoch": 0.74, + "logps_train/chosen": -153.33334350585938, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -140.69818115234375, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.8610811233520508, + "rewards_train/margins": 0.6778771877288818, + "rewards_train/rejected": -1.5389583110809326, + "step": 65 + }, + { + "epoch": 0.75, + "learning_rate": 4.379173032065912e-05, + "loss": 0.571, + "step": 66 + }, + { + "epoch": 0.75, + "logps_train/chosen": -123.39452362060547, + "logps_train/ref_chosen": -111.5, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -110.48931121826172, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.208837628364563, + "rewards_train/margins": 0.7541805505752563, + "rewards_train/rejected": -1.9630181789398193, + "step": 66 + }, + { + "epoch": 0.75, + "logps_train/chosen": -149.31439208984375, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -107.5, + "logps_train/rejected": -124.77264404296875, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.7231384515762329, + "rewards_train/margins": 1.0160391330718994, + "rewards_train/rejected": -1.7391775846481323, + "step": 66 + }, + { + "epoch": 0.75, + "logps_train/chosen": -189.39576721191406, + "logps_train/ref_chosen": -180.0, + "logps_train/ref_rejected": -149.0, + "logps_train/rejected": -165.7838134765625, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": -0.8770769834518433, + "rewards_train/margins": 0.7735692262649536, + "rewards_train/rejected": -1.6506462097167969, + "step": 66 + }, + { + "epoch": 0.75, + "logps_train/chosen": -172.92323303222656, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -151.0, + "logps_train/rejected": -169.48965454101562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6337293982505798, + "rewards_train/margins": 1.203711450099945, + "rewards_train/rejected": -1.837440848350525, + "step": 66 + }, + { + "epoch": 0.77, + "learning_rate": 4.358554307621541e-05, + "loss": 0.519, + "step": 67 + }, + { + "epoch": 0.77, + "logps_train/chosen": -106.08445739746094, + "logps_train/ref_chosen": -95.5, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -104.09698486328125, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -1.045597791671753, + "rewards_train/margins": 0.9242256879806519, + "rewards_train/rejected": -1.9698234796524048, + "step": 67 + }, + { + "epoch": 0.77, + "logps_train/chosen": -157.91793823242188, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -118.0, + "logps_train/rejected": -139.233154296875, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -1.0456013679504395, + "rewards_train/margins": 1.0714645385742188, + "rewards_train/rejected": -2.117065906524658, + "step": 67 + }, + { + "epoch": 0.77, + "logps_train/chosen": -191.77606201171875, + "logps_train/ref_chosen": -187.0, + "logps_train/ref_rejected": -145.0, + "logps_train/rejected": -159.0693359375, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": -0.44342678785324097, + "rewards_train/margins": 1.00628000497818, + "rewards_train/rejected": -1.449706792831421, + "step": 67 + }, + { + "epoch": 0.77, + "logps_train/chosen": -136.33993530273438, + "logps_train/ref_chosen": -125.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -121.84136199951172, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": -1.1351652145385742, + "rewards_train/margins": 0.6773405075073242, + "rewards_train/rejected": -1.8125057220458984, + "step": 67 + }, + { + "epoch": 0.78, + "learning_rate": 4.337649014724621e-05, + "loss": 0.5137, + "step": 68 + }, + { + "epoch": 0.78, + "logps_train/chosen": -163.60760498046875, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -143.341064453125, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.8888216018676758, + "rewards_train/margins": 0.49487829208374023, + "rewards_train/rejected": -1.383699893951416, + "step": 68 + }, + { + "epoch": 0.78, + "logps_train/chosen": -172.0780029296875, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -118.73085021972656, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": -0.7458871603012085, + "rewards_train/margins": 0.9197756052017212, + "rewards_train/rejected": -1.6656627655029297, + "step": 68 + }, + { + "epoch": 0.78, + "logps_train/chosen": -135.83432006835938, + "logps_train/ref_chosen": -126.5, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -148.3021697998047, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.9420253038406372, + "rewards_train/margins": 0.8683685064315796, + "rewards_train/rejected": -1.8103938102722168, + "step": 68 + }, + { + "epoch": 0.78, + "logps_train/chosen": -156.50640869140625, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -119.85636138916016, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.0331604480743408, + "rewards_train/margins": 0.8918311595916748, + "rewards_train/rejected": -1.9249916076660156, + "step": 68 + }, + { + "epoch": 0.79, + "learning_rate": 4.31646037673936e-05, + "loss": 0.6014, + "step": 69 + }, + { + "epoch": 0.79, + "logps_train/chosen": -109.36697387695312, + "logps_train/ref_chosen": -99.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -90.85383605957031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0384798049926758, + "rewards_train/margins": 1.0995402336120605, + "rewards_train/rejected": -2.1380200386047363, + "step": 69 + }, + { + "epoch": 0.79, + "logps_train/chosen": -164.5358428955078, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -157.33303833007812, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -1.1899994611740112, + "rewards_train/margins": 0.6788498163223267, + "rewards_train/rejected": -1.868849277496338, + "step": 69 + }, + { + "epoch": 0.79, + "logps_train/chosen": -172.2188720703125, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -148.18170166015625, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.46329355239868164, + "rewards_train/margins": 1.061615228652954, + "rewards_train/rejected": -1.5249087810516357, + "step": 69 + }, + { + "epoch": 0.79, + "logps_train/chosen": -157.74252319335938, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -144.2315673828125, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.8678563833236694, + "rewards_train/margins": 0.7282001972198486, + "rewards_train/rejected": -1.596056580543518, + "step": 69 + }, + { + "epoch": 0.8, + "learning_rate": 4.2949916607186357e-05, + "loss": 0.5289, + "step": 70 + }, + { + "epoch": 0.8, + "logps_train/chosen": -151.2384033203125, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -117.5, + "logps_train/rejected": -136.65538024902344, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": -0.9337508678436279, + "rewards_train/margins": 0.9835443496704102, + "rewards_train/rejected": -1.917295217514038, + "step": 70 + }, + { + "epoch": 0.8, + "logps_train/chosen": -164.0899658203125, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -119.64618682861328, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": -0.9898558855056763, + "rewards_train/margins": 1.0959542989730835, + "rewards_train/rejected": -2.0858101844787598, + "step": 70 + }, + { + "epoch": 0.8, + "logps_train/chosen": -158.28138732910156, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -107.51951599121094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7542130947113037, + "rewards_train/margins": 1.254964828491211, + "rewards_train/rejected": -2.0091779232025146, + "step": 70 + }, + { + "epoch": 0.8, + "logps_train/chosen": -158.1058349609375, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -135.90130615234375, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -1.465502142906189, + "rewards_train/margins": 0.7091010808944702, + "rewards_train/rejected": -2.174603223800659, + "step": 70 + }, + { + "epoch": 0.81, + "learning_rate": 4.273246176900252e-05, + "loss": 0.5182, + "step": 71 + }, + { + "epoch": 0.81, + "logps_train/chosen": -180.446533203125, + "logps_train/ref_chosen": -173.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -157.4761962890625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.7675294876098633, + "rewards_train/margins": 0.56602942943573, + "rewards_train/rejected": -1.3335589170455933, + "step": 71 + }, + { + "epoch": 0.81, + "logps_train/chosen": -166.73056030273438, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -143.46798706054688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6558671593666077, + "rewards_train/margins": 1.4000133872032166, + "rewards_train/rejected": -2.055880546569824, + "step": 71 + }, + { + "epoch": 0.81, + "logps_train/chosen": -188.02003479003906, + "logps_train/ref_chosen": -181.0, + "logps_train/ref_rejected": -149.0, + "logps_train/rejected": -167.78900146484375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7188003063201904, + "rewards_train/margins": 1.1408617496490479, + "rewards_train/rejected": -1.8596620559692383, + "step": 71 + }, + { + "epoch": 0.81, + "logps_train/chosen": -144.71981811523438, + "logps_train/ref_chosen": -135.0, + "logps_train/ref_rejected": -110.0, + "logps_train/rejected": -128.44407653808594, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -1.008699893951416, + "rewards_train/margins": 0.8348284959793091, + "rewards_train/rejected": -1.843528389930725, + "step": 71 + }, + { + "epoch": 0.82, + "learning_rate": 4.251227278196536e-05, + "loss": 0.4895, + "step": 72 + }, + { + "epoch": 0.82, + "logps_train/chosen": -132.26991271972656, + "logps_train/ref_chosen": -126.5, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -113.90031433105469, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": -0.6027978658676147, + "rewards_train/margins": 1.070241093635559, + "rewards_train/rejected": -1.6730389595031738, + "step": 72 + }, + { + "epoch": 0.82, + "logps_train/chosen": -156.4181671142578, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -129.92678833007812, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.6840531826019287, + "rewards_train/margins": 0.7702467441558838, + "rewards_train/rejected": -1.4542999267578125, + "step": 72 + }, + { + "epoch": 0.82, + "logps_train/chosen": -137.28082275390625, + "logps_train/ref_chosen": -128.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -150.2662353515625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9548407793045044, + "rewards_train/margins": 0.8424848318099976, + "rewards_train/rejected": -1.797325611114502, + "step": 72 + }, + { + "epoch": 0.82, + "logps_train/chosen": -159.76388549804688, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -110.0, + "logps_train/rejected": -127.27428436279297, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3386008143424988, + "rewards_train/margins": 1.3924408555030823, + "rewards_train/rejected": -1.731041669845581, + "step": 72 + }, + { + "epoch": 0.83, + "learning_rate": 4.228938359677354e-05, + "loss": 0.4813, + "step": 73 + }, + { + "epoch": 0.83, + "logps_train/chosen": -158.0514373779297, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -126.70331573486328, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8166680335998535, + "rewards_train/margins": 0.6750500202178955, + "rewards_train/rejected": -1.491718053817749, + "step": 73 + }, + { + "epoch": 0.83, + "logps_train/chosen": -167.51153564453125, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -142.0, + "logps_train/rejected": -154.432861328125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.33699363470077515, + "rewards_train/margins": 0.9434502720832825, + "rewards_train/rejected": -1.2804439067840576, + "step": 73 + }, + { + "epoch": 0.83, + "logps_train/chosen": -197.72607421875, + "logps_train/ref_chosen": -196.0, + "logps_train/ref_rejected": -149.0, + "logps_train/rejected": -160.88992309570312, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.23745089769363403, + "rewards_train/margins": 0.9035444855690002, + "rewards_train/rejected": -1.1409953832626343, + "step": 73 + }, + { + "epoch": 0.83, + "logps_train/chosen": -149.27923583984375, + "logps_train/ref_chosen": -141.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -128.9501953125, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": -0.7999448180198669, + "rewards_train/margins": 1.0314993262290955, + "rewards_train/rejected": -1.8314441442489624, + "step": 73 + }, + { + "epoch": 0.85, + "learning_rate": 4.206382858046636e-05, + "loss": 0.5071, + "step": 74 + }, + { + "epoch": 0.85, + "logps_train/chosen": -176.96607971191406, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -150.25148010253906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6526632308959961, + "rewards_train/margins": 0.5287351608276367, + "rewards_train/rejected": -1.1813983917236328, + "step": 74 + }, + { + "epoch": 0.85, + "logps_train/chosen": -202.90512084960938, + "logps_train/ref_chosen": -201.0, + "logps_train/ref_rejected": -142.0, + "logps_train/rejected": -153.5736083984375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.19715136289596558, + "rewards_train/margins": 0.9527884125709534, + "rewards_train/rejected": -1.149939775466919, + "step": 74 + }, + { + "epoch": 0.85, + "logps_train/chosen": -149.58419799804688, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -117.5, + "logps_train/rejected": -133.56790161132812, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.5854701995849609, + "rewards_train/margins": 1.0398268699645996, + "rewards_train/rejected": -1.6252970695495605, + "step": 74 + }, + { + "epoch": 0.85, + "logps_train/chosen": -132.50787353515625, + "logps_train/ref_chosen": -124.0, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -119.30223083496094, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.8568595051765442, + "rewards_train/margins": 0.9427974820137024, + "rewards_train/rejected": -1.7996569871902466, + "step": 74 + }, + { + "epoch": 0.86, + "learning_rate": 4.1835642511124656e-05, + "loss": 0.5643, + "step": 75 + }, + { + "epoch": 0.86, + "logps_train/chosen": -158.19993591308594, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -112.5, + "logps_train/rejected": -128.560546875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4262433648109436, + "rewards_train/margins": 1.189382255077362, + "rewards_train/rejected": -1.6156256198883057, + "step": 75 + }, + { + "epoch": 0.86, + "logps_train/chosen": -188.24725341796875, + "logps_train/ref_chosen": -187.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -144.20211791992188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.1420358419418335, + "rewards_train/margins": 0.9777122735977173, + "rewards_train/rejected": -1.1197481155395508, + "step": 75 + }, + { + "epoch": 0.86, + "logps_train/chosen": -137.14773559570312, + "logps_train/ref_chosen": -128.0, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -112.96627807617188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8436791896820068, + "rewards_train/margins": 0.7864446640014648, + "rewards_train/rejected": -1.6301238536834717, + "step": 75 + }, + { + "epoch": 0.86, + "logps_train/chosen": -174.13377380371094, + "logps_train/ref_chosen": -171.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -139.75009155273438, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.310349702835083, + "rewards_train/margins": 1.0647573471069336, + "rewards_train/rejected": -1.3751070499420166, + "step": 75 + }, + { + "epoch": 0.87, + "learning_rate": 4.160486057250849e-05, + "loss": 0.4979, + "step": 76 + }, + { + "epoch": 0.87, + "logps_train/chosen": -142.59698486328125, + "logps_train/ref_chosen": -139.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -129.82534790039062, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.3968072235584259, + "rewards_train/margins": 0.733596533536911, + "rewards_train/rejected": -1.130403757095337, + "step": 76 + }, + { + "epoch": 0.87, + "logps_train/chosen": -161.75323486328125, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -125.47662353515625, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": -1.1422175168991089, + "rewards_train/margins": 0.11257338523864746, + "rewards_train/rejected": -1.2547909021377563, + "step": 76 + }, + { + "epoch": 0.87, + "logps_train/chosen": -178.9916229248047, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -132.39141845703125, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.31464019417762756, + "rewards_train/margins": 0.715956062078476, + "rewards_train/rejected": -1.0305962562561035, + "step": 76 + }, + { + "epoch": 0.87, + "logps_train/chosen": -152.24346923828125, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -127.48640441894531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2509094476699829, + "rewards_train/margins": 1.095436692237854, + "rewards_train/rejected": -1.346346139907837, + "step": 76 + }, + { + "epoch": 0.88, + "learning_rate": 4.137151834863213e-05, + "loss": 0.6958, + "step": 77 + }, + { + "epoch": 0.88, + "logps_train/chosen": -148.580078125, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -148.0, + "logps_train/rejected": -157.56333923339844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.48818454146385193, + "rewards_train/margins": 0.4652206003665924, + "rewards_train/rejected": -0.9534051418304443, + "step": 77 + }, + { + "epoch": 0.88, + "logps_train/chosen": -166.72164916992188, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -147.3184356689453, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.47060221433639526, + "rewards_train/margins": 0.5743276476860046, + "rewards_train/rejected": -1.0449298620224, + "step": 77 + }, + { + "epoch": 0.88, + "logps_train/chosen": -143.37298583984375, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -130.98110961914062, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": -0.5442638397216797, + "rewards_train/margins": 1.0199611186981201, + "rewards_train/rejected": -1.5642249584197998, + "step": 77 + }, + { + "epoch": 0.88, + "logps_train/chosen": -155.43408203125, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -129.516845703125, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": -0.3463389575481415, + "rewards_train/margins": 0.9465561211109161, + "rewards_train/rejected": -1.2928950786590576, + "step": 77 + }, + { + "epoch": 0.89, + "learning_rate": 4.1135651818277445e-05, + "loss": 0.5414, + "step": 78 + }, + { + "epoch": 0.89, + "logps_train/chosen": -154.96820068359375, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -119.0, + "logps_train/rejected": -132.89260864257812, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": -0.20873412489891052, + "rewards_train/margins": 1.1599203646183014, + "rewards_train/rejected": -1.368654489517212, + "step": 78 + }, + { + "epoch": 0.89, + "logps_train/chosen": -144.24415588378906, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -130.75668334960938, + "rewards_train/accuracies": 0.53125, + "rewards_train/chosen": -0.38398557901382446, + "rewards_train/margins": 0.327619731426239, + "rewards_train/rejected": -0.7116053104400635, + "step": 78 + }, + { + "epoch": 0.89, + "logps_train/chosen": -139.5198974609375, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -116.1756591796875, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.28363025188446045, + "rewards_train/margins": 0.8827880620956421, + "rewards_train/rejected": -1.1664183139801025, + "step": 78 + }, + { + "epoch": 0.89, + "logps_train/chosen": -180.26397705078125, + "logps_train/ref_chosen": -178.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -149.57452392578125, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.1939750611782074, + "rewards_train/margins": 1.027538686990738, + "rewards_train/rejected": -1.2215137481689453, + "step": 78 + }, + { + "epoch": 0.9, + "learning_rate": 4.089729734944634e-05, + "loss": 0.544, + "step": 79 + }, + { + "epoch": 0.9, + "logps_train/chosen": -143.99681091308594, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -153.2803497314453, + "rewards_train/accuracies": 0.5625, + "rewards_train/chosen": -0.678196370601654, + "rewards_train/margins": 0.220004141330719, + "rewards_train/rejected": -0.898200511932373, + "step": 79 + }, + { + "epoch": 0.9, + "logps_train/chosen": -158.53390502929688, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -146.01658630371094, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.03825424611568451, + "rewards_train/margins": 0.9192156940698624, + "rewards_train/rejected": -0.9574699401855469, + "step": 79 + }, + { + "epoch": 0.9, + "logps_train/chosen": -156.74154663085938, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -150.85284423828125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.48094165325164795, + "rewards_train/margins": 0.6565214395523071, + "rewards_train/rejected": -1.137463092803955, + "step": 79 + }, + { + "epoch": 0.9, + "logps_train/chosen": -183.3664093017578, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -156.0, + "logps_train/rejected": -164.84950256347656, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.10578218847513199, + "rewards_train/margins": 0.8176443204283714, + "rewards_train/rejected": -0.9234265089035034, + "step": 79 + }, + { + "epoch": 0.91, + "learning_rate": 4.065649169375324e-05, + "loss": 0.6275, + "step": 80 + }, + { + "epoch": 0.91, + "logps_train/chosen": -161.34866333007812, + "logps_train/ref_chosen": -161.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -130.3014678955078, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": -0.00708414614200592, + "rewards_train/margins": 1.3033236116170883, + "rewards_train/rejected": -1.3104077577590942, + "step": 80 + }, + { + "epoch": 0.91, + "logps_train/chosen": -196.28811645507812, + "logps_train/ref_chosen": -194.0, + "logps_train/ref_rejected": -152.0, + "logps_train/rejected": -164.7366943359375, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.1925809383392334, + "rewards_train/margins": 1.0608729124069214, + "rewards_train/rejected": -1.2534538507461548, + "step": 80 + }, + { + "epoch": 0.91, + "logps_train/chosen": -143.91098022460938, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -137.1011962890625, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.027426350861787796, + "rewards_train/margins": 0.7220006696879864, + "rewards_train/rejected": -0.7494270205497742, + "step": 80 + }, + { + "epoch": 0.91, + "logps_train/chosen": -186.50070190429688, + "logps_train/ref_chosen": -188.0, + "logps_train/ref_rejected": -127.0, + "logps_train/rejected": -138.23883056640625, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": 0.17532047629356384, + "rewards_train/margins": 1.30330428481102, + "rewards_train/rejected": -1.127983808517456, + "step": 80 + }, + { + "epoch": 0.93, + "learning_rate": 4.041327198075838e-05, + "loss": 0.4788, + "step": 81 + }, + { + "epoch": 0.93, + "logps_train/chosen": -128.11949157714844, + "logps_train/ref_chosen": -128.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -107.81585693359375, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": -0.03633883595466614, + "rewards_train/margins": 1.1293288171291351, + "rewards_train/rejected": -1.1656676530838013, + "step": 81 + }, + { + "epoch": 0.93, + "logps_train/chosen": -165.39813232421875, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -141.9514617919922, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": 0.31048035621643066, + "rewards_train/margins": 1.1549919247627258, + "rewards_train/rejected": -0.8445115685462952, + "step": 81 + }, + { + "epoch": 0.93, + "logps_train/chosen": -126.2926025390625, + "logps_train/ref_chosen": -124.5, + "logps_train/ref_rejected": -102.5, + "logps_train/rejected": -113.79829406738281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.19713124632835388, + "rewards_train/margins": 0.9133626520633698, + "rewards_train/rejected": -1.1104938983917236, + "step": 81 + }, + { + "epoch": 0.93, + "logps_train/chosen": -187.25137329101562, + "logps_train/ref_chosen": -187.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -142.0508575439453, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": -0.020840942859649658, + "rewards_train/margins": 1.0018226504325867, + "rewards_train/rejected": -1.0226635932922363, + "step": 81 + }, + { + "epoch": 0.94, + "learning_rate": 4.016767571224284e-05, + "loss": 0.4871, + "step": 82 + }, + { + "epoch": 0.94, + "logps_train/chosen": -136.57688903808594, + "logps_train/ref_chosen": -131.0, + "logps_train/ref_rejected": -91.5, + "logps_train/rejected": -102.67213439941406, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": -0.5452377796173096, + "rewards_train/margins": 0.577444314956665, + "rewards_train/rejected": -1.1226820945739746, + "step": 82 + }, + { + "epoch": 0.94, + "logps_train/chosen": -162.8528289794922, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -123.56939697265625, + "rewards_train/accuracies": 0.59375, + "rewards_train/chosen": -0.4730757772922516, + "rewards_train/margins": 0.4601331651210785, + "rewards_train/rejected": -0.9332089424133301, + "step": 82 + }, + { + "epoch": 0.94, + "logps_train/chosen": -152.0038299560547, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -115.0, + "logps_train/rejected": -127.97988891601562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.0006765499711036682, + "rewards_train/margins": 1.3138957843184471, + "rewards_train/rejected": -1.3145723342895508, + "step": 82 + }, + { + "epoch": 0.94, + "logps_train/chosen": -141.26974487304688, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -124.5, + "logps_train/rejected": -132.22300720214844, + "rewards_train/accuracies": 0.46875, + "rewards_train/chosen": -0.452718585729599, + "rewards_train/margins": 0.32729652523994446, + "rewards_train/rejected": -0.7800151109695435, + "step": 82 + }, + { + "epoch": 0.95, + "learning_rate": 3.991974075642621e-05, + "loss": 0.5842, + "step": 83 + }, + { + "epoch": 0.95, + "logps_train/chosen": -144.98745727539062, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -119.5, + "logps_train/rejected": -131.68789672851562, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.15765535831451416, + "rewards_train/margins": 1.060597538948059, + "rewards_train/rejected": -1.2182528972625732, + "step": 83 + }, + { + "epoch": 0.95, + "logps_train/chosen": -159.57968139648438, + "logps_train/ref_chosen": -157.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -131.57052612304688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.22476573288440704, + "rewards_train/margins": 1.0797476023435593, + "rewards_train/rejected": -1.3045133352279663, + "step": 83 + }, + { + "epoch": 0.95, + "logps_train/chosen": -138.2335968017578, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -133.75772094726562, + "rewards_train/accuracies": 0.8125, + "rewards_train/chosen": -0.14119389653205872, + "rewards_train/margins": 1.2634479701519012, + "rewards_train/rejected": -1.40464186668396, + "step": 83 + }, + { + "epoch": 0.95, + "logps_train/chosen": -145.84381103515625, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -147.16595458984375, + "rewards_train/accuracies": 0.65625, + "rewards_train/chosen": -0.2642488479614258, + "rewards_train/margins": 0.4790076017379761, + "rewards_train/rejected": -0.7432564496994019, + "step": 83 + }, + { + "epoch": 0.96, + "learning_rate": 3.96695053421277e-05, + "loss": 0.4781, + "step": 84 + }, + { + "epoch": 0.96, + "logps_train/chosen": -183.54966735839844, + "logps_train/ref_chosen": -177.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -143.7237548828125, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.6588726043701172, + "rewards_train/margins": 0.7602804899215698, + "rewards_train/rejected": -1.419153094291687, + "step": 84 + }, + { + "epoch": 0.96, + "logps_train/chosen": -118.30413055419922, + "logps_train/ref_chosen": -113.5, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -104.0281982421875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.46830376982688904, + "rewards_train/margins": 0.7527767717838287, + "rewards_train/rejected": -1.2210805416107178, + "step": 84 + }, + { + "epoch": 0.96, + "logps_train/chosen": -142.31927490234375, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -121.17881774902344, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.44811493158340454, + "rewards_train/margins": 0.5898844599723816, + "rewards_train/rejected": -1.0379993915557861, + "step": 84 + }, + { + "epoch": 0.96, + "logps_train/chosen": -162.7830047607422, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -145.0, + "logps_train/rejected": -154.27557373046875, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.023749448359012604, + "rewards_train/margins": 0.9083372130990028, + "rewards_train/rejected": -0.8845877647399902, + "step": 84 + }, + { + "epoch": 0.97, + "learning_rate": 3.941700805287168e-05, + "loss": 0.5488, + "step": 85 + }, + { + "epoch": 0.97, + "logps_train/chosen": -145.57501220703125, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -146.69969177246094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.0080266073346138, + "rewards_train/margins": 0.8595626428723335, + "rewards_train/rejected": -0.8515360355377197, + "step": 85 + }, + { + "epoch": 0.97, + "logps_train/chosen": -157.25, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -135.0, + "logps_train/rejected": -141.33470153808594, + "rewards_train/accuracies": 0.78125, + "rewards_train/chosen": 0.25292864441871643, + "rewards_train/margins": 0.8953824937343597, + "rewards_train/rejected": -0.6424538493156433, + "step": 85 + }, + { + "epoch": 0.97, + "logps_train/chosen": -156.912109375, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -123.60111999511719, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": 0.2099611759185791, + "rewards_train/margins": 0.9257863759994507, + "rewards_train/rejected": -0.7158252000808716, + "step": 85 + }, + { + "epoch": 0.97, + "logps_train/chosen": -148.09971618652344, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -110.16642761230469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.032508380711078644, + "rewards_train/margins": 1.2118464782834053, + "rewards_train/rejected": -1.1793380975723267, + "step": 85 + }, + { + "epoch": 0.98, + "learning_rate": 3.916228782093857e-05, + "loss": 0.4959, + "step": 86 + }, + { + "epoch": 0.98, + "logps_train/chosen": -134.76730346679688, + "logps_train/ref_chosen": -134.0, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -116.19819641113281, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": -0.11325030773878098, + "rewards_train/margins": 0.5541764572262764, + "rewards_train/rejected": -0.6674267649650574, + "step": 86 + }, + { + "epoch": 0.98, + "logps_train/chosen": -177.84475708007812, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -145.89642333984375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.3474574685096741, + "rewards_train/margins": 1.1841707825660706, + "rewards_train/rejected": -0.8367133140563965, + "step": 86 + }, + { + "epoch": 0.98, + "logps_train/chosen": -162.54910278320312, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -118.0, + "logps_train/rejected": -128.0146484375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.29553040862083435, + "rewards_train/margins": 1.3141840398311615, + "rewards_train/rejected": -1.0186536312103271, + "step": 86 + }, + { + "epoch": 0.98, + "logps_train/chosen": -159.21815490722656, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -137.9909210205078, + "rewards_train/accuracies": 0.6875, + "rewards_train/chosen": 0.2795528173446655, + "rewards_train/margins": 0.6489570438861847, + "rewards_train/rejected": -0.36940422654151917, + "step": 86 + }, + { + "epoch": 0.99, + "learning_rate": 3.890538392136188e-05, + "loss": 0.5114, + "step": 87 + }, + { + "epoch": 0.99, + "logps_train/chosen": -156.0184326171875, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -113.9042739868164, + "rewards_train/accuracies": 0.71875, + "rewards_train/chosen": -0.060827575623989105, + "rewards_train/margins": 0.9319678917527199, + "rewards_train/rejected": -0.992795467376709, + "step": 87 + }, + { + "epoch": 0.99, + "logps_train/chosen": -153.46629333496094, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -147.35433959960938, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": 0.40214991569519043, + "rewards_train/margins": 1.7021600008010864, + "rewards_train/rejected": -1.300010085105896, + "step": 87 + }, + { + "epoch": 0.99, + "logps_train/chosen": -137.97434997558594, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -142.22134399414062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.3973763883113861, + "rewards_train/margins": 1.8008342683315277, + "rewards_train/rejected": -1.4034578800201416, + "step": 87 + }, + { + "epoch": 0.99, + "logps_train/chosen": -149.7021026611328, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -150.05059814453125, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": 0.41885271668434143, + "rewards_train/margins": 2.0571157038211823, + "rewards_train/rejected": -1.6382629871368408, + "step": 87 + }, + { + "epoch": 1.01, + "learning_rate": 3.8646335965872414e-05, + "loss": 0.3438, + "step": 88 + }, + { + "epoch": 1.01, + "logps_train/chosen": -133.6719207763672, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -138.8072509765625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.3787682056427002, + "rewards_train/margins": 1.741170048713684, + "rewards_train/rejected": -1.3624018430709839, + "step": 88 + }, + { + "epoch": 1.01, + "logps_train/chosen": -175.55409240722656, + "logps_train/ref_chosen": -185.0, + "logps_train/ref_rejected": -160.0, + "logps_train/rejected": -170.5850067138672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9660758972167969, + "rewards_train/margins": 2.0684245824813843, + "rewards_train/rejected": -1.1023486852645874, + "step": 88 + }, + { + "epoch": 1.01, + "logps_train/chosen": -130.45492553710938, + "logps_train/ref_chosen": -135.0, + "logps_train/ref_rejected": -106.5, + "logps_train/rejected": -122.48587799072266, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": 0.4400542378425598, + "rewards_train/margins": 2.027607262134552, + "rewards_train/rejected": -1.5875530242919922, + "step": 88 + }, + { + "epoch": 1.01, + "logps_train/chosen": -139.69839477539062, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -145.28656005859375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": 0.48426252603530884, + "rewards_train/margins": 2.070731222629547, + "rewards_train/rejected": -1.5864686965942383, + "step": 88 + }, + { + "epoch": 1.02, + "learning_rate": 3.838518389679065e-05, + "loss": 0.223, + "step": 89 + }, + { + "epoch": 1.02, + "logps_train/chosen": -135.4624786376953, + "logps_train/ref_chosen": -131.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -144.20643615722656, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -0.40640440583229065, + "rewards_train/margins": 1.82263645529747, + "rewards_train/rejected": -2.2290408611297607, + "step": 89 + }, + { + "epoch": 1.02, + "logps_train/chosen": -151.22457885742188, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -158.74038696289062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1002472639083862, + "rewards_train/margins": 2.5626660585403442, + "rewards_train/rejected": -1.462418794631958, + "step": 89 + }, + { + "epoch": 1.02, + "logps_train/chosen": -152.0464324951172, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -123.42019653320312, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": 1.0309046506881714, + "rewards_train/margins": 2.756957173347473, + "rewards_train/rejected": -1.7260525226593018, + "step": 89 + }, + { + "epoch": 1.02, + "logps_train/chosen": -142.5914764404297, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -130.45376586914062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0303058624267578, + "rewards_train/margins": 2.694628357887268, + "rewards_train/rejected": -1.6643224954605103, + "step": 89 + }, + { + "epoch": 1.03, + "learning_rate": 3.812196798086799e-05, + "loss": 0.2086, + "step": 90 + }, + { + "epoch": 1.03, + "logps_train/chosen": -122.85346221923828, + "logps_train/ref_chosen": -127.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -120.95262908935547, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": 0.44590410590171814, + "rewards_train/margins": 2.2182177007198334, + "rewards_train/rejected": -1.7723135948181152, + "step": 90 + }, + { + "epoch": 1.03, + "logps_train/chosen": -135.0833282470703, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -148.67274475097656, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": 0.5127615928649902, + "rewards_train/margins": 1.8486876487731934, + "rewards_train/rejected": -1.3359260559082031, + "step": 90 + }, + { + "epoch": 1.03, + "logps_train/chosen": -127.56909942626953, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -137.51315307617188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.8834019899368286, + "rewards_train/margins": 2.3031264543533325, + "rewards_train/rejected": -1.419724464416504, + "step": 90 + }, + { + "epoch": 1.03, + "logps_train/chosen": -157.03421020507812, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -134.88729858398438, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.7317352294921875, + "rewards_train/margins": 2.5376535654067993, + "rewards_train/rejected": -1.8059183359146118, + "step": 90 + }, + { + "epoch": 1.04, + "learning_rate": 3.785672880307817e-05, + "loss": 0.2009, + "step": 91 + }, + { + "epoch": 1.04, + "logps_train/chosen": -175.03244018554688, + "logps_train/ref_chosen": -183.0, + "logps_train/ref_rejected": -142.0, + "logps_train/rejected": -156.64846801757812, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": 0.8252712488174438, + "rewards_train/margins": 2.3022282123565674, + "rewards_train/rejected": -1.4769569635391235, + "step": 91 + }, + { + "epoch": 1.04, + "logps_train/chosen": -134.37451171875, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -126.92540740966797, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.2630377411842346, + "rewards_train/margins": 1.791417419910431, + "rewards_train/rejected": -1.5283796787261963, + "step": 91 + }, + { + "epoch": 1.04, + "logps_train/chosen": -122.10877227783203, + "logps_train/ref_chosen": -121.0, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -109.1966552734375, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -0.07845546305179596, + "rewards_train/margins": 1.5968745797872543, + "rewards_train/rejected": -1.6753300428390503, + "step": 91 + }, + { + "epoch": 1.04, + "logps_train/chosen": -175.49851989746094, + "logps_train/ref_chosen": -181.0, + "logps_train/ref_rejected": -135.0, + "logps_train/rejected": -151.86138916015625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.5926284790039062, + "rewards_train/margins": 2.2545496225357056, + "rewards_train/rejected": -1.6619211435317993, + "step": 91 + }, + { + "epoch": 1.05, + "learning_rate": 3.7589507260359404e-05, + "loss": 0.2275, + "step": 92 + }, + { + "epoch": 1.05, + "logps_train/chosen": -149.17735290527344, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -150.0, + "logps_train/rejected": -168.39157104492188, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": 0.5334359407424927, + "rewards_train/margins": 2.4423190355300903, + "rewards_train/rejected": -1.9088830947875977, + "step": 92 + }, + { + "epoch": 1.05, + "logps_train/chosen": -128.9286346435547, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -155.26699829101562, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.3554763197898865, + "rewards_train/margins": 2.472897946834564, + "rewards_train/rejected": -2.1174216270446777, + "step": 92 + }, + { + "epoch": 1.05, + "logps_train/chosen": -126.43780517578125, + "logps_train/ref_chosen": -129.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -122.29857635498047, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.2506530284881592, + "rewards_train/margins": 2.3148858547210693, + "rewards_train/rejected": -2.06423282623291, + "step": 92 + }, + { + "epoch": 1.05, + "logps_train/chosen": -155.54661560058594, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -121.3519515991211, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8316656351089478, + "rewards_train/margins": 2.5136386156082153, + "rewards_train/rejected": -1.6819729804992676, + "step": 92 + }, + { + "epoch": 1.06, + "learning_rate": 3.732034455530863e-05, + "loss": 0.1687, + "step": 93 + }, + { + "epoch": 1.06, + "logps_train/chosen": -180.7027587890625, + "logps_train/ref_chosen": -190.0, + "logps_train/ref_rejected": -171.0, + "logps_train/rejected": -185.62432861328125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.9221908450126648, + "rewards_train/margins": 2.366423189640045, + "rewards_train/rejected": -1.4442323446273804, + "step": 93 + }, + { + "epoch": 1.06, + "logps_train/chosen": -174.7982940673828, + "logps_train/ref_chosen": -183.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -162.9814453125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.7939980030059814, + "rewards_train/margins": 2.73032546043396, + "rewards_train/rejected": -1.9363274574279785, + "step": 93 + }, + { + "epoch": 1.06, + "logps_train/chosen": -133.31781005859375, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -116.48727416992188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.2709043323993683, + "rewards_train/margins": 2.5305700600147247, + "rewards_train/rejected": -2.2596657276153564, + "step": 93 + }, + { + "epoch": 1.06, + "logps_train/chosen": -156.94393920898438, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -120.5, + "logps_train/rejected": -139.75140380859375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": 0.225722536444664, + "rewards_train/margins": 2.1422575563192368, + "rewards_train/rejected": -1.9165350198745728, + "step": 93 + }, + { + "epoch": 1.07, + "learning_rate": 3.704928218982845e-05, + "loss": 0.1829, + "step": 94 + }, + { + "epoch": 1.07, + "logps_train/chosen": -139.14205932617188, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -113.5, + "logps_train/rejected": -131.75674438476562, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": 0.3425329923629761, + "rewards_train/margins": 2.1845154762268066, + "rewards_train/rejected": -1.8419824838638306, + "step": 94 + }, + { + "epoch": 1.07, + "logps_train/chosen": -145.99407958984375, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -156.46421813964844, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": 0.3601621985435486, + "rewards_train/margins": 2.0587562918663025, + "rewards_train/rejected": -1.698594093322754, + "step": 94 + }, + { + "epoch": 1.07, + "logps_train/chosen": -125.1644287109375, + "logps_train/ref_chosen": -124.0, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -120.51481628417969, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -0.10804449021816254, + "rewards_train/margins": 2.082010880112648, + "rewards_train/rejected": -2.1900553703308105, + "step": 94 + }, + { + "epoch": 1.07, + "logps_train/chosen": -156.272216796875, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -144.20132446289062, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.561450183391571, + "rewards_train/margins": 2.499405324459076, + "rewards_train/rejected": -1.9379551410675049, + "step": 94 + }, + { + "epoch": 1.09, + "learning_rate": 3.677636195872802e-05, + "loss": 0.2071, + "step": 95 + }, + { + "epoch": 1.09, + "logps_train/chosen": -165.90792846679688, + "logps_train/ref_chosen": -173.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -166.0856170654297, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": 0.7180935144424438, + "rewards_train/margins": 2.9360300302505493, + "rewards_train/rejected": -2.2179365158081055, + "step": 95 + }, + { + "epoch": 1.09, + "logps_train/chosen": -143.52719116210938, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -130.509521484375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.600570797920227, + "rewards_train/margins": 2.178702235221863, + "rewards_train/rejected": -2.77927303314209, + "step": 95 + }, + { + "epoch": 1.09, + "logps_train/chosen": -168.9429931640625, + "logps_train/ref_chosen": -175.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -131.9114532470703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6327519416809082, + "rewards_train/margins": 3.1334948539733887, + "rewards_train/rejected": -2.5007429122924805, + "step": 95 + }, + { + "epoch": 1.09, + "logps_train/chosen": -161.4586944580078, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -108.0, + "logps_train/rejected": -135.01678466796875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.04224124550819397, + "rewards_train/margins": 2.7115966379642487, + "rewards_train/rejected": -2.6693553924560547, + "step": 95 + }, + { + "epoch": 1.1, + "learning_rate": 3.6501625943278805e-05, + "loss": 0.1683, + "step": 96 + }, + { + "epoch": 1.1, + "logps_train/chosen": -166.95816040039062, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -142.0, + "logps_train/rejected": -168.36917114257812, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.10113706439733505, + "rewards_train/margins": 2.489418126642704, + "rewards_train/rejected": -2.590555191040039, + "step": 96 + }, + { + "epoch": 1.1, + "logps_train/chosen": -159.73220825195312, + "logps_train/ref_chosen": -161.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -146.8890380859375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": 0.12797528505325317, + "rewards_train/margins": 2.4644392132759094, + "rewards_train/rejected": -2.3364639282226562, + "step": 96 + }, + { + "epoch": 1.1, + "logps_train/chosen": -141.54803466796875, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -137.75836181640625, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.1825377345085144, + "rewards_train/margins": 2.6527701020240784, + "rewards_train/rejected": -2.8353078365325928, + "step": 96 + }, + { + "epoch": 1.1, + "logps_train/chosen": -167.2300262451172, + "logps_train/ref_chosen": -167.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -154.9856414794922, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": 0.023921027779579163, + "rewards_train/margins": 2.4235100895166397, + "rewards_train/rejected": -2.3995890617370605, + "step": 96 + }, + { + "epoch": 1.11, + "learning_rate": 3.622511650472601e-05, + "loss": 0.1993, + "step": 97 + }, + { + "epoch": 1.11, + "logps_train/chosen": -145.75787353515625, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -143.81246948242188, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": 0.23278886079788208, + "rewards_train/margins": 2.254021942615509, + "rewards_train/rejected": -2.021233081817627, + "step": 97 + }, + { + "epoch": 1.11, + "logps_train/chosen": -134.80609130859375, + "logps_train/ref_chosen": -135.0, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -121.690185546875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.004826478660106659, + "rewards_train/margins": 2.3503986075520515, + "rewards_train/rejected": -2.355225086212158, + "step": 97 + }, + { + "epoch": 1.11, + "logps_train/chosen": -122.42782592773438, + "logps_train/ref_chosen": -118.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -128.15924072265625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.4224705398082733, + "rewards_train/margins": 2.235249310731888, + "rewards_train/rejected": -2.657719850540161, + "step": 97 + }, + { + "epoch": 1.11, + "logps_train/chosen": -164.96200561523438, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -157.0484619140625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": 0.057143136858940125, + "rewards_train/margins": 2.3765157908201218, + "rewards_train/rejected": -2.3193726539611816, + "step": 97 + }, + { + "epoch": 1.12, + "learning_rate": 3.5946876277757066e-05, + "loss": 0.2219, + "step": 98 + }, + { + "epoch": 1.12, + "logps_train/chosen": -157.20069885253906, + "logps_train/ref_chosen": -157.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -158.66604614257812, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.008192479610443115, + "rewards_train/margins": 2.5320449471473694, + "rewards_train/rejected": -2.5402374267578125, + "step": 98 + }, + { + "epoch": 1.12, + "logps_train/chosen": -168.69036865234375, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -165.33319091796875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.05536573380231857, + "rewards_train/margins": 2.6265848353505135, + "rewards_train/rejected": -2.681950569152832, + "step": 98 + }, + { + "epoch": 1.12, + "logps_train/chosen": -150.9794464111328, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -150.70811462402344, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.5467734336853027, + "rewards_train/margins": 2.44610857963562, + "rewards_train/rejected": -2.992882013320923, + "step": 98 + }, + { + "epoch": 1.12, + "logps_train/chosen": -137.99195861816406, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -140.39208984375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.13567104935646057, + "rewards_train/margins": 2.4768291413784027, + "rewards_train/rejected": -2.6125001907348633, + "step": 98 + }, + { + "epoch": 1.13, + "learning_rate": 3.5666948163927716e-05, + "loss": 0.1803, + "step": 99 + }, + { + "epoch": 1.13, + "logps_train/chosen": -146.2452392578125, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -120.5, + "logps_train/rejected": -150.16941833496094, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.3603140115737915, + "rewards_train/margins": 2.619420647621155, + "rewards_train/rejected": -2.9797346591949463, + "step": 99 + }, + { + "epoch": 1.13, + "logps_train/chosen": -126.06419372558594, + "logps_train/ref_chosen": -121.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -114.22348022460938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.49977877736091614, + "rewards_train/margins": 2.319640129804611, + "rewards_train/rejected": -2.8194189071655273, + "step": 99 + }, + { + "epoch": 1.13, + "logps_train/chosen": -132.54383850097656, + "logps_train/ref_chosen": -125.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -168.75698852539062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7930556535720825, + "rewards_train/margins": 2.4375261068344116, + "rewards_train/rejected": -3.230581760406494, + "step": 99 + }, + { + "epoch": 1.13, + "logps_train/chosen": -130.33595275878906, + "logps_train/ref_chosen": -124.0, + "logps_train/ref_rejected": -93.5, + "logps_train/rejected": -121.05641174316406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6355488896369934, + "rewards_train/margins": 2.140502631664276, + "rewards_train/rejected": -2.7760515213012695, + "step": 99 + }, + { + "epoch": 1.14, + "learning_rate": 3.5385375325047166e-05, + "loss": 0.2181, + "step": 100 + }, + { + "epoch": 1.14, + "logps_train/chosen": -173.3467254638672, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -161.84597778320312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3217822313308716, + "rewards_train/margins": 2.759886145591736, + "rewards_train/rejected": -3.0816683769226074, + "step": 100 + }, + { + "epoch": 1.14, + "logps_train/chosen": -180.46633911132812, + "logps_train/ref_chosen": -177.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -174.64852905273438, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.31440621614456177, + "rewards_train/margins": 2.7816969752311707, + "rewards_train/rejected": -3.0961031913757324, + "step": 100 + }, + { + "epoch": 1.14, + "logps_train/chosen": -149.21096801757812, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -135.708251953125, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.562260091304779, + "rewards_train/margins": 2.4379597306251526, + "rewards_train/rejected": -3.0002198219299316, + "step": 100 + }, + { + "epoch": 1.14, + "logps_train/chosen": -165.3883819580078, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -147.68844604492188, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.2641317844390869, + "rewards_train/margins": 2.305885076522827, + "rewards_train/rejected": -2.570016860961914, + "step": 100 + }, + { + "epoch": 1.15, + "learning_rate": 3.510220117652297e-05, + "loss": 0.1737, + "step": 101 + }, + { + "epoch": 1.15, + "logps_train/chosen": -153.82940673828125, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -131.58132934570312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.41795065999031067, + "rewards_train/margins": 3.12226340174675, + "rewards_train/rejected": -3.5402140617370605, + "step": 101 + }, + { + "epoch": 1.15, + "logps_train/chosen": -149.13304138183594, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -142.9083251953125, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.5003165006637573, + "rewards_train/margins": 2.4736016988754272, + "rewards_train/rejected": -2.9739181995391846, + "step": 101 + }, + { + "epoch": 1.15, + "logps_train/chosen": -185.08547973632812, + "logps_train/ref_chosen": -181.0, + "logps_train/ref_rejected": -164.0, + "logps_train/rejected": -201.46383666992188, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.43471941351890564, + "rewards_train/margins": 3.3151794970035553, + "rewards_train/rejected": -3.749898910522461, + "step": 101 + }, + { + "epoch": 1.15, + "logps_train/chosen": -143.16180419921875, + "logps_train/ref_chosen": -135.0, + "logps_train/ref_rejected": -119.0, + "logps_train/rejected": -152.1904754638672, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.8441107273101807, + "rewards_train/margins": 2.4427101612091064, + "rewards_train/rejected": -3.286820888519287, + "step": 101 + }, + { + "epoch": 1.17, + "learning_rate": 3.481746938066684e-05, + "loss": 0.1572, + "step": 102 + }, + { + "epoch": 1.17, + "logps_train/chosen": -169.8740234375, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -151.0, + "logps_train/rejected": -182.55886840820312, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": -0.9510747790336609, + "rewards_train/margins": 2.172243893146515, + "rewards_train/rejected": -3.123318672180176, + "step": 102 + }, + { + "epoch": 1.17, + "logps_train/chosen": -140.3857421875, + "logps_train/ref_chosen": -131.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -151.04632568359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9004732966423035, + "rewards_train/margins": 2.5984941124916077, + "rewards_train/rejected": -3.498967409133911, + "step": 102 + }, + { + "epoch": 1.17, + "logps_train/chosen": -180.0685577392578, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -165.80479431152344, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.9504103660583496, + "rewards_train/margins": 3.016885280609131, + "rewards_train/rejected": -3.9672956466674805, + "step": 102 + }, + { + "epoch": 1.17, + "logps_train/chosen": -159.4775390625, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -159.30438232421875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.7955068349838257, + "rewards_train/margins": 2.654363751411438, + "rewards_train/rejected": -3.4498705863952637, + "step": 102 + }, + { + "epoch": 1.18, + "learning_rate": 3.4531223839962453e-05, + "loss": 0.1811, + "step": 103 + }, + { + "epoch": 1.18, + "logps_train/chosen": -126.63336181640625, + "logps_train/ref_chosen": -117.5, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -131.54600524902344, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.9199284911155701, + "rewards_train/margins": 3.021292269229889, + "rewards_train/rejected": -3.941220760345459, + "step": 103 + }, + { + "epoch": 1.18, + "logps_train/chosen": -138.7056121826172, + "logps_train/ref_chosen": -131.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -143.95974731445312, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.7718613147735596, + "rewards_train/margins": 3.3114187717437744, + "rewards_train/rejected": -4.083280086517334, + "step": 103 + }, + { + "epoch": 1.18, + "logps_train/chosen": -170.283447265625, + "logps_train/ref_chosen": -167.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -173.4869384765625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.28478938341140747, + "rewards_train/margins": 3.222693145275116, + "rewards_train/rejected": -3.5074825286865234, + "step": 103 + }, + { + "epoch": 1.18, + "logps_train/chosen": -186.765380859375, + "logps_train/ref_chosen": -179.0, + "logps_train/ref_rejected": -148.0, + "logps_train/rejected": -184.8590850830078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7643318176269531, + "rewards_train/margins": 2.9727487564086914, + "rewards_train/rejected": -3.7370805740356445, + "step": 103 + }, + { + "epoch": 1.19, + "learning_rate": 3.4243508690296135e-05, + "loss": 0.1504, + "step": 104 + }, + { + "epoch": 1.19, + "logps_train/chosen": -132.2598876953125, + "logps_train/ref_chosen": -122.5, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -142.23316955566406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9834104180335999, + "rewards_train/margins": 2.9381489157676697, + "rewards_train/rejected": -3.9215593338012695, + "step": 104 + }, + { + "epoch": 1.19, + "logps_train/chosen": -133.6953887939453, + "logps_train/ref_chosen": -126.5, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -161.70277404785156, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.7267659902572632, + "rewards_train/margins": 2.867143750190735, + "rewards_train/rejected": -3.593909740447998, + "step": 104 + }, + { + "epoch": 1.19, + "logps_train/chosen": -148.66311645507812, + "logps_train/ref_chosen": -141.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -142.30751037597656, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.7122358083724976, + "rewards_train/margins": 3.461971879005432, + "rewards_train/rejected": -4.17420768737793, + "step": 104 + }, + { + "epoch": 1.19, + "logps_train/chosen": -169.8518829345703, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -122.5, + "logps_train/rejected": -159.99441528320312, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.9660478830337524, + "rewards_train/margins": 2.7869099378585815, + "rewards_train/rejected": -3.752957820892334, + "step": 104 + }, + { + "epoch": 1.2, + "learning_rate": 3.39543682941516e-05, + "loss": 0.1721, + "step": 105 + }, + { + "epoch": 1.2, + "logps_train/chosen": -163.95489501953125, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -164.74134826660156, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.6238088607788086, + "rewards_train/margins": 2.583432197570801, + "rewards_train/rejected": -4.207241058349609, + "step": 105 + }, + { + "epoch": 1.2, + "logps_train/chosen": -170.38116455078125, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -171.26705932617188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.8710801005363464, + "rewards_train/margins": 3.070517122745514, + "rewards_train/rejected": -3.9415972232818604, + "step": 105 + }, + { + "epoch": 1.2, + "logps_train/chosen": -171.02854919433594, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -151.22903442382812, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.7450417280197144, + "rewards_train/margins": 2.1690722703933716, + "rewards_train/rejected": -3.914113998413086, + "step": 105 + }, + { + "epoch": 1.2, + "logps_train/chosen": -162.40505981445312, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -158.7891845703125, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.560684084892273, + "rewards_train/margins": 2.450362801551819, + "rewards_train/rejected": -4.011046886444092, + "step": 105 + }, + { + "epoch": 1.21, + "learning_rate": 3.366384723376977e-05, + "loss": 0.2371, + "step": 106 + }, + { + "epoch": 1.21, + "logps_train/chosen": -119.2367935180664, + "logps_train/ref_chosen": -99.5, + "logps_train/ref_rejected": -99.5, + "logps_train/rejected": -139.24002075195312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9458470344543457, + "rewards_train/margins": 2.0191712379455566, + "rewards_train/rejected": -3.9650182723999023, + "step": 106 + }, + { + "epoch": 1.21, + "logps_train/chosen": -144.74407958984375, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -113.5, + "logps_train/rejected": -153.19857788085938, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.4341243505477905, + "rewards_train/margins": 2.543986439704895, + "rewards_train/rejected": -3.9781107902526855, + "step": 106 + }, + { + "epoch": 1.21, + "logps_train/chosen": -190.86489868164062, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -181.99560546875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.8407862186431885, + "rewards_train/margins": 3.3185408115386963, + "rewards_train/rejected": -4.159327030181885, + "step": 106 + }, + { + "epoch": 1.21, + "logps_train/chosen": -173.3730926513672, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -179.78939819335938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.8697314262390137, + "rewards_train/margins": 3.73850679397583, + "rewards_train/rejected": -4.608238220214844, + "step": 106 + }, + { + "epoch": 1.22, + "learning_rate": 3.3371990304274656e-05, + "loss": 0.1757, + "step": 107 + }, + { + "epoch": 1.22, + "logps_train/chosen": -186.10415649414062, + "logps_train/ref_chosen": -177.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -159.8262939453125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.9052414894104004, + "rewards_train/margins": 3.8516058921813965, + "rewards_train/rejected": -4.756847381591797, + "step": 107 + }, + { + "epoch": 1.22, + "logps_train/chosen": -146.3466339111328, + "logps_train/ref_chosen": -134.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -157.24111938476562, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.206477165222168, + "rewards_train/margins": 2.8152427673339844, + "rewards_train/rejected": -4.021719932556152, + "step": 107 + }, + { + "epoch": 1.22, + "logps_train/chosen": -172.4299774169922, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -143.0, + "logps_train/rejected": -178.18833923339844, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.0049124956130981, + "rewards_train/margins": 2.5578664541244507, + "rewards_train/rejected": -3.562778949737549, + "step": 107 + }, + { + "epoch": 1.22, + "logps_train/chosen": -148.16546630859375, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -114.5, + "logps_train/rejected": -157.4647216796875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.6409605741500854, + "rewards_train/margins": 2.648480534553528, + "rewards_train/rejected": -4.289441108703613, + "step": 107 + }, + { + "epoch": 1.23, + "learning_rate": 3.3078842506766484e-05, + "loss": 0.1539, + "step": 108 + }, + { + "epoch": 1.23, + "logps_train/chosen": -138.66619873046875, + "logps_train/ref_chosen": -120.0, + "logps_train/ref_rejected": -126.5, + "logps_train/rejected": -176.2493896484375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8791207075119019, + "rewards_train/margins": 3.097185254096985, + "rewards_train/rejected": -4.976305961608887, + "step": 108 + }, + { + "epoch": 1.23, + "logps_train/chosen": -143.60552978515625, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -110.5, + "logps_train/rejected": -149.5123291015625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.4019588232040405, + "rewards_train/margins": 2.506013035774231, + "rewards_train/rejected": -3.9079718589782715, + "step": 108 + }, + { + "epoch": 1.23, + "logps_train/chosen": -157.93670654296875, + "logps_train/ref_chosen": -141.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -159.3701171875, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.7084167003631592, + "rewards_train/margins": 3.0381643772125244, + "rewards_train/rejected": -4.746581077575684, + "step": 108 + }, + { + "epoch": 1.23, + "logps_train/chosen": -199.24874877929688, + "logps_train/ref_chosen": -188.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -186.7269287109375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.0649135112762451, + "rewards_train/margins": 3.8796541690826416, + "rewards_train/rejected": -4.944567680358887, + "step": 108 + }, + { + "epoch": 1.25, + "learning_rate": 3.278444904138297e-05, + "loss": 0.1546, + "step": 109 + }, + { + "epoch": 1.25, + "logps_train/chosen": -143.3296661376953, + "logps_train/ref_chosen": -119.5, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -155.20834350585938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.4022021293640137, + "rewards_train/margins": 2.016094207763672, + "rewards_train/rejected": -4.4182963371276855, + "step": 109 + }, + { + "epoch": 1.25, + "logps_train/chosen": -152.14297485351562, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -163.31515502929688, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -2.001040458679199, + "rewards_train/margins": 2.6719303131103516, + "rewards_train/rejected": -4.672970771789551, + "step": 109 + }, + { + "epoch": 1.25, + "logps_train/chosen": -147.17257690429688, + "logps_train/ref_chosen": -126.5, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -133.1774444580078, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -2.0941858291625977, + "rewards_train/margins": 2.6622066497802734, + "rewards_train/rejected": -4.756392478942871, + "step": 109 + }, + { + "epoch": 1.25, + "logps_train/chosen": -170.54586791992188, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -167.38327026367188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.1571266651153564, + "rewards_train/margins": 3.347363233566284, + "rewards_train/rejected": -4.504489898681641, + "step": 109 + }, + { + "epoch": 1.26, + "learning_rate": 3.248885530033004e-05, + "loss": 0.2248, + "step": 110 + }, + { + "epoch": 1.26, + "logps_train/chosen": -170.85304260253906, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -174.83726501464844, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.8820815086364746, + "rewards_train/margins": 3.082406520843506, + "rewards_train/rejected": -4.9644880294799805, + "step": 110 + }, + { + "epoch": 1.26, + "logps_train/chosen": -141.23141479492188, + "logps_train/ref_chosen": -121.0, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -149.21527099609375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.9987776279449463, + "rewards_train/margins": 2.5813276767730713, + "rewards_train/rejected": -4.580105304718018, + "step": 110 + }, + { + "epoch": 1.26, + "logps_train/chosen": -150.4207000732422, + "logps_train/ref_chosen": -133.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -141.09347534179688, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.7420827150344849, + "rewards_train/margins": 2.7829874753952026, + "rewards_train/rejected": -4.5250701904296875, + "step": 110 + }, + { + "epoch": 1.26, + "logps_train/chosen": -154.02862548828125, + "logps_train/ref_chosen": -133.0, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -146.64706420898438, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -2.1366517543792725, + "rewards_train/margins": 2.7122819423675537, + "rewards_train/rejected": -4.848933696746826, + "step": 110 + }, + { + "epoch": 1.27, + "learning_rate": 3.219210686088278e-05, + "loss": 0.1783, + "step": 111 + }, + { + "epoch": 1.27, + "logps_train/chosen": -196.1944580078125, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -127.0, + "logps_train/rejected": -177.0728759765625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4467411041259766, + "rewards_train/margins": 3.594043731689453, + "rewards_train/rejected": -5.04078483581543, + "step": 111 + }, + { + "epoch": 1.27, + "logps_train/chosen": -154.63021850585938, + "logps_train/ref_chosen": -131.0, + "logps_train/ref_rejected": -107.5, + "logps_train/rejected": -155.19606018066406, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.4006195068359375, + "rewards_train/margins": 2.3893957138061523, + "rewards_train/rejected": -4.79001522064209, + "step": 111 + }, + { + "epoch": 1.27, + "logps_train/chosen": -206.43435668945312, + "logps_train/ref_chosen": -196.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -181.19366455078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0297620296478271, + "rewards_train/margins": 3.8278849124908447, + "rewards_train/rejected": -4.857646942138672, + "step": 111 + }, + { + "epoch": 1.27, + "logps_train/chosen": -200.61825561523438, + "logps_train/ref_chosen": -185.0, + "logps_train/ref_rejected": -147.0, + "logps_train/rejected": -197.92047119140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5702245235443115, + "rewards_train/margins": 3.5200655460357666, + "rewards_train/rejected": -5.090290069580078, + "step": 111 + }, + { + "epoch": 1.28, + "learning_rate": 3.1894249478357965e-05, + "loss": 0.1124, + "step": 112 + }, + { + "epoch": 1.28, + "logps_train/chosen": -184.96241760253906, + "logps_train/ref_chosen": -171.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -177.40328979492188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.3922240734100342, + "rewards_train/margins": 3.4400475025177, + "rewards_train/rejected": -4.832271575927734, + "step": 112 + }, + { + "epoch": 1.28, + "logps_train/chosen": -190.53353881835938, + "logps_train/ref_chosen": -175.0, + "logps_train/ref_rejected": -113.5, + "logps_train/rejected": -161.22642517089844, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.5588231086730957, + "rewards_train/margins": 3.2116708755493164, + "rewards_train/rejected": -4.770493984222412, + "step": 112 + }, + { + "epoch": 1.28, + "logps_train/chosen": -162.96185302734375, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -184.81460571289062, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.5726509094238281, + "rewards_train/margins": 3.1111040115356445, + "rewards_train/rejected": -4.683754920959473, + "step": 112 + }, + { + "epoch": 1.28, + "logps_train/chosen": -188.84292602539062, + "logps_train/ref_chosen": -174.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -189.7190399169922, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.4589011669158936, + "rewards_train/margins": 3.8735506534576416, + "rewards_train/rejected": -5.332451820373535, + "step": 112 + }, + { + "epoch": 1.29, + "learning_rate": 3.15953290790591e-05, + "loss": 0.1447, + "step": 113 + }, + { + "epoch": 1.29, + "logps_train/chosen": -155.69085693359375, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -144.44094848632812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.01090669631958, + "rewards_train/margins": 3.0220065116882324, + "rewards_train/rejected": -5.0329132080078125, + "step": 113 + }, + { + "epoch": 1.29, + "logps_train/chosen": -176.43182373046875, + "logps_train/ref_chosen": -157.0, + "logps_train/ref_rejected": -145.0, + "logps_train/rejected": -196.0265655517578, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.9890806674957275, + "rewards_train/margins": 3.118849515914917, + "rewards_train/rejected": -5.1079301834106445, + "step": 113 + }, + { + "epoch": 1.29, + "logps_train/chosen": -147.64617919921875, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -158.9957733154297, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.8295596837997437, + "rewards_train/margins": 2.9519516229629517, + "rewards_train/rejected": -4.781511306762695, + "step": 113 + }, + { + "epoch": 1.29, + "logps_train/chosen": -201.11227416992188, + "logps_train/ref_chosen": -191.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -172.78042602539062, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.9713826775550842, + "rewards_train/margins": 3.347773015499115, + "rewards_train/rejected": -4.319155693054199, + "step": 113 + }, + { + "epoch": 1.3, + "learning_rate": 3.1295391753195047e-05, + "loss": 0.1702, + "step": 114 + }, + { + "epoch": 1.3, + "logps_train/chosen": -183.66317749023438, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -176.61851501464844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3601659536361694, + "rewards_train/margins": 3.3909436464309692, + "rewards_train/rejected": -4.751109600067139, + "step": 114 + }, + { + "epoch": 1.3, + "logps_train/chosen": -190.912353515625, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -192.1277618408203, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -2.485377311706543, + "rewards_train/margins": 2.946054458618164, + "rewards_train/rejected": -5.431431770324707, + "step": 114 + }, + { + "epoch": 1.3, + "logps_train/chosen": -142.88148498535156, + "logps_train/ref_chosen": -129.0, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -151.7729949951172, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.4135019779205322, + "rewards_train/margins": 3.1451447010040283, + "rewards_train/rejected": -4.5586466789245605, + "step": 114 + }, + { + "epoch": 1.3, + "logps_train/chosen": -195.8353271484375, + "logps_train/ref_chosen": -181.0, + "logps_train/ref_rejected": -157.0, + "logps_train/rejected": -207.72579956054688, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.4638062715530396, + "rewards_train/margins": 3.5900248289108276, + "rewards_train/rejected": -5.053831100463867, + "step": 114 + }, + { + "epoch": 1.31, + "learning_rate": 3.099448374777351e-05, + "loss": 0.1436, + "step": 115 + }, + { + "epoch": 1.31, + "logps_train/chosen": -202.21957397460938, + "logps_train/ref_chosen": -187.0, + "logps_train/ref_rejected": -158.0, + "logps_train/rejected": -206.32733154296875, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": -1.5319174528121948, + "rewards_train/margins": 3.348861813545227, + "rewards_train/rejected": -4.880779266357422, + "step": 115 + }, + { + "epoch": 1.31, + "logps_train/chosen": -164.8944091796875, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -145.25302124023438, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.778893232345581, + "rewards_train/margins": 3.073557138442993, + "rewards_train/rejected": -4.852450370788574, + "step": 115 + }, + { + "epoch": 1.31, + "logps_train/chosen": -160.88668823242188, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -163.2848358154297, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": -2.420285224914551, + "rewards_train/margins": 2.5804386138916016, + "rewards_train/rejected": -5.000723838806152, + "step": 115 + }, + { + "epoch": 1.31, + "logps_train/chosen": -188.2561492919922, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -169.0, + "logps_train/rejected": -224.9777069091797, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.849443793296814, + "rewards_train/margins": 3.7523800134658813, + "rewards_train/rejected": -5.601823806762695, + "step": 115 + }, + { + "epoch": 1.33, + "learning_rate": 3.069265145947016e-05, + "loss": 0.2013, + "step": 116 + }, + { + "epoch": 1.33, + "logps_train/chosen": -159.26284790039062, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -178.18568420410156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1178855895996094, + "rewards_train/margins": 3.3582873344421387, + "rewards_train/rejected": -5.476172924041748, + "step": 116 + }, + { + "epoch": 1.33, + "logps_train/chosen": -158.87692260742188, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -163.0965118408203, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.6447348594665527, + "rewards_train/margins": 3.0411858558654785, + "rewards_train/rejected": -4.685920715332031, + "step": 116 + }, + { + "epoch": 1.33, + "logps_train/chosen": -182.80258178710938, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -177.33001708984375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.4982264041900635, + "rewards_train/margins": 3.7515709400177, + "rewards_train/rejected": -5.249797344207764, + "step": 116 + }, + { + "epoch": 1.33, + "logps_train/chosen": -167.61244201660156, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -161.88333129882812, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -2.1869282722473145, + "rewards_train/margins": 3.1240487098693848, + "rewards_train/rejected": -5.310976982116699, + "step": 116 + }, + { + "epoch": 1.34, + "learning_rate": 3.0389941427474873e-05, + "loss": 0.1413, + "step": 117 + }, + { + "epoch": 1.34, + "logps_train/chosen": -187.91346740722656, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -195.27867126464844, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.616736650466919, + "rewards_train/margins": 3.5172088146209717, + "rewards_train/rejected": -5.133945465087891, + "step": 117 + }, + { + "epoch": 1.34, + "logps_train/chosen": -151.50758361816406, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -147.73008728027344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9189229011535645, + "rewards_train/margins": 2.8163905143737793, + "rewards_train/rejected": -4.735313415527344, + "step": 117 + }, + { + "epoch": 1.34, + "logps_train/chosen": -169.54360961914062, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -167.17601013183594, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -2.2594144344329834, + "rewards_train/margins": 3.2765467166900635, + "rewards_train/rejected": -5.535961151123047, + "step": 117 + }, + { + "epoch": 1.34, + "logps_train/chosen": -198.6015625, + "logps_train/ref_chosen": -181.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -195.97879028320312, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.7592782974243164, + "rewards_train/margins": 3.4153575897216797, + "rewards_train/rejected": -5.174635887145996, + "step": 117 + }, + { + "epoch": 1.35, + "learning_rate": 3.008640032631585e-05, + "loss": 0.1433, + "step": 118 + }, + { + "epoch": 1.35, + "logps_train/chosen": -169.9024200439453, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -191.21194458007812, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.6621170043945312, + "rewards_train/margins": 3.6406192779541016, + "rewards_train/rejected": -5.302736282348633, + "step": 118 + }, + { + "epoch": 1.35, + "logps_train/chosen": -144.08741760253906, + "logps_train/ref_chosen": -127.0, + "logps_train/ref_rejected": -108.0, + "logps_train/rejected": -157.87893676757812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7102066278457642, + "rewards_train/margins": 3.2556179761886597, + "rewards_train/rejected": -4.965824604034424, + "step": 118 + }, + { + "epoch": 1.35, + "logps_train/chosen": -137.56689453125, + "logps_train/ref_chosen": -117.5, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -156.909423828125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.0094242095947266, + "rewards_train/margins": 3.6044678688049316, + "rewards_train/rejected": -5.613892078399658, + "step": 118 + }, + { + "epoch": 1.35, + "logps_train/chosen": -177.030029296875, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -124.5, + "logps_train/rejected": -172.6234130859375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.687475323677063, + "rewards_train/margins": 3.1036747694015503, + "rewards_train/rejected": -4.791150093078613, + "step": 118 + }, + { + "epoch": 1.36, + "learning_rate": 2.978207495866292e-05, + "loss": 0.146, + "step": 119 + }, + { + "epoch": 1.36, + "logps_train/chosen": -167.2484130859375, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -151.47787475585938, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.6930049657821655, + "rewards_train/margins": 3.221190094947815, + "rewards_train/rejected": -4.9141950607299805, + "step": 119 + }, + { + "epoch": 1.36, + "logps_train/chosen": -178.5880584716797, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -176.48397827148438, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.979020595550537, + "rewards_train/margins": 3.617034435272217, + "rewards_train/rejected": -5.596055030822754, + "step": 119 + }, + { + "epoch": 1.36, + "logps_train/chosen": -158.26107788085938, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -180.61029052734375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.294564127922058, + "rewards_train/margins": 3.3548184633255005, + "rewards_train/rejected": -4.649382591247559, + "step": 119 + }, + { + "epoch": 1.36, + "logps_train/chosen": -164.2959442138672, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -165.3199005126953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3953654766082764, + "rewards_train/margins": 2.9375030994415283, + "rewards_train/rejected": -4.332868576049805, + "step": 119 + }, + { + "epoch": 1.37, + "learning_rate": 2.947701224811113e-05, + "loss": 0.1248, + "step": 120 + }, + { + "epoch": 1.37, + "logps_train/chosen": -174.03245544433594, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -121.5, + "logps_train/rejected": -171.17623901367188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.3544423580169678, + "rewards_train/margins": 3.606821298599243, + "rewards_train/rejected": -4.961263656616211, + "step": 120 + }, + { + "epoch": 1.37, + "logps_train/chosen": -147.8043670654297, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -166.4681396484375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.2177417278289795, + "rewards_train/margins": 2.9022276401519775, + "rewards_train/rejected": -4.119969367980957, + "step": 120 + }, + { + "epoch": 1.37, + "logps_train/chosen": -178.89630126953125, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -127.0, + "logps_train/rejected": -170.27052307128906, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.1310200691223145, + "rewards_train/margins": 3.1832242012023926, + "rewards_train/rejected": -4.314244270324707, + "step": 120 + }, + { + "epoch": 1.37, + "logps_train/chosen": -165.63587951660156, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -106.5, + "logps_train/rejected": -154.56637573242188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.1438608169555664, + "rewards_train/margins": 2.665705680847168, + "rewards_train/rejected": -4.809566497802734, + "step": 120 + }, + { + "epoch": 1.38, + "learning_rate": 2.9171259231945598e-05, + "loss": 0.1559, + "step": 121 + }, + { + "epoch": 1.38, + "logps_train/chosen": -151.94607543945312, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -136.6217041015625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.5857216119766235, + "rewards_train/margins": 3.2760576009750366, + "rewards_train/rejected": -4.86177921295166, + "step": 121 + }, + { + "epoch": 1.38, + "logps_train/chosen": -155.14697265625, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -162.53421020507812, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.3468749523162842, + "rewards_train/margins": 3.6037633419036865, + "rewards_train/rejected": -4.950638294219971, + "step": 121 + }, + { + "epoch": 1.38, + "logps_train/chosen": -161.08724975585938, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -160.4564208984375, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.577230453491211, + "rewards_train/margins": 2.754911422729492, + "rewards_train/rejected": -4.332141876220703, + "step": 121 + }, + { + "epoch": 1.38, + "logps_train/chosen": -160.4774932861328, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -114.5, + "logps_train/rejected": -158.9612274169922, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.4526317119598389, + "rewards_train/margins": 2.9784510135650635, + "rewards_train/rejected": -4.431082725524902, + "step": 121 + }, + { + "epoch": 1.39, + "learning_rate": 2.8864863053888925e-05, + "loss": 0.1559, + "step": 122 + }, + { + "epoch": 1.39, + "logps_train/chosen": -166.92727661132812, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -182.114990234375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.6018102169036865, + "rewards_train/margins": 3.3425023555755615, + "rewards_train/rejected": -4.944312572479248, + "step": 122 + }, + { + "epoch": 1.39, + "logps_train/chosen": -161.64584350585938, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -146.01136779785156, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.470199465751648, + "rewards_train/margins": 2.7532025575637817, + "rewards_train/rejected": -4.22340202331543, + "step": 122 + }, + { + "epoch": 1.39, + "logps_train/chosen": -179.35714721679688, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -127.37224578857422, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.3069069385528564, + "rewards_train/margins": 2.7117631435394287, + "rewards_train/rejected": -4.018670082092285, + "step": 122 + }, + { + "epoch": 1.39, + "logps_train/chosen": -142.1234588623047, + "logps_train/ref_chosen": -123.5, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -134.3780059814453, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.8813402652740479, + "rewards_train/margins": 2.604262590408325, + "rewards_train/rejected": -4.485602855682373, + "step": 122 + }, + { + "epoch": 1.41, + "learning_rate": 2.8557870956832132e-05, + "loss": 0.1891, + "step": 123 + }, + { + "epoch": 1.41, + "logps_train/chosen": -215.79141235351562, + "logps_train/ref_chosen": -207.0, + "logps_train/ref_rejected": -158.0, + "logps_train/rejected": -206.87472534179688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9100003242492676, + "rewards_train/margins": 4.041144847869873, + "rewards_train/rejected": -4.951145172119141, + "step": 123 + }, + { + "epoch": 1.41, + "logps_train/chosen": -171.5395965576172, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -178.28213500976562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3091840744018555, + "rewards_train/margins": 3.5175161361694336, + "rewards_train/rejected": -4.826700210571289, + "step": 123 + }, + { + "epoch": 1.41, + "logps_train/chosen": -197.67599487304688, + "logps_train/ref_chosen": -191.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -182.3161163330078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6696504354476929, + "rewards_train/margins": 3.831687092781067, + "rewards_train/rejected": -4.50133752822876, + "step": 123 + }, + { + "epoch": 1.41, + "logps_train/chosen": -173.24679565429688, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -164.2289276123047, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.745944619178772, + "rewards_train/margins": 3.166388154029846, + "rewards_train/rejected": -3.912332773208618, + "step": 123 + }, + { + "epoch": 1.42, + "learning_rate": 2.8250330275550336e-05, + "loss": 0.0987, + "step": 124 + }, + { + "epoch": 1.42, + "logps_train/chosen": -172.51718139648438, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -127.5, + "logps_train/rejected": -177.36227416992188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7524986267089844, + "rewards_train/margins": 4.2184929847717285, + "rewards_train/rejected": -4.970991611480713, + "step": 124 + }, + { + "epoch": 1.42, + "logps_train/chosen": -141.9478302001953, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -145.900146484375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.2405827045440674, + "rewards_train/margins": 3.0773122310638428, + "rewards_train/rejected": -4.31789493560791, + "step": 124 + }, + { + "epoch": 1.42, + "logps_train/chosen": -180.90890502929688, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -169.93887329101562, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.6273165941238403, + "rewards_train/margins": 3.1660832166671753, + "rewards_train/rejected": -4.793399810791016, + "step": 124 + }, + { + "epoch": 1.42, + "logps_train/chosen": -171.01747131347656, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -157.03610229492188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.1220600605010986, + "rewards_train/margins": 3.2815511226654053, + "rewards_train/rejected": -4.403611183166504, + "step": 124 + }, + { + "epoch": 1.43, + "learning_rate": 2.7942288429404256e-05, + "loss": 0.1097, + "step": 125 + }, + { + "epoch": 1.43, + "logps_train/chosen": -175.7913818359375, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -169.33328247070312, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.0130254030227661, + "rewards_train/margins": 2.717079997062683, + "rewards_train/rejected": -3.730105400085449, + "step": 125 + }, + { + "epoch": 1.43, + "logps_train/chosen": -193.09005737304688, + "logps_train/ref_chosen": -186.0, + "logps_train/ref_rejected": -149.0, + "logps_train/rejected": -190.29751586914062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7705289125442505, + "rewards_train/margins": 3.368988871574402, + "rewards_train/rejected": -4.139517784118652, + "step": 125 + }, + { + "epoch": 1.43, + "logps_train/chosen": -175.15960693359375, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -162.30618286132812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0979918241500854, + "rewards_train/margins": 3.0310651063919067, + "rewards_train/rejected": -4.129056930541992, + "step": 125 + }, + { + "epoch": 1.43, + "logps_train/chosen": -192.91085815429688, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -156.0, + "logps_train/rejected": -200.0994415283203, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.1479214429855347, + "rewards_train/margins": 3.2763782739639282, + "rewards_train/rejected": -4.424299716949463, + "step": 125 + }, + { + "epoch": 1.44, + "learning_rate": 2.7633792915028677e-05, + "loss": 0.1349, + "step": 126 + }, + { + "epoch": 1.44, + "logps_train/chosen": -185.21578979492188, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -176.3563232421875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.304196298122406, + "rewards_train/margins": 3.450674593448639, + "rewards_train/rejected": -3.754870891571045, + "step": 126 + }, + { + "epoch": 1.44, + "logps_train/chosen": -171.53411865234375, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -186.81044006347656, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.3897385597229004, + "rewards_train/margins": 3.4538064002990723, + "rewards_train/rejected": -4.843544960021973, + "step": 126 + }, + { + "epoch": 1.44, + "logps_train/chosen": -157.05662536621094, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -139.83226013183594, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.347580909729004, + "rewards_train/margins": 2.9695076942443848, + "rewards_train/rejected": -4.317088603973389, + "step": 126 + }, + { + "epoch": 1.44, + "logps_train/chosen": -155.17971801757812, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -108.0, + "logps_train/rejected": -143.3584747314453, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2133831977844238, + "rewards_train/margins": 2.3373942375183105, + "rewards_train/rejected": -3.5507774353027344, + "step": 126 + }, + { + "epoch": 1.45, + "learning_rate": 2.7324891299008985e-05, + "loss": 0.2015, + "step": 127 + }, + { + "epoch": 1.45, + "logps_train/chosen": -153.86216735839844, + "logps_train/ref_chosen": -139.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -162.63079833984375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.480747938156128, + "rewards_train/margins": 2.8033287525177, + "rewards_train/rejected": -4.284076690673828, + "step": 127 + }, + { + "epoch": 1.45, + "logps_train/chosen": -186.6682586669922, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -148.0, + "logps_train/rejected": -192.19993591308594, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.0969038009643555, + "rewards_train/margins": 3.314544677734375, + "rewards_train/rejected": -4.4114484786987305, + "step": 127 + }, + { + "epoch": 1.45, + "logps_train/chosen": -153.8680419921875, + "logps_train/ref_chosen": -134.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -171.44602966308594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9669804573059082, + "rewards_train/margins": 2.993198871612549, + "rewards_train/rejected": -4.960179328918457, + "step": 127 + }, + { + "epoch": 1.45, + "logps_train/chosen": -176.20404052734375, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -169.44656372070312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6339795589447021, + "rewards_train/margins": 3.147592306137085, + "rewards_train/rejected": -3.781571865081787, + "step": 127 + }, + { + "epoch": 1.46, + "learning_rate": 2.701563121054695e-05, + "loss": 0.1361, + "step": 128 + }, + { + "epoch": 1.46, + "logps_train/chosen": -166.00640869140625, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -178.55386352539062, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.8213452100753784, + "rewards_train/margins": 3.634822964668274, + "rewards_train/rejected": -4.456168174743652, + "step": 128 + }, + { + "epoch": 1.46, + "logps_train/chosen": -162.34912109375, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -110.0, + "logps_train/rejected": -152.22750854492188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.0764155387878418, + "rewards_train/margins": 3.124901294708252, + "rewards_train/rejected": -4.201316833496094, + "step": 128 + }, + { + "epoch": 1.46, + "logps_train/chosen": -156.54486083984375, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -149.5716094970703, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.1559518575668335, + "rewards_train/margins": 3.318982720375061, + "rewards_train/rejected": -4.4749345779418945, + "step": 128 + }, + { + "epoch": 1.46, + "logps_train/chosen": -190.66033935546875, + "logps_train/ref_chosen": -184.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -178.93310546875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.6988459825515747, + "rewards_train/margins": 3.3385089635849, + "rewards_train/rejected": -4.037354946136475, + "step": 128 + }, + { + "epoch": 1.47, + "learning_rate": 2.6706060334116777e-05, + "loss": 0.1665, + "step": 129 + }, + { + "epoch": 1.47, + "logps_train/chosen": -153.0819854736328, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -100.0, + "logps_train/rejected": -140.51028442382812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.736420750617981, + "rewards_train/margins": 3.3056238889694214, + "rewards_train/rejected": -4.042044639587402, + "step": 129 + }, + { + "epoch": 1.47, + "logps_train/chosen": -179.62692260742188, + "logps_train/ref_chosen": -167.0, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -159.4015655517578, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.3000822067260742, + "rewards_train/margins": 3.679259777069092, + "rewards_train/rejected": -4.979341983795166, + "step": 129 + }, + { + "epoch": 1.47, + "logps_train/chosen": -162.6141815185547, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -156.1014862060547, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.0475513935089111, + "rewards_train/margins": 3.2958009243011475, + "rewards_train/rejected": -4.343352317810059, + "step": 129 + }, + { + "epoch": 1.47, + "logps_train/chosen": -174.86026000976562, + "logps_train/ref_chosen": -161.0, + "logps_train/ref_rejected": -135.0, + "logps_train/rejected": -182.4564971923828, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.3674092292785645, + "rewards_train/margins": 3.3553266525268555, + "rewards_train/rejected": -4.72273588180542, + "step": 129 + }, + { + "epoch": 1.49, + "learning_rate": 2.639622640211277e-05, + "loss": 0.1287, + "step": 130 + }, + { + "epoch": 1.49, + "logps_train/chosen": -142.3889923095703, + "logps_train/ref_chosen": -129.0, + "logps_train/ref_rejected": -110.5, + "logps_train/rejected": -153.8872528076172, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.3541336059570312, + "rewards_train/margins": 3.004904270172119, + "rewards_train/rejected": -4.35903787612915, + "step": 130 + }, + { + "epoch": 1.49, + "logps_train/chosen": -168.5977020263672, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -160.97059631347656, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.1273491382598877, + "rewards_train/margins": 3.112093210220337, + "rewards_train/rejected": -4.239442348480225, + "step": 130 + }, + { + "epoch": 1.49, + "logps_train/chosen": -186.99581909179688, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -178.27426147460938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.5144252777099609, + "rewards_train/margins": 3.7290172576904297, + "rewards_train/rejected": -4.243442535400391, + "step": 130 + }, + { + "epoch": 1.49, + "logps_train/chosen": -163.23785400390625, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -131.27413940429688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0287654399871826, + "rewards_train/margins": 3.533609628677368, + "rewards_train/rejected": -4.562375068664551, + "step": 130 + }, + { + "epoch": 1.5, + "learning_rate": 2.6086177187489453e-05, + "loss": 0.1228, + "step": 131 + }, + { + "epoch": 1.5, + "logps_train/chosen": -147.3597412109375, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -145.3800048828125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.1206421852111816, + "rewards_train/margins": 3.2051515579223633, + "rewards_train/rejected": -4.325793743133545, + "step": 131 + }, + { + "epoch": 1.5, + "logps_train/chosen": -165.80751037597656, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -178.4746551513672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1619529724121094, + "rewards_train/margins": 3.133852005004883, + "rewards_train/rejected": -4.295804977416992, + "step": 131 + }, + { + "epoch": 1.5, + "logps_train/chosen": -169.84341430664062, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -118.0, + "logps_train/rejected": -160.15591430664062, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.9361000061035156, + "rewards_train/margins": 3.2722649574279785, + "rewards_train/rejected": -4.208364963531494, + "step": 131 + }, + { + "epoch": 1.5, + "logps_train/chosen": -163.28036499023438, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -108.0, + "logps_train/rejected": -147.83535766601562, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.7479585409164429, + "rewards_train/margins": 3.2120429277420044, + "rewards_train/rejected": -3.9600014686584473, + "step": 131 + }, + { + "epoch": 1.51, + "learning_rate": 2.5775960496395564e-05, + "loss": 0.1649, + "step": 132 + }, + { + "epoch": 1.51, + "logps_train/chosen": -149.8038330078125, + "logps_train/ref_chosen": -135.0, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -147.8526611328125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.4508914947509766, + "rewards_train/margins": 3.12939453125, + "rewards_train/rejected": -4.580286026000977, + "step": 132 + }, + { + "epoch": 1.51, + "logps_train/chosen": -158.60960388183594, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -155.93435668945312, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.1415255069732666, + "rewards_train/margins": 2.831206798553467, + "rewards_train/rejected": -3.9727323055267334, + "step": 132 + }, + { + "epoch": 1.51, + "logps_train/chosen": -177.83819580078125, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -161.01573181152344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7680490612983704, + "rewards_train/margins": 3.2229778170585632, + "rewards_train/rejected": -3.9910268783569336, + "step": 132 + }, + { + "epoch": 1.51, + "logps_train/chosen": -150.2593231201172, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -115.0, + "logps_train/rejected": -158.00173950195312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7834035754203796, + "rewards_train/margins": 3.515306293964386, + "rewards_train/rejected": -4.298709869384766, + "step": 132 + }, + { + "epoch": 1.52, + "learning_rate": 2.5465624160802847e-05, + "loss": 0.1705, + "step": 133 + }, + { + "epoch": 1.52, + "logps_train/chosen": -163.82518005371094, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -124.0, + "logps_train/rejected": -160.40072631835938, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.2626445293426514, + "rewards_train/margins": 2.3889267444610596, + "rewards_train/rejected": -3.651571273803711, + "step": 133 + }, + { + "epoch": 1.52, + "logps_train/chosen": -138.35488891601562, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -150.45436096191406, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -0.8314854502677917, + "rewards_train/margins": 3.542002499103546, + "rewards_train/rejected": -4.373487949371338, + "step": 133 + }, + { + "epoch": 1.52, + "logps_train/chosen": -164.03900146484375, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -152.27687072753906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3316338062286377, + "rewards_train/margins": 3.427401304244995, + "rewards_train/rejected": -4.759035110473633, + "step": 133 + }, + { + "epoch": 1.52, + "logps_train/chosen": -193.63211059570312, + "logps_train/ref_chosen": -184.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -177.69418334960938, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.9139914512634277, + "rewards_train/margins": 3.4447827339172363, + "rewards_train/rejected": -4.358774185180664, + "step": 133 + }, + { + "epoch": 1.53, + "learning_rate": 2.515521603113088e-05, + "loss": 0.187, + "step": 134 + }, + { + "epoch": 1.53, + "logps_train/chosen": -175.33932495117188, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -165.0, + "logps_train/rejected": -213.67547607421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5542943477630615, + "rewards_train/margins": 4.358224153518677, + "rewards_train/rejected": -4.912518501281738, + "step": 134 + }, + { + "epoch": 1.53, + "logps_train/chosen": -158.3114013671875, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -173.7991943359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7678590416908264, + "rewards_train/margins": 4.60151332616806, + "rewards_train/rejected": -5.369372367858887, + "step": 134 + }, + { + "epoch": 1.53, + "logps_train/chosen": -131.25491333007812, + "logps_train/ref_chosen": -118.0, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -131.59129333496094, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.3272488117218018, + "rewards_train/margins": 2.7177212238311768, + "rewards_train/rejected": -4.0449700355529785, + "step": 134 + }, + { + "epoch": 1.53, + "logps_train/chosen": -139.65713500976562, + "logps_train/ref_chosen": -128.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -143.0663299560547, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.1557526588439941, + "rewards_train/margins": 3.5598645210266113, + "rewards_train/rejected": -4.7156171798706055, + "step": 134 + }, + { + "epoch": 1.54, + "learning_rate": 2.4844783968869126e-05, + "loss": 0.1299, + "step": 135 + }, + { + "epoch": 1.54, + "logps_train/chosen": -155.28524780273438, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -126.5, + "logps_train/rejected": -163.39198303222656, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.8066498041152954, + "rewards_train/margins": 2.9134079217910767, + "rewards_train/rejected": -3.720057725906372, + "step": 135 + }, + { + "epoch": 1.54, + "logps_train/chosen": -178.38946533203125, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -149.0, + "logps_train/rejected": -192.02011108398438, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.0768359899520874, + "rewards_train/margins": 3.238333821296692, + "rewards_train/rejected": -4.315169811248779, + "step": 135 + }, + { + "epoch": 1.54, + "logps_train/chosen": -160.60389709472656, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -121.5, + "logps_train/rejected": -163.22579956054688, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.4700582027435303, + "rewards_train/margins": 2.681037187576294, + "rewards_train/rejected": -4.151095390319824, + "step": 135 + }, + { + "epoch": 1.54, + "logps_train/chosen": -191.0967559814453, + "logps_train/ref_chosen": -185.0, + "logps_train/ref_rejected": -146.0, + "logps_train/rejected": -190.99061584472656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6215896606445312, + "rewards_train/margins": 3.8812813758850098, + "rewards_train/rejected": -4.502871036529541, + "step": 135 + }, + { + "epoch": 1.55, + "learning_rate": 2.4534375839197166e-05, + "loss": 0.1323, + "step": 136 + }, + { + "epoch": 1.55, + "logps_train/chosen": -216.13201904296875, + "logps_train/ref_chosen": -210.0, + "logps_train/ref_rejected": -119.5, + "logps_train/rejected": -161.54336547851562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6491394639015198, + "rewards_train/margins": 3.5523642897605896, + "rewards_train/rejected": -4.201503753662109, + "step": 136 + }, + { + "epoch": 1.55, + "logps_train/chosen": -186.5906219482422, + "logps_train/ref_chosen": -179.0, + "logps_train/ref_rejected": -122.5, + "logps_train/rejected": -165.56106567382812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7216589450836182, + "rewards_train/margins": 3.585913896560669, + "rewards_train/rejected": -4.307572841644287, + "step": 136 + }, + { + "epoch": 1.55, + "logps_train/chosen": -157.28329467773438, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -144.79754638671875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.2453217506408691, + "rewards_train/margins": 3.0565037727355957, + "rewards_train/rejected": -4.301825523376465, + "step": 136 + }, + { + "epoch": 1.55, + "logps_train/chosen": -146.12713623046875, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -144.28741455078125, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.4028985500335693, + "rewards_train/margins": 2.723278284072876, + "rewards_train/rejected": -4.126176834106445, + "step": 136 + }, + { + "epoch": 1.57, + "learning_rate": 2.4224039503604435e-05, + "loss": 0.1395, + "step": 137 + }, + { + "epoch": 1.57, + "logps_train/chosen": -193.05108642578125, + "logps_train/ref_chosen": -186.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -180.34556579589844, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.7160476446151733, + "rewards_train/margins": 3.606399178504944, + "rewards_train/rejected": -4.322446823120117, + "step": 137 + }, + { + "epoch": 1.57, + "logps_train/chosen": -177.28684997558594, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -169.56716918945312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8280016779899597, + "rewards_train/margins": 3.3246129155158997, + "rewards_train/rejected": -4.152614593505859, + "step": 137 + }, + { + "epoch": 1.57, + "logps_train/chosen": -181.3519744873047, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -164.0, + "logps_train/rejected": -209.20787048339844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.585979163646698, + "rewards_train/margins": 3.8938897252082825, + "rewards_train/rejected": -4.4798688888549805, + "step": 137 + }, + { + "epoch": 1.57, + "logps_train/chosen": -159.36561584472656, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -122.5, + "logps_train/rejected": -163.44479370117188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.33968687057495117, + "rewards_train/margins": 3.739558219909668, + "rewards_train/rejected": -4.079245090484619, + "step": 137 + }, + { + "epoch": 1.58, + "learning_rate": 2.391382281251055e-05, + "loss": 0.1132, + "step": 138 + }, + { + "epoch": 1.58, + "logps_train/chosen": -160.30413818359375, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -160.53933715820312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.747503936290741, + "rewards_train/margins": 3.6289380192756653, + "rewards_train/rejected": -4.376441955566406, + "step": 138 + }, + { + "epoch": 1.58, + "logps_train/chosen": -193.16165161132812, + "logps_train/ref_chosen": -189.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -167.0458984375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.4382353723049164, + "rewards_train/margins": 3.1267053186893463, + "rewards_train/rejected": -3.5649406909942627, + "step": 138 + }, + { + "epoch": 1.58, + "logps_train/chosen": -177.08596801757812, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -176.0380401611328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2597697973251343, + "rewards_train/margins": 3.0808502435684204, + "rewards_train/rejected": -4.340620040893555, + "step": 138 + }, + { + "epoch": 1.58, + "logps_train/chosen": -159.55792236328125, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -145.29629516601562, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.9486621618270874, + "rewards_train/margins": 3.1882187128067017, + "rewards_train/rejected": -4.136880874633789, + "step": 138 + }, + { + "epoch": 1.59, + "learning_rate": 2.3603773597887237e-05, + "loss": 0.1249, + "step": 139 + }, + { + "epoch": 1.59, + "logps_train/chosen": -165.89608764648438, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -172.01133728027344, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.7296485900878906, + "rewards_train/margins": 3.3035411834716797, + "rewards_train/rejected": -4.03318977355957, + "step": 139 + }, + { + "epoch": 1.59, + "logps_train/chosen": -144.5615234375, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -117.5, + "logps_train/rejected": -163.75762939453125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.721398651599884, + "rewards_train/margins": 3.903387725353241, + "rewards_train/rejected": -4.624786376953125, + "step": 139 + }, + { + "epoch": 1.59, + "logps_train/chosen": -169.52536010742188, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -170.62039184570312, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.9957976341247559, + "rewards_train/margins": 2.806670904159546, + "rewards_train/rejected": -3.8024685382843018, + "step": 139 + }, + { + "epoch": 1.59, + "logps_train/chosen": -159.63851928710938, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -154.8424072265625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.4302584230899811, + "rewards_train/margins": 3.5069116055965424, + "rewards_train/rejected": -3.9371700286865234, + "step": 139 + }, + { + "epoch": 1.6, + "learning_rate": 2.329393966588323e-05, + "loss": 0.1378, + "step": 140 + }, + { + "epoch": 1.6, + "logps_train/chosen": -174.330078125, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -172.95440673828125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9353509545326233, + "rewards_train/margins": 3.4886531233787537, + "rewards_train/rejected": -4.424004077911377, + "step": 140 + }, + { + "epoch": 1.6, + "logps_train/chosen": -166.16903686523438, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -187.2434844970703, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.7706148624420166, + "rewards_train/margins": 3.949387788772583, + "rewards_train/rejected": -4.7200026512146, + "step": 140 + }, + { + "epoch": 1.6, + "logps_train/chosen": -163.92617797851562, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -148.0, + "logps_train/rejected": -188.00546264648438, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.8798237442970276, + "rewards_train/margins": 3.081660211086273, + "rewards_train/rejected": -3.961483955383301, + "step": 140 + }, + { + "epoch": 1.6, + "logps_train/chosen": -154.75360107421875, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -161.3019561767578, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.1850275993347168, + "rewards_train/margins": 3.391695022583008, + "rewards_train/rejected": -4.576722621917725, + "step": 140 + }, + { + "epoch": 1.61, + "learning_rate": 2.298436878945306e-05, + "loss": 0.1181, + "step": 141 + }, + { + "epoch": 1.61, + "logps_train/chosen": -142.72462463378906, + "logps_train/ref_chosen": -133.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -179.27532958984375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.9942886829376221, + "rewards_train/margins": 3.0164482593536377, + "rewards_train/rejected": -4.01073694229126, + "step": 141 + }, + { + "epoch": 1.61, + "logps_train/chosen": -190.180419921875, + "logps_train/ref_chosen": -181.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -180.17337036132812, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.9520259499549866, + "rewards_train/margins": 3.4328898787498474, + "rewards_train/rejected": -4.384915828704834, + "step": 141 + }, + { + "epoch": 1.61, + "logps_train/chosen": -134.74435424804688, + "logps_train/ref_chosen": -118.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -111.79988098144531, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.6727750301361084, + "rewards_train/margins": 2.240952968597412, + "rewards_train/rejected": -3.9137279987335205, + "step": 141 + }, + { + "epoch": 1.61, + "logps_train/chosen": -176.04776000976562, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -164.4101104736328, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.1286051273345947, + "rewards_train/margins": 3.44355845451355, + "rewards_train/rejected": -4.5721635818481445, + "step": 141 + }, + { + "epoch": 1.62, + "learning_rate": 2.267510870099101e-05, + "loss": 0.1654, + "step": 142 + }, + { + "epoch": 1.62, + "logps_train/chosen": -188.28570556640625, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -180.6005859375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.1609909534454346, + "rewards_train/margins": 3.7696726322174072, + "rewards_train/rejected": -4.930663585662842, + "step": 142 + }, + { + "epoch": 1.62, + "logps_train/chosen": -157.38246154785156, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -157.08433532714844, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.2556779384613037, + "rewards_train/margins": 2.78234601020813, + "rewards_train/rejected": -4.038023948669434, + "step": 142 + }, + { + "epoch": 1.62, + "logps_train/chosen": -139.43142700195312, + "logps_train/ref_chosen": -127.5, + "logps_train/ref_rejected": -100.0, + "logps_train/rejected": -141.7026824951172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.159501314163208, + "rewards_train/margins": 3.0188729763031006, + "rewards_train/rejected": -4.178374290466309, + "step": 142 + }, + { + "epoch": 1.62, + "logps_train/chosen": -125.58201599121094, + "logps_train/ref_chosen": -104.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -139.86517333984375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.145481824874878, + "rewards_train/margins": 3.053291082382202, + "rewards_train/rejected": -5.19877290725708, + "step": 142 + }, + { + "epoch": 1.63, + "learning_rate": 2.2366207084971325e-05, + "loss": 0.1394, + "step": 143 + }, + { + "epoch": 1.63, + "logps_train/chosen": -119.32640838623047, + "logps_train/ref_chosen": -104.0, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -134.8964385986328, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5528064966201782, + "rewards_train/margins": 2.5242398977279663, + "rewards_train/rejected": -4.0770463943481445, + "step": 143 + }, + { + "epoch": 1.63, + "logps_train/chosen": -166.6683349609375, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -158.14041137695312, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.3560905456542969, + "rewards_train/margins": 2.8011627197265625, + "rewards_train/rejected": -4.157253265380859, + "step": 143 + }, + { + "epoch": 1.63, + "logps_train/chosen": -153.33035278320312, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -163.28121948242188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.7734657526016235, + "rewards_train/margins": 3.996648669242859, + "rewards_train/rejected": -4.770114421844482, + "step": 143 + }, + { + "epoch": 1.63, + "logps_train/chosen": -171.47213745117188, + "logps_train/ref_chosen": -161.0, + "logps_train/ref_rejected": -135.0, + "logps_train/rejected": -174.22564697265625, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.0140111446380615, + "rewards_train/margins": 2.929255723953247, + "rewards_train/rejected": -3.9432668685913086, + "step": 143 + }, + { + "epoch": 1.65, + "learning_rate": 2.2057711570595746e-05, + "loss": 0.2175, + "step": 144 + }, + { + "epoch": 1.65, + "logps_train/chosen": -173.53387451171875, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -161.1766357421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8866889476776123, + "rewards_train/margins": 3.222722291946411, + "rewards_train/rejected": -4.109411239624023, + "step": 144 + }, + { + "epoch": 1.65, + "logps_train/chosen": -141.76498413085938, + "logps_train/ref_chosen": -128.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -159.65899658203125, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.348568320274353, + "rewards_train/margins": 3.194480061531067, + "rewards_train/rejected": -4.54304838180542, + "step": 144 + }, + { + "epoch": 1.65, + "logps_train/chosen": -197.79476928710938, + "logps_train/ref_chosen": -192.0, + "logps_train/ref_rejected": -166.0, + "logps_train/rejected": -209.7325439453125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5577000379562378, + "rewards_train/margins": 3.7895785570144653, + "rewards_train/rejected": -4.347278594970703, + "step": 144 + }, + { + "epoch": 1.65, + "logps_train/chosen": -130.68243408203125, + "logps_train/ref_chosen": -122.0, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -138.5338897705078, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.8820131421089172, + "rewards_train/margins": 3.5245015025138855, + "rewards_train/rejected": -4.406514644622803, + "step": 144 + }, + { + "epoch": 1.66, + "learning_rate": 2.174966972444967e-05, + "loss": 0.1446, + "step": 145 + }, + { + "epoch": 1.66, + "logps_train/chosen": -163.034912109375, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -165.3939666748047, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.1952872276306152, + "rewards_train/margins": 3.1185240745544434, + "rewards_train/rejected": -4.313811302185059, + "step": 145 + }, + { + "epoch": 1.66, + "logps_train/chosen": -182.8961181640625, + "logps_train/ref_chosen": -177.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -172.29110717773438, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.5673462152481079, + "rewards_train/margins": 3.453562617301941, + "rewards_train/rejected": -4.020908832550049, + "step": 145 + }, + { + "epoch": 1.66, + "logps_train/chosen": -150.4124298095703, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -159.89599609375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.4033517837524414, + "rewards_train/margins": 3.661468505859375, + "rewards_train/rejected": -5.064820289611816, + "step": 145 + }, + { + "epoch": 1.66, + "logps_train/chosen": -184.98175048828125, + "logps_train/ref_chosen": -179.0, + "logps_train/ref_rejected": -119.5, + "logps_train/rejected": -164.98641967773438, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.6009587049484253, + "rewards_train/margins": 3.9541760683059692, + "rewards_train/rejected": -4.5551347732543945, + "step": 145 + }, + { + "epoch": 1.67, + "learning_rate": 2.1442129043167874e-05, + "loss": 0.1344, + "step": 146 + }, + { + "epoch": 1.67, + "logps_train/chosen": -164.82098388671875, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -179.12875366210938, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.3958685398101807, + "rewards_train/margins": 3.42560076713562, + "rewards_train/rejected": -4.821469306945801, + "step": 146 + }, + { + "epoch": 1.67, + "logps_train/chosen": -143.52203369140625, + "logps_train/ref_chosen": -128.0, + "logps_train/ref_rejected": -122.5, + "logps_train/rejected": -173.89883422851562, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.5310614109039307, + "rewards_train/margins": 3.589242696762085, + "rewards_train/rejected": -5.120304107666016, + "step": 146 + }, + { + "epoch": 1.67, + "logps_train/chosen": -171.19552612304688, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -106.5, + "logps_train/rejected": -150.95005798339844, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.160958170890808, + "rewards_train/margins": 3.271157383918762, + "rewards_train/rejected": -4.43211555480957, + "step": 146 + }, + { + "epoch": 1.67, + "logps_train/chosen": -185.29385375976562, + "logps_train/ref_chosen": -174.0, + "logps_train/ref_rejected": -127.5, + "logps_train/rejected": -177.15206909179688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1695337295532227, + "rewards_train/margins": 3.7931838035583496, + "rewards_train/rejected": -4.962717533111572, + "step": 146 + }, + { + "epoch": 1.68, + "learning_rate": 2.1135136946111078e-05, + "loss": 0.1606, + "step": 147 + }, + { + "epoch": 1.68, + "logps_train/chosen": -178.3898162841797, + "logps_train/ref_chosen": -167.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -172.91342163085938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.1439135074615479, + "rewards_train/margins": 3.5383970737457275, + "rewards_train/rejected": -4.682310581207275, + "step": 147 + }, + { + "epoch": 1.68, + "logps_train/chosen": -175.71603393554688, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -164.0, + "logps_train/rejected": -209.33184814453125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5641824007034302, + "rewards_train/margins": 4.025251746177673, + "rewards_train/rejected": -4.5894341468811035, + "step": 147 + }, + { + "epoch": 1.68, + "logps_train/chosen": -186.2715301513672, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -172.0, + "logps_train/rejected": -216.0167999267578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4318392276763916, + "rewards_train/margins": 4.031365156173706, + "rewards_train/rejected": -4.463204383850098, + "step": 147 + }, + { + "epoch": 1.68, + "logps_train/chosen": -159.61163330078125, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -172.23619079589844, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.1693428754806519, + "rewards_train/margins": 3.491873860359192, + "rewards_train/rejected": -4.661216735839844, + "step": 147 + }, + { + "epoch": 1.69, + "learning_rate": 2.0828740768054405e-05, + "loss": 0.094, + "step": 148 + }, + { + "epoch": 1.69, + "logps_train/chosen": -146.35887145996094, + "logps_train/ref_chosen": -131.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -136.61520385742188, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": -1.5266588926315308, + "rewards_train/margins": 2.8000484704971313, + "rewards_train/rejected": -4.326707363128662, + "step": 148 + }, + { + "epoch": 1.69, + "logps_train/chosen": -187.1864013671875, + "logps_train/ref_chosen": -174.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -154.72918701171875, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.3573615550994873, + "rewards_train/margins": 2.9866015911102295, + "rewards_train/rejected": -4.343963146209717, + "step": 148 + }, + { + "epoch": 1.69, + "logps_train/chosen": -170.3841552734375, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -142.0, + "logps_train/rejected": -188.68914794921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0149770975112915, + "rewards_train/margins": 3.598516821861267, + "rewards_train/rejected": -4.613493919372559, + "step": 148 + }, + { + "epoch": 1.69, + "logps_train/chosen": -184.0547332763672, + "logps_train/ref_chosen": -180.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -186.50656127929688, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.43496501445770264, + "rewards_train/margins": 4.409247994422913, + "rewards_train/rejected": -4.844213008880615, + "step": 148 + }, + { + "epoch": 1.7, + "learning_rate": 2.0522987751888878e-05, + "loss": 0.1432, + "step": 149 + }, + { + "epoch": 1.7, + "logps_train/chosen": -172.67449951171875, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -175.04855346679688, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.4930354356765747, + "rewards_train/margins": 2.8857582807540894, + "rewards_train/rejected": -4.378793716430664, + "step": 149 + }, + { + "epoch": 1.7, + "logps_train/chosen": -178.00057983398438, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -119.0, + "logps_train/rejected": -162.48727416992188, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.4646331071853638, + "rewards_train/margins": 2.847375750541687, + "rewards_train/rejected": -4.312008857727051, + "step": 149 + }, + { + "epoch": 1.7, + "logps_train/chosen": -177.27328491210938, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -127.5, + "logps_train/rejected": -173.37014770507812, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.8803309798240662, + "rewards_train/margins": 3.7075143456459045, + "rewards_train/rejected": -4.587845325469971, + "step": 149 + }, + { + "epoch": 1.7, + "logps_train/chosen": -178.50416564941406, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -192.19558715820312, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.926978588104248, + "rewards_train/margins": 3.855861186981201, + "rewards_train/rejected": -4.782839775085449, + "step": 149 + }, + { + "epoch": 1.71, + "learning_rate": 2.0217925041337088e-05, + "loss": 0.1603, + "step": 150 + }, + { + "epoch": 1.71, + "logps_train/chosen": -182.9842071533203, + "logps_train/ref_chosen": -171.0, + "logps_train/ref_rejected": -110.0, + "logps_train/rejected": -156.3119354248047, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.1948070526123047, + "rewards_train/margins": 3.445127010345459, + "rewards_train/rejected": -4.639934062957764, + "step": 150 + }, + { + "epoch": 1.71, + "logps_train/chosen": -196.98382568359375, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -182.46234130859375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.4740171432495117, + "rewards_train/margins": 3.0879406929016113, + "rewards_train/rejected": -4.561957836151123, + "step": 150 + }, + { + "epoch": 1.71, + "logps_train/chosen": -188.139892578125, + "logps_train/ref_chosen": -175.0, + "logps_train/ref_rejected": -142.0, + "logps_train/rejected": -190.43099975585938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2782472372055054, + "rewards_train/margins": 3.6082111597061157, + "rewards_train/rejected": -4.886458396911621, + "step": 150 + }, + { + "epoch": 1.71, + "logps_train/chosen": -205.94732666015625, + "logps_train/ref_chosen": -193.0, + "logps_train/ref_rejected": -152.0, + "logps_train/rejected": -199.26638793945312, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.2850637435913086, + "rewards_train/margins": 3.3791723251342773, + "rewards_train/rejected": -4.664236068725586, + "step": 150 + }, + { + "epoch": 1.73, + "learning_rate": 1.991359967368416e-05, + "loss": 0.1191, + "step": 151 + }, + { + "epoch": 1.73, + "logps_train/chosen": -164.7177276611328, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -163.2447509765625, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.7888622283935547, + "rewards_train/margins": 2.895867347717285, + "rewards_train/rejected": -4.68472957611084, + "step": 151 + }, + { + "epoch": 1.73, + "logps_train/chosen": -192.6652374267578, + "logps_train/ref_chosen": -184.0, + "logps_train/ref_rejected": -164.0, + "logps_train/rejected": -208.13421630859375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.8331253528594971, + "rewards_train/margins": 3.594896078109741, + "rewards_train/rejected": -4.428021430969238, + "step": 151 + }, + { + "epoch": 1.73, + "logps_train/chosen": -179.51992797851562, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -121.5, + "logps_train/rejected": -167.351318359375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.3398842811584473, + "rewards_train/margins": 3.2446608543395996, + "rewards_train/rejected": -4.584545135498047, + "step": 151 + }, + { + "epoch": 1.73, + "logps_train/chosen": -142.92648315429688, + "logps_train/ref_chosen": -126.5, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -151.46743774414062, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.6470386981964111, + "rewards_train/margins": 3.038767099380493, + "rewards_train/rejected": -4.685805797576904, + "step": 151 + }, + { + "epoch": 1.74, + "learning_rate": 1.9610058572525126e-05, + "loss": 0.156, + "step": 152 + }, + { + "epoch": 1.74, + "logps_train/chosen": -125.03524017333984, + "logps_train/ref_chosen": -104.5, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -143.35089111328125, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -2.051748275756836, + "rewards_train/margins": 2.4157614707946777, + "rewards_train/rejected": -4.467509746551514, + "step": 152 + }, + { + "epoch": 1.74, + "logps_train/chosen": -163.41842651367188, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -166.00680541992188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.5096149444580078, + "rewards_train/margins": 3.4148945808410645, + "rewards_train/rejected": -4.924509525299072, + "step": 152 + }, + { + "epoch": 1.74, + "logps_train/chosen": -184.01190185546875, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -182.28060913085938, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.1560717821121216, + "rewards_train/margins": 3.208804965019226, + "rewards_train/rejected": -4.364876747131348, + "step": 152 + }, + { + "epoch": 1.74, + "logps_train/chosen": -134.97909545898438, + "logps_train/ref_chosen": -118.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -154.97637939453125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.7018145322799683, + "rewards_train/margins": 2.6661349534988403, + "rewards_train/rejected": -4.367949485778809, + "step": 152 + }, + { + "epoch": 1.75, + "learning_rate": 1.9307348540529842e-05, + "loss": 0.199, + "step": 153 + }, + { + "epoch": 1.75, + "logps_train/chosen": -184.7167205810547, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -135.0, + "logps_train/rejected": -184.70632934570312, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.5790934562683105, + "rewards_train/margins": 3.391930103302002, + "rewards_train/rejected": -4.9710235595703125, + "step": 153 + }, + { + "epoch": 1.75, + "logps_train/chosen": -170.3045654296875, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -183.49859619140625, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.2700073719024658, + "rewards_train/margins": 3.2787787914276123, + "rewards_train/rejected": -4.548786163330078, + "step": 153 + }, + { + "epoch": 1.75, + "logps_train/chosen": -192.09291076660156, + "logps_train/ref_chosen": -183.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -182.16331481933594, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -0.9595848321914673, + "rewards_train/margins": 3.4182342290878296, + "rewards_train/rejected": -4.377819061279297, + "step": 153 + }, + { + "epoch": 1.75, + "logps_train/chosen": -185.59799194335938, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -156.0, + "logps_train/rejected": -207.5470733642578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3219197988510132, + "rewards_train/margins": 3.890599846839905, + "rewards_train/rejected": -5.212519645690918, + "step": 153 + }, + { + "epoch": 1.76, + "learning_rate": 1.90055162522265e-05, + "loss": 0.1899, + "step": 154 + }, + { + "epoch": 1.76, + "logps_train/chosen": -160.90257263183594, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -174.55712890625, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.113547921180725, + "rewards_train/margins": 3.4354270696640015, + "rewards_train/rejected": -4.548974990844727, + "step": 154 + }, + { + "epoch": 1.76, + "logps_train/chosen": -168.6278076171875, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -185.01776123046875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.2065556049346924, + "rewards_train/margins": 3.4405343532562256, + "rewards_train/rejected": -4.647089958190918, + "step": 154 + }, + { + "epoch": 1.76, + "logps_train/chosen": -179.93496704101562, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -182.0, + "logps_train/rejected": -236.11505126953125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.095841407775879, + "rewards_train/margins": 4.302578449249268, + "rewards_train/rejected": -5.3984198570251465, + "step": 154 + }, + { + "epoch": 1.76, + "logps_train/chosen": -141.23049926757812, + "logps_train/ref_chosen": -124.0, + "logps_train/ref_rejected": -119.0, + "logps_train/rejected": -173.46426391601562, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.7132835388183594, + "rewards_train/margins": 3.7317748069763184, + "rewards_train/rejected": -5.445058345794678, + "step": 154 + }, + { + "epoch": 1.77, + "learning_rate": 1.8704608246804956e-05, + "loss": 0.1316, + "step": 155 + }, + { + "epoch": 1.77, + "logps_train/chosen": -169.93484497070312, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -159.2445526123047, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.6998562812805176, + "rewards_train/margins": 3.3152976036071777, + "rewards_train/rejected": -5.015153884887695, + "step": 155 + }, + { + "epoch": 1.77, + "logps_train/chosen": -170.24639892578125, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -179.92730712890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5101873874664307, + "rewards_train/margins": 3.3885018825531006, + "rewards_train/rejected": -4.898689270019531, + "step": 155 + }, + { + "epoch": 1.77, + "logps_train/chosen": -172.0523681640625, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -177.50497436523438, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.215980052947998, + "rewards_train/margins": 3.4622879028320312, + "rewards_train/rejected": -4.678267955780029, + "step": 155 + }, + { + "epoch": 1.77, + "logps_train/chosen": -167.3325958251953, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -181.28607177734375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4187091588974, + "rewards_train/margins": 3.3278671503067017, + "rewards_train/rejected": -4.746576309204102, + "step": 155 + }, + { + "epoch": 1.78, + "learning_rate": 1.840467092094091e-05, + "loss": 0.1536, + "step": 156 + }, + { + "epoch": 1.78, + "logps_train/chosen": -170.8963623046875, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -178.3448028564453, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.1062872409820557, + "rewards_train/margins": 3.4995310306549072, + "rewards_train/rejected": -4.605818271636963, + "step": 156 + }, + { + "epoch": 1.78, + "logps_train/chosen": -152.4623260498047, + "logps_train/ref_chosen": -133.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -152.89100646972656, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.9597091674804688, + "rewards_train/margins": 2.9297332763671875, + "rewards_train/rejected": -4.889442443847656, + "step": 156 + }, + { + "epoch": 1.78, + "logps_train/chosen": -140.10516357421875, + "logps_train/ref_chosen": -129.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -136.63473510742188, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.0900087356567383, + "rewards_train/margins": 3.295339584350586, + "rewards_train/rejected": -4.385348320007324, + "step": 156 + }, + { + "epoch": 1.78, + "logps_train/chosen": -178.86492919921875, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -143.0, + "logps_train/rejected": -188.87057495117188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.0580618381500244, + "rewards_train/margins": 3.5533130168914795, + "rewards_train/rejected": -4.611374855041504, + "step": 156 + }, + { + "epoch": 1.79, + "learning_rate": 1.8105750521642034e-05, + "loss": 0.1377, + "step": 157 + }, + { + "epoch": 1.79, + "logps_train/chosen": -176.15272521972656, + "logps_train/ref_chosen": -161.0, + "logps_train/ref_rejected": -145.0, + "logps_train/rejected": -193.9800567626953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5117566585540771, + "rewards_train/margins": 3.3692567348480225, + "rewards_train/rejected": -4.8810133934021, + "step": 157 + }, + { + "epoch": 1.79, + "logps_train/chosen": -165.08853149414062, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -168.65086364746094, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.9819000959396362, + "rewards_train/margins": 4.159559845924377, + "rewards_train/rejected": -5.141459941864014, + "step": 157 + }, + { + "epoch": 1.79, + "logps_train/chosen": -171.33531188964844, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -165.46160888671875, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.4948601722717285, + "rewards_train/margins": 3.4002251625061035, + "rewards_train/rejected": -4.895085334777832, + "step": 157 + }, + { + "epoch": 1.79, + "logps_train/chosen": -158.28863525390625, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -168.30862426757812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3315372467041016, + "rewards_train/margins": 3.280808448791504, + "rewards_train/rejected": -4.6123456954956055, + "step": 157 + }, + { + "epoch": 1.81, + "learning_rate": 1.780789313911722e-05, + "loss": 0.1212, + "step": 158 + }, + { + "epoch": 1.81, + "logps_train/chosen": -164.3837890625, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -145.0, + "logps_train/rejected": -195.4906005859375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.437110424041748, + "rewards_train/margins": 3.6194682121276855, + "rewards_train/rejected": -5.056578636169434, + "step": 158 + }, + { + "epoch": 1.81, + "logps_train/chosen": -167.58389282226562, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -165.21438598632812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1453032493591309, + "rewards_train/margins": 3.3836545944213867, + "rewards_train/rejected": -4.528957843780518, + "step": 158 + }, + { + "epoch": 1.81, + "logps_train/chosen": -167.3607177734375, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -95.0, + "logps_train/rejected": -142.85699462890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.172424077987671, + "rewards_train/margins": 3.6320254802703857, + "rewards_train/rejected": -4.804449558258057, + "step": 158 + }, + { + "epoch": 1.81, + "logps_train/chosen": -138.3846893310547, + "logps_train/ref_chosen": -118.0, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -128.16195678710938, + "rewards_train/accuracies": 0.84375, + "rewards_train/chosen": -2.0708906650543213, + "rewards_train/margins": 1.8548266887664795, + "rewards_train/rejected": -3.925717353820801, + "step": 158 + }, + { + "epoch": 1.82, + "learning_rate": 1.7511144699669966e-05, + "loss": 0.1585, + "step": 159 + }, + { + "epoch": 1.82, + "logps_train/chosen": -184.75267028808594, + "logps_train/ref_chosen": -177.0, + "logps_train/ref_rejected": -135.0, + "logps_train/rejected": -182.9331512451172, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -0.784446656703949, + "rewards_train/margins": 4.016046583652496, + "rewards_train/rejected": -4.800493240356445, + "step": 159 + }, + { + "epoch": 1.82, + "logps_train/chosen": -138.78643798828125, + "logps_train/ref_chosen": -118.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -133.27508544921875, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -2.0814690589904785, + "rewards_train/margins": 2.621307849884033, + "rewards_train/rejected": -4.702776908874512, + "step": 159 + }, + { + "epoch": 1.82, + "logps_train/chosen": -174.599609375, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -172.98977661132812, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.085846424102783, + "rewards_train/margins": 3.226027488708496, + "rewards_train/rejected": -5.311873912811279, + "step": 159 + }, + { + "epoch": 1.82, + "logps_train/chosen": -154.55592346191406, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -148.77499389648438, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5021743774414062, + "rewards_train/margins": 2.780306339263916, + "rewards_train/rejected": -4.282480716705322, + "step": 159 + }, + { + "epoch": 1.83, + "learning_rate": 1.7215550958617034e-05, + "loss": 0.1462, + "step": 160 + }, + { + "epoch": 1.83, + "logps_train/chosen": -162.04150390625, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -162.67465209960938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.419677495956421, + "rewards_train/margins": 3.43099045753479, + "rewards_train/rejected": -4.850667953491211, + "step": 160 + }, + { + "epoch": 1.83, + "logps_train/chosen": -147.52159118652344, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -146.4416961669922, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.1266707181930542, + "rewards_train/margins": 3.2333184480667114, + "rewards_train/rejected": -4.359989166259766, + "step": 160 + }, + { + "epoch": 1.83, + "logps_train/chosen": -168.31619262695312, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -152.9374542236328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2656030654907227, + "rewards_train/margins": 3.5036306381225586, + "rewards_train/rejected": -4.769233703613281, + "step": 160 + }, + { + "epoch": 1.83, + "logps_train/chosen": -205.88299560546875, + "logps_train/ref_chosen": -196.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -174.3811492919922, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -0.9983091354370117, + "rewards_train/margins": 3.4017200469970703, + "rewards_train/rejected": -4.400029182434082, + "step": 160 + }, + { + "epoch": 1.84, + "learning_rate": 1.6921157493233532e-05, + "loss": 0.1424, + "step": 161 + }, + { + "epoch": 1.84, + "logps_train/chosen": -167.32449340820312, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -112.5, + "logps_train/rejected": -164.69781494140625, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.4970972537994385, + "rewards_train/margins": 3.72248911857605, + "rewards_train/rejected": -5.219586372375488, + "step": 161 + }, + { + "epoch": 1.84, + "logps_train/chosen": -172.23770141601562, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -108.0, + "logps_train/rejected": -154.22283935546875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.0136151313781738, + "rewards_train/margins": 3.5768332481384277, + "rewards_train/rejected": -4.590448379516602, + "step": 161 + }, + { + "epoch": 1.84, + "logps_train/chosen": -151.36978149414062, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -122.5, + "logps_train/rejected": -171.3380126953125, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.3356115818023682, + "rewards_train/margins": 3.544039487838745, + "rewards_train/rejected": -4.879651069641113, + "step": 161 + }, + { + "epoch": 1.84, + "logps_train/chosen": -199.06410217285156, + "logps_train/ref_chosen": -188.0, + "logps_train/ref_rejected": -147.0, + "logps_train/rejected": -194.74569702148438, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.1116838455200195, + "rewards_train/margins": 3.712299346923828, + "rewards_train/rejected": -4.823983192443848, + "step": 161 + }, + { + "epoch": 1.85, + "learning_rate": 1.6628009695725346e-05, + "loss": 0.129, + "step": 162 + }, + { + "epoch": 1.85, + "logps_train/chosen": -131.08058166503906, + "logps_train/ref_chosen": -121.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -166.07493591308594, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.0106275081634521, + "rewards_train/margins": 3.0171782970428467, + "rewards_train/rejected": -4.027805805206299, + "step": 162 + }, + { + "epoch": 1.85, + "logps_train/chosen": -137.0918426513672, + "logps_train/ref_chosen": -115.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -155.005859375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.2230069637298584, + "rewards_train/margins": 2.3856842517852783, + "rewards_train/rejected": -4.608691215515137, + "step": 162 + }, + { + "epoch": 1.85, + "logps_train/chosen": -135.45196533203125, + "logps_train/ref_chosen": -121.0, + "logps_train/ref_rejected": -115.0, + "logps_train/rejected": -156.66986083984375, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.433404564857483, + "rewards_train/margins": 2.7299917936325073, + "rewards_train/rejected": -4.16339635848999, + "step": 162 + }, + { + "epoch": 1.85, + "logps_train/chosen": -134.3225860595703, + "logps_train/ref_chosen": -114.0, + "logps_train/ref_rejected": -107.5, + "logps_train/rejected": -150.91635131835938, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -2.0572102069854736, + "rewards_train/margins": 2.305445909500122, + "rewards_train/rejected": -4.362656116485596, + "step": 162 + }, + { + "epoch": 1.86, + "learning_rate": 1.6336152766230232e-05, + "loss": 0.1969, + "step": 163 + }, + { + "epoch": 1.86, + "logps_train/chosen": -142.69918823242188, + "logps_train/ref_chosen": -131.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -166.99905395507812, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.13105309009552, + "rewards_train/margins": 3.8543986082077026, + "rewards_train/rejected": -4.985451698303223, + "step": 163 + }, + { + "epoch": 1.86, + "logps_train/chosen": -167.85174560546875, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -168.38937377929688, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.7078323364257812, + "rewards_train/margins": 3.499269485473633, + "rewards_train/rejected": -5.207101821899414, + "step": 163 + }, + { + "epoch": 1.86, + "logps_train/chosen": -165.04327392578125, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -174.62939453125, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.4283990859985352, + "rewards_train/margins": 3.1969175338745117, + "rewards_train/rejected": -4.625316619873047, + "step": 163 + }, + { + "epoch": 1.86, + "logps_train/chosen": -189.43804931640625, + "logps_train/ref_chosen": -179.0, + "logps_train/ref_rejected": -135.0, + "logps_train/rejected": -183.73580932617188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0531786680221558, + "rewards_train/margins": 3.821672558784485, + "rewards_train/rejected": -4.874851226806641, + "step": 163 + }, + { + "epoch": 1.87, + "learning_rate": 1.6045631705848404e-05, + "loss": 0.1373, + "step": 164 + }, + { + "epoch": 1.87, + "logps_train/chosen": -160.4321746826172, + "logps_train/ref_chosen": -141.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -136.58428955078125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.9580607414245605, + "rewards_train/margins": 2.7091808319091797, + "rewards_train/rejected": -4.66724157333374, + "step": 164 + }, + { + "epoch": 1.87, + "logps_train/chosen": -157.96083068847656, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -179.47161865234375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.4216691255569458, + "rewards_train/margins": 3.2209337949752808, + "rewards_train/rejected": -4.642602920532227, + "step": 164 + }, + { + "epoch": 1.87, + "logps_train/chosen": -156.0556640625, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -106.5, + "logps_train/rejected": -152.37881469726562, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.3762691020965576, + "rewards_train/margins": 3.1942293643951416, + "rewards_train/rejected": -4.570498466491699, + "step": 164 + }, + { + "epoch": 1.87, + "logps_train/chosen": -154.2939453125, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -157.37115478515625, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.6712883710861206, + "rewards_train/margins": 2.8872863054275513, + "rewards_train/rejected": -4.558574676513672, + "step": 164 + }, + { + "epoch": 1.89, + "learning_rate": 1.5756491309703875e-05, + "loss": 0.1581, + "step": 165 + }, + { + "epoch": 1.89, + "logps_train/chosen": -135.0899658203125, + "logps_train/ref_chosen": -121.5, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -153.34164428710938, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.354859709739685, + "rewards_train/margins": 3.076668381690979, + "rewards_train/rejected": -4.431528091430664, + "step": 165 + }, + { + "epoch": 1.89, + "logps_train/chosen": -184.4627227783203, + "logps_train/ref_chosen": -179.0, + "logps_train/ref_rejected": -152.0, + "logps_train/rejected": -203.6276397705078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5243486166000366, + "rewards_train/margins": 4.656775116920471, + "rewards_train/rejected": -5.181123733520508, + "step": 165 + }, + { + "epoch": 1.89, + "logps_train/chosen": -161.5297393798828, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -185.64584350585938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4633244276046753, + "rewards_train/margins": 3.1752835512161255, + "rewards_train/rejected": -4.638607978820801, + "step": 165 + }, + { + "epoch": 1.89, + "logps_train/chosen": -185.7200164794922, + "logps_train/ref_chosen": -175.0, + "logps_train/ref_rejected": -151.0, + "logps_train/rejected": -196.57159423828125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.0503220558166504, + "rewards_train/margins": 3.486915111541748, + "rewards_train/rejected": -4.537237167358398, + "step": 165 + }, + { + "epoch": 1.9, + "learning_rate": 1.5468776160037556e-05, + "loss": 0.1205, + "step": 166 + }, + { + "epoch": 1.9, + "logps_train/chosen": -168.28759765625, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -169.3665313720703, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.3095204830169678, + "rewards_train/margins": 2.7026207447052, + "rewards_train/rejected": -4.012141227722168, + "step": 166 + }, + { + "epoch": 1.9, + "logps_train/chosen": -141.98912048339844, + "logps_train/ref_chosen": -126.5, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -143.87429809570312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.543443202972412, + "rewards_train/margins": 3.1720151901245117, + "rewards_train/rejected": -4.715458393096924, + "step": 166 + }, + { + "epoch": 1.9, + "logps_train/chosen": -146.0329132080078, + "logps_train/ref_chosen": -134.0, + "logps_train/ref_rejected": -118.0, + "logps_train/rejected": -158.03030395507812, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.224214792251587, + "rewards_train/margins": 2.7690494060516357, + "rewards_train/rejected": -3.9932641983032227, + "step": 166 + }, + { + "epoch": 1.9, + "logps_train/chosen": -185.71340942382812, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -182.1860809326172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5620156526565552, + "rewards_train/margins": 3.35896098613739, + "rewards_train/rejected": -4.920976638793945, + "step": 166 + }, + { + "epoch": 1.91, + "learning_rate": 1.5182530619333169e-05, + "loss": 0.1542, + "step": 167 + }, + { + "epoch": 1.91, + "logps_train/chosen": -149.9785919189453, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -152.57215881347656, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.3188555240631104, + "rewards_train/margins": 3.0212714672088623, + "rewards_train/rejected": -4.340126991271973, + "step": 167 + }, + { + "epoch": 1.91, + "logps_train/chosen": -168.6658935546875, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -182.68592834472656, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.816979169845581, + "rewards_train/margins": 3.0047385692596436, + "rewards_train/rejected": -4.821717739105225, + "step": 167 + }, + { + "epoch": 1.91, + "logps_train/chosen": -164.0330810546875, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -181.12750244140625, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.766735315322876, + "rewards_train/margins": 3.4669129848480225, + "rewards_train/rejected": -5.233648300170898, + "step": 167 + }, + { + "epoch": 1.91, + "logps_train/chosen": -201.2147674560547, + "logps_train/ref_chosen": -189.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -180.60372924804688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.19296133518219, + "rewards_train/margins": 4.000518202781677, + "rewards_train/rejected": -5.193479537963867, + "step": 167 + }, + { + "epoch": 1.92, + "learning_rate": 1.4897798823477043e-05, + "loss": 0.1167, + "step": 168 + }, + { + "epoch": 1.92, + "logps_train/chosen": -154.00045776367188, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -107.5, + "logps_train/rejected": -161.27391052246094, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.832126498222351, + "rewards_train/margins": 3.5510259866714478, + "rewards_train/rejected": -5.383152484893799, + "step": 168 + }, + { + "epoch": 1.92, + "logps_train/chosen": -171.59112548828125, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -170.0, + "logps_train/rejected": -216.93157958984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3428027629852295, + "rewards_train/margins": 3.343519449234009, + "rewards_train/rejected": -4.686322212219238, + "step": 168 + }, + { + "epoch": 1.92, + "logps_train/chosen": -174.04019165039062, + "logps_train/ref_chosen": -161.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -172.20095825195312, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.2645671367645264, + "rewards_train/margins": 3.8841536045074463, + "rewards_train/rejected": -5.148720741271973, + "step": 168 + }, + { + "epoch": 1.92, + "logps_train/chosen": -169.83084106445312, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -110.0, + "logps_train/rejected": -158.2201690673828, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.683249592781067, + "rewards_train/margins": 3.1385172605514526, + "rewards_train/rejected": -4.8217668533325195, + "step": 168 + }, + { + "epoch": 1.93, + "learning_rate": 1.4614624674952842e-05, + "loss": 0.1329, + "step": 169 + }, + { + "epoch": 1.93, + "logps_train/chosen": -177.6451416015625, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -176.8157958984375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.3176395893096924, + "rewards_train/margins": 3.5905025005340576, + "rewards_train/rejected": -4.90814208984375, + "step": 169 + }, + { + "epoch": 1.93, + "logps_train/chosen": -178.3532257080078, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -161.0, + "logps_train/rejected": -208.62347412109375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.913349449634552, + "rewards_train/margins": 3.8398191332817078, + "rewards_train/rejected": -4.75316858291626, + "step": 169 + }, + { + "epoch": 1.93, + "logps_train/chosen": -160.98016357421875, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -102.5, + "logps_train/rejected": -149.50071716308594, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.2862005233764648, + "rewards_train/margins": 3.435941696166992, + "rewards_train/rejected": -4.722142219543457, + "step": 169 + }, + { + "epoch": 1.93, + "logps_train/chosen": -214.80264282226562, + "logps_train/ref_chosen": -205.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -185.12115478515625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0412020683288574, + "rewards_train/margins": 4.142787456512451, + "rewards_train/rejected": -5.183989524841309, + "step": 169 + }, + { + "epoch": 1.94, + "learning_rate": 1.4333051836072298e-05, + "loss": 0.1112, + "step": 170 + }, + { + "epoch": 1.94, + "logps_train/chosen": -177.8150634765625, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -182.40084838867188, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -1.512269139289856, + "rewards_train/margins": 3.168343424797058, + "rewards_train/rejected": -4.680612564086914, + "step": 170 + }, + { + "epoch": 1.94, + "logps_train/chosen": -157.65682983398438, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -151.7862091064453, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.602914571762085, + "rewards_train/margins": 3.1795151233673096, + "rewards_train/rejected": -4.7824296951293945, + "step": 170 + }, + { + "epoch": 1.94, + "logps_train/chosen": -176.32493591308594, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -95.0, + "logps_train/rejected": -140.84335327148438, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.5847883224487305, + "rewards_train/margins": 2.9966893196105957, + "rewards_train/rejected": -4.581477642059326, + "step": 170 + }, + { + "epoch": 1.94, + "logps_train/chosen": -140.81861877441406, + "logps_train/ref_chosen": -122.5, + "logps_train/ref_rejected": -115.0, + "logps_train/rejected": -162.77423095703125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8218281269073486, + "rewards_train/margins": 2.9552524089813232, + "rewards_train/rejected": -4.777080535888672, + "step": 170 + }, + { + "epoch": 1.95, + "learning_rate": 1.405312372224294e-05, + "loss": 0.1776, + "step": 171 + }, + { + "epoch": 1.95, + "logps_train/chosen": -138.87081909179688, + "logps_train/ref_chosen": -124.0, + "logps_train/ref_rejected": -106.5, + "logps_train/rejected": -153.38121032714844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.489914894104004, + "rewards_train/margins": 3.1869759559631348, + "rewards_train/rejected": -4.676890850067139, + "step": 171 + }, + { + "epoch": 1.95, + "logps_train/chosen": -162.38807678222656, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -174.22418212890625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.095973014831543, + "rewards_train/margins": 3.369903087615967, + "rewards_train/rejected": -4.46587610244751, + "step": 171 + }, + { + "epoch": 1.95, + "logps_train/chosen": -162.04440307617188, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -121.5, + "logps_train/rejected": -163.14654541015625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.0364702939987183, + "rewards_train/margins": 3.1233197450637817, + "rewards_train/rejected": -4.1597900390625, + "step": 171 + }, + { + "epoch": 1.95, + "logps_train/chosen": -145.47537231445312, + "logps_train/ref_chosen": -131.0, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -128.49514770507812, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.4886016845703125, + "rewards_train/margins": 2.50949764251709, + "rewards_train/rejected": -3.9980993270874023, + "step": 171 + }, + { + "epoch": 1.97, + "learning_rate": 1.3774883495273985e-05, + "loss": 0.155, + "step": 172 + }, + { + "epoch": 1.97, + "logps_train/chosen": -164.489013671875, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -144.42698669433594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8249762058258057, + "rewards_train/margins": 2.9649884700775146, + "rewards_train/rejected": -4.78996467590332, + "step": 172 + }, + { + "epoch": 1.97, + "logps_train/chosen": -149.56802368164062, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -144.60494995117188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.797525405883789, + "rewards_train/margins": 2.945197105407715, + "rewards_train/rejected": -4.742722511291504, + "step": 172 + }, + { + "epoch": 1.97, + "logps_train/chosen": -142.54705810546875, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -157.16668701171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2089290618896484, + "rewards_train/margins": 3.610962390899658, + "rewards_train/rejected": -4.819891452789307, + "step": 172 + }, + { + "epoch": 1.97, + "logps_train/chosen": -162.0865936279297, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -120.5, + "logps_train/rejected": -163.69024658203125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1669116020202637, + "rewards_train/margins": 3.1393208503723145, + "rewards_train/rejected": -4.306232452392578, + "step": 172 + }, + { + "epoch": 1.98, + "learning_rate": 1.3498374056721197e-05, + "loss": 0.1313, + "step": 173 + }, + { + "epoch": 1.98, + "logps_train/chosen": -166.25057983398438, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -172.05795288085938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.346738338470459, + "rewards_train/margins": 3.3271727561950684, + "rewards_train/rejected": -4.673911094665527, + "step": 173 + }, + { + "epoch": 1.98, + "logps_train/chosen": -170.06703186035156, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -181.25033569335938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.454749345779419, + "rewards_train/margins": 3.6730196475982666, + "rewards_train/rejected": -5.1277689933776855, + "step": 173 + }, + { + "epoch": 1.98, + "logps_train/chosen": -140.8558349609375, + "logps_train/ref_chosen": -124.5, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -145.61558532714844, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.617444396018982, + "rewards_train/margins": 3.549827218055725, + "rewards_train/rejected": -5.167271614074707, + "step": 173 + }, + { + "epoch": 1.98, + "logps_train/chosen": -215.5902099609375, + "logps_train/ref_chosen": -202.0, + "logps_train/ref_rejected": -167.0, + "logps_train/rejected": -217.3975067138672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4094116687774658, + "rewards_train/margins": 3.6104161739349365, + "rewards_train/rejected": -5.019827842712402, + "step": 173 + }, + { + "epoch": 1.99, + "learning_rate": 1.3223638041271979e-05, + "loss": 0.1236, + "step": 174 + }, + { + "epoch": 1.99, + "logps_train/chosen": -196.11367797851562, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -175.70323181152344, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.3418858051300049, + "rewards_train/margins": 3.1817080974578857, + "rewards_train/rejected": -4.523593902587891, + "step": 174 + }, + { + "epoch": 1.99, + "logps_train/chosen": -142.29310607910156, + "logps_train/ref_chosen": -129.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -174.541015625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.3043105602264404, + "rewards_train/margins": 3.299423933029175, + "rewards_train/rejected": -4.603734493255615, + "step": 174 + }, + { + "epoch": 1.99, + "logps_train/chosen": -172.05809020996094, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -163.4266357421875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.619040846824646, + "rewards_train/margins": 3.089199662208557, + "rewards_train/rejected": -4.708240509033203, + "step": 174 + }, + { + "epoch": 1.99, + "logps_train/chosen": -171.18453979492188, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -167.75234985351562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.761239767074585, + "rewards_train/margins": 3.911907911300659, + "rewards_train/rejected": -4.673147678375244, + "step": 174 + }, + { + "epoch": 2.0, + "learning_rate": 1.2950717810171558e-05, + "loss": 0.124, + "step": 175 + }, + { + "epoch": 2.0, + "logps_train/chosen": -178.75906372070312, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -108.5, + "logps_train/rejected": -158.2779998779297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.51145339012146, + "rewards_train/margins": 3.475623846054077, + "rewards_train/rejected": -4.987077236175537, + "step": 175 + }, + { + "epoch": 2.0, + "logps_train/chosen": -144.22889709472656, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -107.5, + "logps_train/rejected": -159.568603515625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2451064586639404, + "rewards_train/margins": 3.9777700901031494, + "rewards_train/rejected": -5.22287654876709, + "step": 175 + }, + { + "epoch": 2.0, + "logps_train/chosen": -159.3805694580078, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -156.07077026367188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3049516677856445, + "rewards_train/margins": 3.2286882400512695, + "rewards_train/rejected": -4.533639907836914, + "step": 175 + }, + { + "epoch": 2.0, + "logps_train/chosen": -160.80697631835938, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -127.5, + "logps_train/rejected": -182.2629852294922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6911461353302002, + "rewards_train/margins": 3.7947232723236084, + "rewards_train/rejected": -5.485869407653809, + "step": 175 + }, + { + "epoch": 2.01, + "learning_rate": 1.2679655444691369e-05, + "loss": 0.0755, + "step": 176 + }, + { + "epoch": 2.01, + "logps_train/chosen": -161.154052734375, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -167.23745727539062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2068119049072266, + "rewards_train/margins": 3.918104648590088, + "rewards_train/rejected": -5.1249165534973145, + "step": 176 + }, + { + "epoch": 2.01, + "logps_train/chosen": -145.90554809570312, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -144.0738525390625, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.574514627456665, + "rewards_train/margins": 3.397310972213745, + "rewards_train/rejected": -4.97182559967041, + "step": 176 + }, + { + "epoch": 2.01, + "logps_train/chosen": -192.4039306640625, + "logps_train/ref_chosen": -184.0, + "logps_train/ref_rejected": -121.5, + "logps_train/rejected": -172.21121215820312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8327764272689819, + "rewards_train/margins": 4.236098408699036, + "rewards_train/rejected": -5.068874835968018, + "step": 176 + }, + { + "epoch": 2.01, + "logps_train/chosen": -153.53134155273438, + "logps_train/ref_chosen": -139.0, + "logps_train/ref_rejected": -117.5, + "logps_train/rejected": -175.02682495117188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4470796585083008, + "rewards_train/margins": 4.318201065063477, + "rewards_train/rejected": -5.765280723571777, + "step": 176 + }, + { + "epoch": 2.02, + "learning_rate": 1.2410492739640592e-05, + "loss": 0.085, + "step": 177 + }, + { + "epoch": 2.02, + "logps_train/chosen": -191.07339477539062, + "logps_train/ref_chosen": -184.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -188.9764404296875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.7616373896598816, + "rewards_train/margins": 4.577509701251984, + "rewards_train/rejected": -5.339147090911865, + "step": 177 + }, + { + "epoch": 2.02, + "logps_train/chosen": -185.62136840820312, + "logps_train/ref_chosen": -179.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -184.6957550048828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6105749011039734, + "rewards_train/margins": 4.272965252399445, + "rewards_train/rejected": -4.883540153503418, + "step": 177 + }, + { + "epoch": 2.02, + "logps_train/chosen": -191.00115966796875, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -179.1207733154297, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.8412289619445801, + "rewards_train/margins": 3.8856916427612305, + "rewards_train/rejected": -4.7269206047058105, + "step": 177 + }, + { + "epoch": 2.02, + "logps_train/chosen": -163.87918090820312, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -135.0, + "logps_train/rejected": -188.97149658203125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.558229923248291, + "rewards_train/margins": 3.8317909240722656, + "rewards_train/rejected": -5.390020847320557, + "step": 177 + }, + { + "epoch": 2.03, + "learning_rate": 1.2143271196921831e-05, + "loss": 0.0734, + "step": 178 + }, + { + "epoch": 2.03, + "logps_train/chosen": -149.78662109375, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -154.28677368164062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2491698265075684, + "rewards_train/margins": 3.2221827507019043, + "rewards_train/rejected": -4.471352577209473, + "step": 178 + }, + { + "epoch": 2.03, + "logps_train/chosen": -180.86485290527344, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -168.6180419921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9179306030273438, + "rewards_train/margins": 3.6434578895568848, + "rewards_train/rejected": -4.5613884925842285, + "step": 178 + }, + { + "epoch": 2.03, + "logps_train/chosen": -191.31576538085938, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -154.0, + "logps_train/rejected": -205.51625061035156, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.9044272899627686, + "rewards_train/margins": 4.209453344345093, + "rewards_train/rejected": -5.113880634307861, + "step": 178 + }, + { + "epoch": 2.03, + "logps_train/chosen": -157.76119995117188, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -172.5016632080078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2712382078170776, + "rewards_train/margins": 3.9011937379837036, + "rewards_train/rejected": -5.172431945800781, + "step": 178 + }, + { + "epoch": 2.05, + "learning_rate": 1.1878032019132016e-05, + "loss": 0.0712, + "step": 179 + }, + { + "epoch": 2.05, + "logps_train/chosen": -179.49224853515625, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -168.34971618652344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9605532884597778, + "rewards_train/margins": 4.291605830192566, + "rewards_train/rejected": -5.252159118652344, + "step": 179 + }, + { + "epoch": 2.05, + "logps_train/chosen": -182.27786254882812, + "logps_train/ref_chosen": -183.0, + "logps_train/ref_rejected": -147.0, + "logps_train/rejected": -195.55474853515625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.051900215446949005, + "rewards_train/margins": 4.886940307915211, + "rewards_train/rejected": -4.835040092468262, + "step": 179 + }, + { + "epoch": 2.05, + "logps_train/chosen": -144.2906494140625, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -156.61428833007812, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.2723876237869263, + "rewards_train/margins": 3.064677119255066, + "rewards_train/rejected": -4.337064743041992, + "step": 179 + }, + { + "epoch": 2.05, + "logps_train/chosen": -184.02662658691406, + "logps_train/ref_chosen": -177.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -187.46713256835938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.6745372414588928, + "rewards_train/margins": 4.093368470668793, + "rewards_train/rejected": -4.7679057121276855, + "step": 179 + }, + { + "epoch": 2.06, + "learning_rate": 1.1614816103209363e-05, + "loss": 0.0749, + "step": 180 + }, + { + "epoch": 2.06, + "logps_train/chosen": -152.71478271484375, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -169.14288330078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3001878261566162, + "rewards_train/margins": 3.4307992458343506, + "rewards_train/rejected": -4.730987071990967, + "step": 180 + }, + { + "epoch": 2.06, + "logps_train/chosen": -175.60665893554688, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -172.89309692382812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.662753701210022, + "rewards_train/margins": 3.5386651754379272, + "rewards_train/rejected": -5.201418876647949, + "step": 180 + }, + { + "epoch": 2.06, + "logps_train/chosen": -145.56417846679688, + "logps_train/ref_chosen": -134.0, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -156.00181579589844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1640660762786865, + "rewards_train/margins": 3.8189284801483154, + "rewards_train/rejected": -4.982994556427002, + "step": 180 + }, + { + "epoch": 2.06, + "logps_train/chosen": -172.25607299804688, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -152.76123046875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -0.6297088265419006, + "rewards_train/margins": 4.331765592098236, + "rewards_train/rejected": -4.961474418640137, + "step": 180 + }, + { + "epoch": 2.07, + "learning_rate": 1.1353664034127583e-05, + "loss": 0.0793, + "step": 181 + }, + { + "epoch": 2.07, + "logps_train/chosen": -149.04946899414062, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -154.0411376953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2665691375732422, + "rewards_train/margins": 3.5102977752685547, + "rewards_train/rejected": -4.776866912841797, + "step": 181 + }, + { + "epoch": 2.07, + "logps_train/chosen": -158.56723022460938, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -159.50074768066406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.218638300895691, + "rewards_train/margins": 3.6543132066726685, + "rewards_train/rejected": -4.872951507568359, + "step": 181 + }, + { + "epoch": 2.07, + "logps_train/chosen": -147.66925048828125, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -168.28912353515625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.185003638267517, + "rewards_train/margins": 4.234533905982971, + "rewards_train/rejected": -5.419537544250488, + "step": 181 + }, + { + "epoch": 2.07, + "logps_train/chosen": -179.8079833984375, + "logps_train/ref_chosen": -167.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -197.74644470214844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2968143224716187, + "rewards_train/margins": 4.06601345539093, + "rewards_train/rejected": -5.362827777862549, + "step": 181 + }, + { + "epoch": 2.08, + "learning_rate": 1.1094616078638123e-05, + "loss": 0.0717, + "step": 182 + }, + { + "epoch": 2.08, + "logps_train/chosen": -190.28167724609375, + "logps_train/ref_chosen": -184.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -181.79185485839844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6578567028045654, + "rewards_train/margins": 4.172305345535278, + "rewards_train/rejected": -4.830162048339844, + "step": 182 + }, + { + "epoch": 2.08, + "logps_train/chosen": -170.9873046875, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -172.44073486328125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5471186637878418, + "rewards_train/margins": 4.172564506530762, + "rewards_train/rejected": -4.7196831703186035, + "step": 182 + }, + { + "epoch": 2.08, + "logps_train/chosen": -170.7721405029297, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -159.09173583984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.270280122756958, + "rewards_train/margins": 3.9390900135040283, + "rewards_train/rejected": -5.209370136260986, + "step": 182 + }, + { + "epoch": 2.08, + "logps_train/chosen": -175.2460174560547, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -148.0, + "logps_train/rejected": -195.99169921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9697037935256958, + "rewards_train/margins": 3.803244471549988, + "rewards_train/rejected": -4.772948265075684, + "step": 182 + }, + { + "epoch": 2.09, + "learning_rate": 1.083771217906143e-05, + "loss": 0.0599, + "step": 183 + }, + { + "epoch": 2.09, + "logps_train/chosen": -190.64321899414062, + "logps_train/ref_chosen": -187.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -189.67926025390625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.39599961042404175, + "rewards_train/margins": 4.678371250629425, + "rewards_train/rejected": -5.074370861053467, + "step": 183 + }, + { + "epoch": 2.09, + "logps_train/chosen": -138.62399291992188, + "logps_train/ref_chosen": -128.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -144.83941650390625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.0373992919921875, + "rewards_train/margins": 3.527449131011963, + "rewards_train/rejected": -4.56484842300415, + "step": 183 + }, + { + "epoch": 2.09, + "logps_train/chosen": -136.85711669921875, + "logps_train/ref_chosen": -118.5, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -131.87420654296875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.8283874988555908, + "rewards_train/margins": 2.936718702316284, + "rewards_train/rejected": -4.765106201171875, + "step": 183 + }, + { + "epoch": 2.09, + "logps_train/chosen": -148.16285705566406, + "logps_train/ref_chosen": -141.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -186.79367065429688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6797617077827454, + "rewards_train/margins": 4.7876922488212585, + "rewards_train/rejected": -5.467453956604004, + "step": 183 + }, + { + "epoch": 2.1, + "learning_rate": 1.0582991947128324e-05, + "loss": 0.0863, + "step": 184 + }, + { + "epoch": 2.1, + "logps_train/chosen": -168.52122497558594, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -180.79949951171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8917717337608337, + "rewards_train/margins": 3.991675913333893, + "rewards_train/rejected": -4.883447647094727, + "step": 184 + }, + { + "epoch": 2.1, + "logps_train/chosen": -143.18820190429688, + "logps_train/ref_chosen": -128.0, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -146.11032104492188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5001935958862305, + "rewards_train/margins": 3.6904263496398926, + "rewards_train/rejected": -5.190619945526123, + "step": 184 + }, + { + "epoch": 2.1, + "logps_train/chosen": -147.32476806640625, + "logps_train/ref_chosen": -128.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -184.9254150390625, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.9073777198791504, + "rewards_train/margins": 3.7832117080688477, + "rewards_train/rejected": -5.690589427947998, + "step": 184 + }, + { + "epoch": 2.1, + "logps_train/chosen": -147.30819702148438, + "logps_train/ref_chosen": -133.0, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -155.67486572265625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4561140537261963, + "rewards_train/margins": 3.524458646774292, + "rewards_train/rejected": -4.980572700500488, + "step": 184 + }, + { + "epoch": 2.11, + "learning_rate": 1.0330494657872312e-05, + "loss": 0.0853, + "step": 185 + }, + { + "epoch": 2.11, + "logps_train/chosen": -162.71340942382812, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -127.0, + "logps_train/rejected": -179.08816528320312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3596220016479492, + "rewards_train/margins": 3.8323984146118164, + "rewards_train/rejected": -5.192020416259766, + "step": 185 + }, + { + "epoch": 2.11, + "logps_train/chosen": -162.77186584472656, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -192.93026733398438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.177870273590088, + "rewards_train/margins": 4.452266216278076, + "rewards_train/rejected": -5.630136489868164, + "step": 185 + }, + { + "epoch": 2.11, + "logps_train/chosen": -152.37396240234375, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -195.00845336914062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4057559967041016, + "rewards_train/margins": 4.478293418884277, + "rewards_train/rejected": -5.884049415588379, + "step": 185 + }, + { + "epoch": 2.11, + "logps_train/chosen": -151.218505859375, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -119.5, + "logps_train/rejected": -170.10499572753906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1394281387329102, + "rewards_train/margins": 3.907008647918701, + "rewards_train/rejected": -5.046436786651611, + "step": 185 + }, + { + "epoch": 2.13, + "learning_rate": 1.0080259243573789e-05, + "loss": 0.0513, + "step": 186 + }, + { + "epoch": 2.13, + "logps_train/chosen": -166.97433471679688, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -182.7489471435547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.403684139251709, + "rewards_train/margins": 4.301093101501465, + "rewards_train/rejected": -5.704777240753174, + "step": 186 + }, + { + "epoch": 2.13, + "logps_train/chosen": -131.2030792236328, + "logps_train/ref_chosen": -119.0, + "logps_train/ref_rejected": -119.5, + "logps_train/rejected": -171.1971435546875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.2050743103027344, + "rewards_train/margins": 3.998037815093994, + "rewards_train/rejected": -5.2031121253967285, + "step": 186 + }, + { + "epoch": 2.13, + "logps_train/chosen": -153.02066040039062, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -172.2113494873047, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.3235623836517334, + "rewards_train/margins": 3.6091701984405518, + "rewards_train/rejected": -4.932732582092285, + "step": 186 + }, + { + "epoch": 2.13, + "logps_train/chosen": -162.50509643554688, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -127.5, + "logps_train/rejected": -177.67140197753906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7423069477081299, + "rewards_train/margins": 4.284990072250366, + "rewards_train/rejected": -5.027297019958496, + "step": 186 + }, + { + "epoch": 2.14, + "learning_rate": 9.832324287757158e-06, + "loss": 0.0607, + "step": 187 + }, + { + "epoch": 2.14, + "logps_train/chosen": -145.08290100097656, + "logps_train/ref_chosen": -133.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -177.64239501953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1745011806488037, + "rewards_train/margins": 4.237786531448364, + "rewards_train/rejected": -5.412287712097168, + "step": 187 + }, + { + "epoch": 2.14, + "logps_train/chosen": -172.32980346679688, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -179.23846435546875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4877278804779053, + "rewards_train/margins": 3.9437358379364014, + "rewards_train/rejected": -5.431463718414307, + "step": 187 + }, + { + "epoch": 2.14, + "logps_train/chosen": -153.09422302246094, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -191.51918029785156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5256340503692627, + "rewards_train/margins": 4.172573804855347, + "rewards_train/rejected": -5.698207855224609, + "step": 187 + }, + { + "epoch": 2.14, + "logps_train/chosen": -176.45843505859375, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -108.5, + "logps_train/rejected": -167.15196228027344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1481870412826538, + "rewards_train/margins": 4.72482168674469, + "rewards_train/rejected": -5.873008728027344, + "step": 187 + }, + { + "epoch": 2.15, + "learning_rate": 9.586728019241623e-06, + "loss": 0.0399, + "step": 188 + }, + { + "epoch": 2.15, + "logps_train/chosen": -154.5926513671875, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -142.7076873779297, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.872741937637329, + "rewards_train/margins": 3.5749804973602295, + "rewards_train/rejected": -5.447722434997559, + "step": 188 + }, + { + "epoch": 2.15, + "logps_train/chosen": -207.6587371826172, + "logps_train/ref_chosen": -192.0, + "logps_train/ref_rejected": -169.0, + "logps_train/rejected": -234.085693359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6053264141082764, + "rewards_train/margins": 4.921602487564087, + "rewards_train/rejected": -6.526928901672363, + "step": 188 + }, + { + "epoch": 2.15, + "logps_train/chosen": -149.80093383789062, + "logps_train/ref_chosen": -133.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -179.54971313476562, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.6450353860855103, + "rewards_train/margins": 3.782885432243347, + "rewards_train/rejected": -5.427920818328857, + "step": 188 + }, + { + "epoch": 2.15, + "logps_train/chosen": -163.64334106445312, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -112.5, + "logps_train/rejected": -165.87039184570312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0383586883544922, + "rewards_train/margins": 4.294675827026367, + "rewards_train/rejected": -5.333034515380859, + "step": 188 + }, + { + "epoch": 2.16, + "learning_rate": 9.343508306246771e-06, + "loss": 0.0743, + "step": 189 + }, + { + "epoch": 2.16, + "logps_train/chosen": -162.09934997558594, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -95.0, + "logps_train/rejected": -154.43948364257812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7151105403900146, + "rewards_train/margins": 4.205791234970093, + "rewards_train/rejected": -5.920901775360107, + "step": 189 + }, + { + "epoch": 2.16, + "logps_train/chosen": -187.39578247070312, + "logps_train/ref_chosen": -174.0, + "logps_train/ref_rejected": -142.0, + "logps_train/rejected": -198.8822479248047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.358717918395996, + "rewards_train/margins": 4.348062515258789, + "rewards_train/rejected": -5.706780433654785, + "step": 189 + }, + { + "epoch": 2.16, + "logps_train/chosen": -142.24322509765625, + "logps_train/ref_chosen": -126.0, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -162.78729248046875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6396063566207886, + "rewards_train/margins": 3.697179913520813, + "rewards_train/rejected": -5.336786270141602, + "step": 189 + }, + { + "epoch": 2.16, + "logps_train/chosen": -169.55081176757812, + "logps_train/ref_chosen": -157.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -203.53314208984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2864294052124023, + "rewards_train/margins": 5.046573162078857, + "rewards_train/rejected": -6.33300256729126, + "step": 189 + }, + { + "epoch": 2.17, + "learning_rate": 9.102702650553671e-06, + "loss": 0.0536, + "step": 190 + }, + { + "epoch": 2.17, + "logps_train/chosen": -187.0035400390625, + "logps_train/ref_chosen": -178.0, + "logps_train/ref_rejected": -153.0, + "logps_train/rejected": -205.14060974121094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8618786334991455, + "rewards_train/margins": 4.305112600326538, + "rewards_train/rejected": -5.166991233825684, + "step": 190 + }, + { + "epoch": 2.17, + "logps_train/chosen": -148.8421630859375, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -156.6807861328125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.293787956237793, + "rewards_train/margins": 3.910130023956299, + "rewards_train/rejected": -5.203917980194092, + "step": 190 + }, + { + "epoch": 2.17, + "logps_train/chosen": -162.02560424804688, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -110.5, + "logps_train/rejected": -167.29222106933594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2023648023605347, + "rewards_train/margins": 4.4929715394973755, + "rewards_train/rejected": -5.69533634185791, + "step": 190 + }, + { + "epoch": 2.17, + "logps_train/chosen": -149.6796417236328, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -193.026611328125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.1812937259674072, + "rewards_train/margins": 4.417267084121704, + "rewards_train/rejected": -5.598560810089111, + "step": 190 + }, + { + "epoch": 2.18, + "learning_rate": 8.864348181722559e-06, + "loss": 0.0645, + "step": 191 + }, + { + "epoch": 2.18, + "logps_train/chosen": -165.83389282226562, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -161.61724853515625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1698150634765625, + "rewards_train/margins": 3.5971102714538574, + "rewards_train/rejected": -5.76692533493042, + "step": 191 + }, + { + "epoch": 2.18, + "logps_train/chosen": -139.57894897460938, + "logps_train/ref_chosen": -124.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -169.34829711914062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.577218770980835, + "rewards_train/margins": 4.069085359573364, + "rewards_train/rejected": -5.646304130554199, + "step": 191 + }, + { + "epoch": 2.18, + "logps_train/chosen": -188.30484008789062, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -145.0, + "logps_train/rejected": -201.06988525390625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2504067420959473, + "rewards_train/margins": 4.388809680938721, + "rewards_train/rejected": -5.639216423034668, + "step": 191 + }, + { + "epoch": 2.18, + "logps_train/chosen": -193.16360473632812, + "logps_train/ref_chosen": -180.0, + "logps_train/ref_rejected": -143.0, + "logps_train/rejected": -201.50115966796875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3569847345352173, + "rewards_train/margins": 4.465201735496521, + "rewards_train/rejected": -5.822186470031738, + "step": 191 + }, + { + "epoch": 2.19, + "learning_rate": 8.628481651367876e-06, + "loss": 0.0616, + "step": 192 + }, + { + "epoch": 2.19, + "logps_train/chosen": -127.2575912475586, + "logps_train/ref_chosen": -104.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -145.93881225585938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.2746853828430176, + "rewards_train/margins": 3.400190830230713, + "rewards_train/rejected": -5.6748762130737305, + "step": 192 + }, + { + "epoch": 2.19, + "logps_train/chosen": -160.23959350585938, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -163.14401245117188, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.8308442831039429, + "rewards_train/margins": 4.124375939369202, + "rewards_train/rejected": -5.9552202224731445, + "step": 192 + }, + { + "epoch": 2.19, + "logps_train/chosen": -203.55633544921875, + "logps_train/ref_chosen": -195.0, + "logps_train/ref_rejected": -159.0, + "logps_train/rejected": -215.8226776123047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8862971067428589, + "rewards_train/margins": 4.777416348457336, + "rewards_train/rejected": -5.663713455200195, + "step": 192 + }, + { + "epoch": 2.19, + "logps_train/chosen": -154.07431030273438, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -111.5, + "logps_train/rejected": -165.9130859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1087489128112793, + "rewards_train/margins": 4.343350887298584, + "rewards_train/rejected": -5.452099800109863, + "step": 192 + }, + { + "epoch": 2.21, + "learning_rate": 8.395139427491517e-06, + "loss": 0.0737, + "step": 193 + }, + { + "epoch": 2.21, + "logps_train/chosen": -161.2333526611328, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -172.9558868408203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.696040391921997, + "rewards_train/margins": 3.8845107555389404, + "rewards_train/rejected": -5.5805511474609375, + "step": 193 + }, + { + "epoch": 2.21, + "logps_train/chosen": -179.02362060546875, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -148.0, + "logps_train/rejected": -207.66558837890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.911834478378296, + "rewards_train/margins": 4.097449064254761, + "rewards_train/rejected": -6.009283542633057, + "step": 193 + }, + { + "epoch": 2.21, + "logps_train/chosen": -180.44769287109375, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -135.0, + "logps_train/rejected": -193.8340301513672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.302825689315796, + "rewards_train/margins": 4.537803888320923, + "rewards_train/rejected": -5.840629577636719, + "step": 193 + }, + { + "epoch": 2.21, + "logps_train/chosen": -169.3468017578125, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -113.5, + "logps_train/rejected": -172.2258758544922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8215930461883545, + "rewards_train/margins": 4.061346769332886, + "rewards_train/rejected": -5.88293981552124, + "step": 193 + }, + { + "epoch": 2.22, + "learning_rate": 8.164357488875348e-06, + "loss": 0.0566, + "step": 194 + }, + { + "epoch": 2.22, + "logps_train/chosen": -200.07351684570312, + "logps_train/ref_chosen": -184.0, + "logps_train/ref_rejected": -151.0, + "logps_train/rejected": -209.89173889160156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5717073678970337, + "rewards_train/margins": 4.342369437217712, + "rewards_train/rejected": -5.914076805114746, + "step": 194 + }, + { + "epoch": 2.22, + "logps_train/chosen": -171.31915283203125, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -182.50357055664062, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.122978925704956, + "rewards_train/margins": 4.234654664993286, + "rewards_train/rejected": -5.357633590698242, + "step": 194 + }, + { + "epoch": 2.22, + "logps_train/chosen": -115.46754455566406, + "logps_train/ref_chosen": -97.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -135.1890106201172, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.8070234060287476, + "rewards_train/margins": 3.7197171449661255, + "rewards_train/rejected": -5.526740550994873, + "step": 194 + }, + { + "epoch": 2.22, + "logps_train/chosen": -158.11651611328125, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -120.5, + "logps_train/rejected": -179.56265258789062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8428046703338623, + "rewards_train/margins": 4.047834157943726, + "rewards_train/rejected": -5.890638828277588, + "step": 194 + }, + { + "epoch": 2.23, + "learning_rate": 7.936171419533653e-06, + "loss": 0.0772, + "step": 195 + }, + { + "epoch": 2.23, + "logps_train/chosen": -175.13031005859375, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -145.96676635742188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.54506254196167, + "rewards_train/margins": 4.095754146575928, + "rewards_train/rejected": -5.640816688537598, + "step": 195 + }, + { + "epoch": 2.23, + "logps_train/chosen": -173.42425537109375, + "logps_train/ref_chosen": -157.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -184.82421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6176198720932007, + "rewards_train/margins": 4.269880890846252, + "rewards_train/rejected": -5.887500762939453, + "step": 195 + }, + { + "epoch": 2.23, + "logps_train/chosen": -196.7584686279297, + "logps_train/ref_chosen": -183.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -205.75381469726562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.33219575881958, + "rewards_train/margins": 4.877925395965576, + "rewards_train/rejected": -6.210121154785156, + "step": 195 + }, + { + "epoch": 2.23, + "logps_train/chosen": -184.88011169433594, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -196.70294189453125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.7086167335510254, + "rewards_train/margins": 4.16851282119751, + "rewards_train/rejected": -5.877129554748535, + "step": 195 + }, + { + "epoch": 2.24, + "learning_rate": 7.710616403226459e-06, + "loss": 0.0644, + "step": 196 + }, + { + "epoch": 2.24, + "logps_train/chosen": -154.9520263671875, + "logps_train/ref_chosen": -135.0, + "logps_train/ref_rejected": -120.5, + "logps_train/rejected": -183.3641815185547, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.9811406135559082, + "rewards_train/margins": 4.288285732269287, + "rewards_train/rejected": -6.269426345825195, + "step": 196 + }, + { + "epoch": 2.24, + "logps_train/chosen": -180.44895935058594, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -169.0, + "logps_train/rejected": -230.28929138183594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.85615074634552, + "rewards_train/margins": 5.313965678215027, + "rewards_train/rejected": -6.170116424560547, + "step": 196 + }, + { + "epoch": 2.24, + "logps_train/chosen": -158.23910522460938, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -124.5, + "logps_train/rejected": -179.76210021972656, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.5342793464660645, + "rewards_train/margins": 3.97666597366333, + "rewards_train/rejected": -5.5109453201293945, + "step": 196 + }, + { + "epoch": 2.24, + "logps_train/chosen": -148.6269073486328, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -161.82305908203125, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.707613229751587, + "rewards_train/margins": 4.167392015457153, + "rewards_train/rejected": -5.87500524520874, + "step": 196 + }, + { + "epoch": 2.25, + "learning_rate": 7.487727218034646e-06, + "loss": 0.0777, + "step": 197 + }, + { + "epoch": 2.25, + "logps_train/chosen": -184.1099853515625, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -201.27886962890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.521935224533081, + "rewards_train/margins": 4.687398195266724, + "rewards_train/rejected": -6.209333419799805, + "step": 197 + }, + { + "epoch": 2.25, + "logps_train/chosen": -162.78753662109375, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -185.3925323486328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8591243028640747, + "rewards_train/margins": 4.596633791923523, + "rewards_train/rejected": -6.455758094787598, + "step": 197 + }, + { + "epoch": 2.25, + "logps_train/chosen": -155.07752990722656, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -191.83187866210938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9492814540863037, + "rewards_train/margins": 4.413836717605591, + "rewards_train/rejected": -6.3631181716918945, + "step": 197 + }, + { + "epoch": 2.25, + "logps_train/chosen": -158.76815795898438, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -183.657958984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2400364875793457, + "rewards_train/margins": 3.898550510406494, + "rewards_train/rejected": -6.13858699798584, + "step": 197 + }, + { + "epoch": 2.26, + "learning_rate": 7.267538230997487e-06, + "loss": 0.065, + "step": 198 + }, + { + "epoch": 2.26, + "logps_train/chosen": -154.61886596679688, + "logps_train/ref_chosen": -139.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -165.445556640625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5989946126937866, + "rewards_train/margins": 3.7381139993667603, + "rewards_train/rejected": -5.337108612060547, + "step": 198 + }, + { + "epoch": 2.26, + "logps_train/chosen": -174.43276977539062, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -194.56985473632812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.813735842704773, + "rewards_train/margins": 3.821837544441223, + "rewards_train/rejected": -5.635573387145996, + "step": 198 + }, + { + "epoch": 2.26, + "logps_train/chosen": -171.7181396484375, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -204.68441772460938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.9548206329345703, + "rewards_train/margins": 4.630368232727051, + "rewards_train/rejected": -6.585188865661621, + "step": 198 + }, + { + "epoch": 2.26, + "logps_train/chosen": -156.4467010498047, + "logps_train/ref_chosen": -141.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -171.12014770507812, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.553849220275879, + "rewards_train/margins": 3.99556827545166, + "rewards_train/rejected": -5.549417495727539, + "step": 198 + }, + { + "epoch": 2.27, + "learning_rate": 7.05008339281365e-06, + "loss": 0.079, + "step": 199 + }, + { + "epoch": 2.27, + "logps_train/chosen": -201.37559509277344, + "logps_train/ref_chosen": -189.0, + "logps_train/ref_rejected": -159.0, + "logps_train/rejected": -222.2662811279297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2338494062423706, + "rewards_train/margins": 5.101566910743713, + "rewards_train/rejected": -6.335416316986084, + "step": 199 + }, + { + "epoch": 2.27, + "logps_train/chosen": -155.3743438720703, + "logps_train/ref_chosen": -135.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -185.0620880126953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0335280895233154, + "rewards_train/margins": 4.135473966598511, + "rewards_train/rejected": -6.169002056121826, + "step": 199 + }, + { + "epoch": 2.27, + "logps_train/chosen": -214.00473022460938, + "logps_train/ref_chosen": -203.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -188.80557250976562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0920145511627197, + "rewards_train/margins": 4.752117395401001, + "rewards_train/rejected": -5.844131946563721, + "step": 199 + }, + { + "epoch": 2.27, + "logps_train/chosen": -157.0347900390625, + "logps_train/ref_chosen": -139.0, + "logps_train/ref_rejected": -122.5, + "logps_train/rejected": -181.71475219726562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7769157886505127, + "rewards_train/margins": 4.140556573867798, + "rewards_train/rejected": -5.9174723625183105, + "step": 199 + }, + { + "epoch": 2.29, + "learning_rate": 6.835396232606414e-06, + "loss": 0.0586, + "step": 200 + }, + { + "epoch": 2.29, + "logps_train/chosen": -161.72032165527344, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -177.61468505859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.022033214569092, + "rewards_train/margins": 4.182990550994873, + "rewards_train/rejected": -6.205023765563965, + "step": 200 + }, + { + "epoch": 2.29, + "logps_train/chosen": -134.15977478027344, + "logps_train/ref_chosen": -110.5, + "logps_train/ref_rejected": -97.5, + "logps_train/rejected": -164.42269897460938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.3491806983947754, + "rewards_train/margins": 4.333714962005615, + "rewards_train/rejected": -6.682895660400391, + "step": 200 + }, + { + "epoch": 2.29, + "logps_train/chosen": -177.84542846679688, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -127.0, + "logps_train/rejected": -192.08859252929688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0337610244750977, + "rewards_train/margins": 4.461132049560547, + "rewards_train/rejected": -6.4948930740356445, + "step": 200 + }, + { + "epoch": 2.29, + "logps_train/chosen": -196.11642456054688, + "logps_train/ref_chosen": -180.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -202.897705078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6030492782592773, + "rewards_train/margins": 4.695511817932129, + "rewards_train/rejected": -6.298561096191406, + "step": 200 + }, + { + "epoch": 2.3, + "learning_rate": 6.623509852753798e-06, + "loss": 0.0547, + "step": 201 + }, + { + "epoch": 2.3, + "logps_train/chosen": -207.26229858398438, + "logps_train/ref_chosen": -188.0, + "logps_train/ref_rejected": -145.0, + "logps_train/rejected": -211.3671875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9156838655471802, + "rewards_train/margins": 4.669862151145935, + "rewards_train/rejected": -6.585546016693115, + "step": 201 + }, + { + "epoch": 2.3, + "logps_train/chosen": -154.91036987304688, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -166.4049072265625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6626181602478027, + "rewards_train/margins": 4.285686016082764, + "rewards_train/rejected": -5.948304176330566, + "step": 201 + }, + { + "epoch": 2.3, + "logps_train/chosen": -140.320556640625, + "logps_train/ref_chosen": -120.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -179.07057189941406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0460212230682373, + "rewards_train/margins": 4.754591226577759, + "rewards_train/rejected": -6.800612449645996, + "step": 201 + }, + { + "epoch": 2.3, + "logps_train/chosen": -167.5859832763672, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -170.61170959472656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4132864475250244, + "rewards_train/margins": 4.2639000415802, + "rewards_train/rejected": -5.677186489105225, + "step": 201 + }, + { + "epoch": 2.31, + "learning_rate": 6.414456923784593e-06, + "loss": 0.0495, + "step": 202 + }, + { + "epoch": 2.31, + "logps_train/chosen": -174.45191955566406, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -183.5615234375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.61784827709198, + "rewards_train/margins": 4.5230690240859985, + "rewards_train/rejected": -6.1409173011779785, + "step": 202 + }, + { + "epoch": 2.31, + "logps_train/chosen": -164.99447631835938, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -153.0, + "logps_train/rejected": -214.52035522460938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3355815410614014, + "rewards_train/margins": 4.774393320083618, + "rewards_train/rejected": -6.1099748611450195, + "step": 202 + }, + { + "epoch": 2.31, + "logps_train/chosen": -186.1416778564453, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -173.00531005859375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.7241289615631104, + "rewards_train/margins": 3.9765965938568115, + "rewards_train/rejected": -5.700725555419922, + "step": 202 + }, + { + "epoch": 2.31, + "logps_train/chosen": -183.17779541015625, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -193.62240600585938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.05098295211792, + "rewards_train/margins": 4.535184383392334, + "rewards_train/rejected": -6.586167335510254, + "step": 202 + }, + { + "epoch": 2.32, + "learning_rate": 6.208269679340886e-06, + "loss": 0.0549, + "step": 203 + }, + { + "epoch": 2.32, + "logps_train/chosen": -164.665283203125, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -113.5, + "logps_train/rejected": -175.95152282714844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9498772621154785, + "rewards_train/margins": 4.297813415527344, + "rewards_train/rejected": -6.247690677642822, + "step": 203 + }, + { + "epoch": 2.32, + "logps_train/chosen": -179.72842407226562, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -199.34719848632812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5472551584243774, + "rewards_train/margins": 4.814515471458435, + "rewards_train/rejected": -6.3617706298828125, + "step": 203 + }, + { + "epoch": 2.32, + "logps_train/chosen": -139.86717224121094, + "logps_train/ref_chosen": -115.5, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -162.47723388671875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.4136698246002197, + "rewards_train/margins": 3.998114824295044, + "rewards_train/rejected": -6.411784648895264, + "step": 203 + }, + { + "epoch": 2.32, + "logps_train/chosen": -152.8355712890625, + "logps_train/ref_chosen": -134.0, + "logps_train/ref_rejected": -110.0, + "logps_train/rejected": -174.1539306640625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8427374362945557, + "rewards_train/margins": 4.584960222244263, + "rewards_train/rejected": -6.427697658538818, + "step": 203 + }, + { + "epoch": 2.33, + "learning_rate": 6.004979911208006e-06, + "loss": 0.0592, + "step": 204 + }, + { + "epoch": 2.33, + "logps_train/chosen": -150.49539184570312, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -201.37484741210938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8733186721801758, + "rewards_train/margins": 4.557817459106445, + "rewards_train/rejected": -6.431136131286621, + "step": 204 + }, + { + "epoch": 2.33, + "logps_train/chosen": -145.48204040527344, + "logps_train/ref_chosen": -119.5, + "logps_train/ref_rejected": -102.5, + "logps_train/rejected": -165.45367431640625, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -2.6023051738739014, + "rewards_train/margins": 3.6914026737213135, + "rewards_train/rejected": -6.293707847595215, + "step": 204 + }, + { + "epoch": 2.33, + "logps_train/chosen": -163.88088989257812, + "logps_train/ref_chosen": -139.0, + "logps_train/ref_rejected": -124.0, + "logps_train/rejected": -191.28944396972656, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.48164439201355, + "rewards_train/margins": 4.2371437549591064, + "rewards_train/rejected": -6.718788146972656, + "step": 204 + }, + { + "epoch": 2.33, + "logps_train/chosen": -130.41600036621094, + "logps_train/ref_chosen": -109.0, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -160.140869140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.111424207687378, + "rewards_train/margins": 4.071851015090942, + "rewards_train/rejected": -6.18327522277832, + "step": 204 + }, + { + "epoch": 2.34, + "learning_rate": 5.804618964412586e-06, + "loss": 0.0761, + "step": 205 + }, + { + "epoch": 2.34, + "logps_train/chosen": -164.7631072998047, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -161.48898315429688, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.1265063285827637, + "rewards_train/margins": 3.8559865951538086, + "rewards_train/rejected": -5.982492923736572, + "step": 205 + }, + { + "epoch": 2.34, + "logps_train/chosen": -169.73573303222656, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -160.00714111328125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.167128086090088, + "rewards_train/margins": 3.958268642425537, + "rewards_train/rejected": -6.125396728515625, + "step": 205 + }, + { + "epoch": 2.34, + "logps_train/chosen": -210.82345581054688, + "logps_train/ref_chosen": -191.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -185.9019012451172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9597394466400146, + "rewards_train/margins": 4.313043832778931, + "rewards_train/rejected": -6.272783279418945, + "step": 205 + }, + { + "epoch": 2.34, + "logps_train/chosen": -204.55963134765625, + "logps_train/ref_chosen": -180.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -194.1874237060547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.501666784286499, + "rewards_train/margins": 4.02088475227356, + "rewards_train/rejected": -6.522551536560059, + "step": 205 + }, + { + "epoch": 2.35, + "learning_rate": 5.607217732389503e-06, + "loss": 0.0711, + "step": 206 + }, + { + "epoch": 2.35, + "logps_train/chosen": -168.4705352783203, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -167.60079956054688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6452956199645996, + "rewards_train/margins": 4.4018449783325195, + "rewards_train/rejected": -6.047140598297119, + "step": 206 + }, + { + "epoch": 2.35, + "logps_train/chosen": -189.11264038085938, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -184.2760772705078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1286468505859375, + "rewards_train/margins": 4.595640182495117, + "rewards_train/rejected": -6.724287033081055, + "step": 206 + }, + { + "epoch": 2.35, + "logps_train/chosen": -167.8518829345703, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -119.0, + "logps_train/rejected": -179.5463104248047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.855109453201294, + "rewards_train/margins": 4.18018651008606, + "rewards_train/rejected": -6.0352959632873535, + "step": 206 + }, + { + "epoch": 2.35, + "logps_train/chosen": -165.7491912841797, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -152.43096923828125, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.695818305015564, + "rewards_train/margins": 4.109387040138245, + "rewards_train/rejected": -5.805205345153809, + "step": 206 + }, + { + "epoch": 2.37, + "learning_rate": 5.412806652218469e-06, + "loss": 0.0656, + "step": 207 + }, + { + "epoch": 2.37, + "logps_train/chosen": -169.99899291992188, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -170.46067810058594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8358477354049683, + "rewards_train/margins": 4.719790577888489, + "rewards_train/rejected": -6.555638313293457, + "step": 207 + }, + { + "epoch": 2.37, + "logps_train/chosen": -193.06130981445312, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -196.26321411132812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2590603828430176, + "rewards_train/margins": 4.279368877410889, + "rewards_train/rejected": -6.538429260253906, + "step": 207 + }, + { + "epoch": 2.37, + "logps_train/chosen": -193.26812744140625, + "logps_train/ref_chosen": -180.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -202.84500122070312, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.311529517173767, + "rewards_train/margins": 5.111006379127502, + "rewards_train/rejected": -6.4225358963012695, + "step": 207 + }, + { + "epoch": 2.37, + "logps_train/chosen": -148.03170776367188, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -121.5, + "logps_train/rejected": -186.12869262695312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.667038917541504, + "rewards_train/margins": 4.782499313354492, + "rewards_train/rejected": -6.449538230895996, + "step": 207 + }, + { + "epoch": 2.38, + "learning_rate": 5.221415699930951e-06, + "loss": 0.0451, + "step": 208 + }, + { + "epoch": 2.38, + "logps_train/chosen": -194.32412719726562, + "logps_train/ref_chosen": -174.0, + "logps_train/ref_rejected": -124.5, + "logps_train/rejected": -189.37872314453125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.066347599029541, + "rewards_train/margins": 4.420987129211426, + "rewards_train/rejected": -6.487334728240967, + "step": 208 + }, + { + "epoch": 2.38, + "logps_train/chosen": -185.17953491210938, + "logps_train/ref_chosen": -161.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -201.2667236328125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.35320782661438, + "rewards_train/margins": 4.4406516551971436, + "rewards_train/rejected": -6.793859481811523, + "step": 208 + }, + { + "epoch": 2.38, + "logps_train/chosen": -205.48251342773438, + "logps_train/ref_chosen": -195.0, + "logps_train/ref_rejected": -160.0, + "logps_train/rejected": -219.9037628173828, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.028524398803711, + "rewards_train/margins": 5.0177106857299805, + "rewards_train/rejected": -6.046235084533691, + "step": 208 + }, + { + "epoch": 2.38, + "logps_train/chosen": -159.63363647460938, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -179.73924255371094, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.698287844657898, + "rewards_train/margins": 4.654713749885559, + "rewards_train/rejected": -6.353001594543457, + "step": 208 + }, + { + "epoch": 2.39, + "learning_rate": 5.033074385888189e-06, + "loss": 0.0496, + "step": 209 + }, + { + "epoch": 2.39, + "logps_train/chosen": -171.902099609375, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -179.21429443359375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -2.372437000274658, + "rewards_train/margins": 4.280241966247559, + "rewards_train/rejected": -6.652678966522217, + "step": 209 + }, + { + "epoch": 2.39, + "logps_train/chosen": -148.18333435058594, + "logps_train/ref_chosen": -132.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -195.27471923828125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.6556390523910522, + "rewards_train/margins": 4.759748101234436, + "rewards_train/rejected": -6.415387153625488, + "step": 209 + }, + { + "epoch": 2.39, + "logps_train/chosen": -197.36880493164062, + "logps_train/ref_chosen": -179.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -206.9953155517578, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.864400863647461, + "rewards_train/margins": 5.523899555206299, + "rewards_train/rejected": -7.38830041885376, + "step": 209 + }, + { + "epoch": 2.39, + "logps_train/chosen": -198.30517578125, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -149.0, + "logps_train/rejected": -218.72198486328125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.7881360054016113, + "rewards_train/margins": 4.146368980407715, + "rewards_train/rejected": -6.934504985809326, + "step": 209 + }, + { + "epoch": 2.4, + "learning_rate": 4.847811750231057e-06, + "loss": 0.0851, + "step": 210 + }, + { + "epoch": 2.4, + "logps_train/chosen": -196.61697387695312, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -168.0, + "logps_train/rejected": -235.59994506835938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0059356689453125, + "rewards_train/margins": 4.734723091125488, + "rewards_train/rejected": -6.740658760070801, + "step": 210 + }, + { + "epoch": 2.4, + "logps_train/chosen": -177.05670166015625, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -194.96536254882812, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.9225655794143677, + "rewards_train/margins": 4.452094912528992, + "rewards_train/rejected": -6.374660491943359, + "step": 210 + }, + { + "epoch": 2.4, + "logps_train/chosen": -209.01028442382812, + "logps_train/ref_chosen": -187.0, + "logps_train/ref_rejected": -152.0, + "logps_train/rejected": -218.34353637695312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1807167530059814, + "rewards_train/margins": 4.460277795791626, + "rewards_train/rejected": -6.640994548797607, + "step": 210 + }, + { + "epoch": 2.4, + "logps_train/chosen": -218.32777404785156, + "logps_train/ref_chosen": -199.0, + "logps_train/ref_rejected": -148.0, + "logps_train/rejected": -211.5435333251953, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.9003549814224243, + "rewards_train/margins": 4.402435660362244, + "rewards_train/rejected": -6.302790641784668, + "step": 210 + }, + { + "epoch": 2.41, + "learning_rate": 4.665656358402395e-06, + "loss": 0.0516, + "step": 211 + }, + { + "epoch": 2.41, + "logps_train/chosen": -173.509521484375, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -179.2303009033203, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.8173097372055054, + "rewards_train/margins": 4.784577965736389, + "rewards_train/rejected": -6.6018877029418945, + "step": 211 + }, + { + "epoch": 2.41, + "logps_train/chosen": -172.63043212890625, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -187.04351806640625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2126522064208984, + "rewards_train/margins": 4.117969512939453, + "rewards_train/rejected": -6.330621719360352, + "step": 211 + }, + { + "epoch": 2.41, + "logps_train/chosen": -221.95367431640625, + "logps_train/ref_chosen": -201.0, + "logps_train/ref_rejected": -170.0, + "logps_train/rejected": -236.30203247070312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0931222438812256, + "rewards_train/margins": 4.478243589401245, + "rewards_train/rejected": -6.571365833282471, + "step": 211 + }, + { + "epoch": 2.41, + "logps_train/chosen": -185.05625915527344, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -126.5, + "logps_train/rejected": -186.24945068359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3360211849212646, + "rewards_train/margins": 4.64351487159729, + "rewards_train/rejected": -5.979536056518555, + "step": 211 + }, + { + "epoch": 2.42, + "learning_rate": 4.486636296742506e-06, + "loss": 0.0509, + "step": 212 + }, + { + "epoch": 2.42, + "logps_train/chosen": -193.8851318359375, + "logps_train/ref_chosen": -171.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -187.48406982421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2984743118286133, + "rewards_train/margins": 4.142901420593262, + "rewards_train/rejected": -6.441375732421875, + "step": 212 + }, + { + "epoch": 2.42, + "logps_train/chosen": -180.93228149414062, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -183.302978515625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5795819759368896, + "rewards_train/margins": 4.18977952003479, + "rewards_train/rejected": -5.76936149597168, + "step": 212 + }, + { + "epoch": 2.42, + "logps_train/chosen": -163.54148864746094, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -170.50917053222656, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.1980700492858887, + "rewards_train/margins": 3.664858818054199, + "rewards_train/rejected": -5.862928867340088, + "step": 212 + }, + { + "epoch": 2.42, + "logps_train/chosen": -172.4043731689453, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -119.0, + "logps_train/rejected": -184.43423461914062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.632063388824463, + "rewards_train/margins": 4.868025302886963, + "rewards_train/rejected": -6.500088691711426, + "step": 212 + }, + { + "epoch": 2.43, + "learning_rate": 4.3107791681585655e-06, + "loss": 0.0701, + "step": 213 + }, + { + "epoch": 2.43, + "logps_train/chosen": -182.57791137695312, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -188.65792846679688, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.9670684337615967, + "rewards_train/margins": 4.3333799839019775, + "rewards_train/rejected": -6.300448417663574, + "step": 213 + }, + { + "epoch": 2.43, + "logps_train/chosen": -178.8064422607422, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -121.5, + "logps_train/rejected": -188.4781494140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.4776663780212402, + "rewards_train/margins": 4.212725639343262, + "rewards_train/rejected": -6.690392017364502, + "step": 213 + }, + { + "epoch": 2.43, + "logps_train/chosen": -180.3985595703125, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -127.0, + "logps_train/rejected": -190.71658325195312, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.1126112937927246, + "rewards_train/margins": 4.2702765464782715, + "rewards_train/rejected": -6.382887840270996, + "step": 213 + }, + { + "epoch": 2.43, + "logps_train/chosen": -214.46804809570312, + "logps_train/ref_chosen": -188.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -214.39895629882812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.7165300846099854, + "rewards_train/margins": 4.324148416519165, + "rewards_train/rejected": -7.04067850112915, + "step": 213 + }, + { + "epoch": 2.45, + "learning_rate": 4.138112087868576e-06, + "loss": 0.0602, + "step": 214 + }, + { + "epoch": 2.45, + "logps_train/chosen": -184.88540649414062, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -208.5177001953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1092917919158936, + "rewards_train/margins": 5.026073694229126, + "rewards_train/rejected": -7.1353654861450195, + "step": 214 + }, + { + "epoch": 2.45, + "logps_train/chosen": -168.20083618164062, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -153.44461059570312, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.237173557281494, + "rewards_train/margins": 3.7916626930236816, + "rewards_train/rejected": -6.028836250305176, + "step": 214 + }, + { + "epoch": 2.45, + "logps_train/chosen": -139.26658630371094, + "logps_train/ref_chosen": -117.5, + "logps_train/ref_rejected": -106.5, + "logps_train/rejected": -174.18048095703125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.160618305206299, + "rewards_train/margins": 4.620589733123779, + "rewards_train/rejected": -6.781208038330078, + "step": 214 + }, + { + "epoch": 2.45, + "logps_train/chosen": -167.5859375, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -110.0, + "logps_train/rejected": -176.97976684570312, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.373340606689453, + "rewards_train/margins": 4.333230018615723, + "rewards_train/rejected": -6.706570625305176, + "step": 214 + }, + { + "epoch": 2.46, + "learning_rate": 3.968661679220468e-06, + "loss": 0.0675, + "step": 215 + }, + { + "epoch": 2.46, + "logps_train/chosen": -178.07106018066406, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -178.66525268554688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.273902416229248, + "rewards_train/margins": 4.306295394897461, + "rewards_train/rejected": -6.580197811126709, + "step": 215 + }, + { + "epoch": 2.46, + "logps_train/chosen": -186.64599609375, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -190.8783721923828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4751794338226318, + "rewards_train/margins": 4.996055841445923, + "rewards_train/rejected": -6.471235275268555, + "step": 215 + }, + { + "epoch": 2.46, + "logps_train/chosen": -155.9014129638672, + "logps_train/ref_chosen": -141.0, + "logps_train/ref_rejected": -111.5, + "logps_train/rejected": -170.11361694335938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.4900078773498535, + "rewards_train/margins": 4.389421463012695, + "rewards_train/rejected": -5.879429340362549, + "step": 215 + }, + { + "epoch": 2.46, + "logps_train/chosen": -206.13455200195312, + "logps_train/ref_chosen": -191.0, + "logps_train/ref_rejected": -148.0, + "logps_train/rejected": -218.8090057373047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4799585342407227, + "rewards_train/margins": 5.572231292724609, + "rewards_train/rejected": -7.052189826965332, + "step": 215 + }, + { + "epoch": 2.47, + "learning_rate": 3.8024540695871274e-06, + "loss": 0.0435, + "step": 216 + }, + { + "epoch": 2.47, + "logps_train/chosen": -211.99249267578125, + "logps_train/ref_chosen": -196.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -193.9930419921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6373107433319092, + "rewards_train/margins": 4.699286699295044, + "rewards_train/rejected": -6.336597442626953, + "step": 216 + }, + { + "epoch": 2.47, + "logps_train/chosen": -162.2291259765625, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -182.76242065429688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9232053756713867, + "rewards_train/margins": 4.727743148803711, + "rewards_train/rejected": -6.650948524475098, + "step": 216 + }, + { + "epoch": 2.47, + "logps_train/chosen": -186.01992797851562, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -187.78819274902344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7814736366271973, + "rewards_train/margins": 4.773713111877441, + "rewards_train/rejected": -6.555186748504639, + "step": 216 + }, + { + "epoch": 2.47, + "logps_train/chosen": -179.65704345703125, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -193.5622100830078, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.4718570709228516, + "rewards_train/margins": 3.95780086517334, + "rewards_train/rejected": -6.429657936096191, + "step": 216 + }, + { + "epoch": 2.48, + "learning_rate": 3.6395148863377858e-06, + "loss": 0.0425, + "step": 217 + }, + { + "epoch": 2.48, + "logps_train/chosen": -185.7981719970703, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -194.738037109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9814281463623047, + "rewards_train/margins": 4.51200532913208, + "rewards_train/rejected": -6.493433475494385, + "step": 217 + }, + { + "epoch": 2.48, + "logps_train/chosen": -174.4874267578125, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -197.3990020751953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0167109966278076, + "rewards_train/margins": 4.54799485206604, + "rewards_train/rejected": -6.564705848693848, + "step": 217 + }, + { + "epoch": 2.48, + "logps_train/chosen": -186.6697998046875, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -150.0, + "logps_train/rejected": -213.67283630371094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.491588830947876, + "rewards_train/margins": 4.921689748764038, + "rewards_train/rejected": -6.413278579711914, + "step": 217 + }, + { + "epoch": 2.48, + "logps_train/chosen": -185.66867065429688, + "logps_train/ref_chosen": -157.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -215.50732421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.834641695022583, + "rewards_train/margins": 4.791409254074097, + "rewards_train/rejected": -7.62605094909668, + "step": 217 + }, + { + "epoch": 2.49, + "learning_rate": 3.4798692528866057e-06, + "loss": 0.0431, + "step": 218 + }, + { + "epoch": 2.49, + "logps_train/chosen": -158.78350830078125, + "logps_train/ref_chosen": -134.0, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -162.4704132080078, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.4692697525024414, + "rewards_train/margins": 4.1036505699157715, + "rewards_train/rejected": -6.572920322418213, + "step": 218 + }, + { + "epoch": 2.49, + "logps_train/chosen": -181.54193115234375, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -148.0, + "logps_train/rejected": -218.23255920410156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7103451490402222, + "rewards_train/margins": 5.347406983375549, + "rewards_train/rejected": -7.0577521324157715, + "step": 218 + }, + { + "epoch": 2.49, + "logps_train/chosen": -175.21749877929688, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -192.23492431640625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.2478251457214355, + "rewards_train/margins": 4.507697582244873, + "rewards_train/rejected": -6.755522727966309, + "step": 218 + }, + { + "epoch": 2.49, + "logps_train/chosen": -182.02206420898438, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -197.85394287109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8292454481124878, + "rewards_train/margins": 4.512984395027161, + "rewards_train/rejected": -6.342229843139648, + "step": 218 + }, + { + "epoch": 2.5, + "learning_rate": 3.3235417848188983e-06, + "loss": 0.0539, + "step": 219 + }, + { + "epoch": 2.5, + "logps_train/chosen": -184.61679077148438, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -177.779296875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.90328049659729, + "rewards_train/margins": 4.3316810131073, + "rewards_train/rejected": -6.23496150970459, + "step": 219 + }, + { + "epoch": 2.5, + "logps_train/chosen": -141.23770141601562, + "logps_train/ref_chosen": -116.0, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -154.41409301757812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.4947173595428467, + "rewards_train/margins": 4.093469858169556, + "rewards_train/rejected": -6.588187217712402, + "step": 219 + }, + { + "epoch": 2.5, + "logps_train/chosen": -149.37660217285156, + "logps_train/ref_chosen": -124.5, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -173.98623657226562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.497622013092041, + "rewards_train/margins": 4.532507419586182, + "rewards_train/rejected": -7.030129432678223, + "step": 219 + }, + { + "epoch": 2.5, + "logps_train/chosen": -192.66302490234375, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -148.0, + "logps_train/rejected": -212.57070922851562, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.0915956497192383, + "rewards_train/margins": 4.3946027755737305, + "rewards_train/rejected": -6.486198425292969, + "step": 219 + }, + { + "epoch": 2.51, + "learning_rate": 3.170556586095699e-06, + "loss": 0.0649, + "step": 220 + }, + { + "epoch": 2.51, + "logps_train/chosen": -189.02066040039062, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -177.7373046875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.7520666122436523, + "rewards_train/margins": 4.081625461578369, + "rewards_train/rejected": -6.8336920738220215, + "step": 220 + }, + { + "epoch": 2.51, + "logps_train/chosen": -176.2943878173828, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -187.67111206054688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.6185994148254395, + "rewards_train/margins": 4.177418231964111, + "rewards_train/rejected": -6.796017646789551, + "step": 220 + }, + { + "epoch": 2.51, + "logps_train/chosen": -151.6334686279297, + "logps_train/ref_chosen": -131.0, + "logps_train/ref_rejected": -95.0, + "logps_train/rejected": -156.61749267578125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.107999324798584, + "rewards_train/margins": 4.088809013366699, + "rewards_train/rejected": -6.196808338165283, + "step": 220 + }, + { + "epoch": 2.51, + "logps_train/chosen": -145.2259521484375, + "logps_train/ref_chosen": -127.0, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -161.01107788085938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.803601622581482, + "rewards_train/margins": 4.087031006813049, + "rewards_train/rejected": -5.890632629394531, + "step": 220 + }, + { + "epoch": 2.53, + "learning_rate": 3.0209372453372077e-06, + "loss": 0.0763, + "step": 221 + }, + { + "epoch": 2.53, + "logps_train/chosen": -177.82476806640625, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -166.97608947753906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1691970825195312, + "rewards_train/margins": 4.20849084854126, + "rewards_train/rejected": -6.377687931060791, + "step": 221 + }, + { + "epoch": 2.53, + "logps_train/chosen": -163.2164306640625, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -205.69760131835938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.090000867843628, + "rewards_train/margins": 4.568700075149536, + "rewards_train/rejected": -6.658700942993164, + "step": 221 + }, + { + "epoch": 2.53, + "logps_train/chosen": -165.90538024902344, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -115.0, + "logps_train/rejected": -179.6652069091797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1149516105651855, + "rewards_train/margins": 4.325690269470215, + "rewards_train/rejected": -6.4406418800354, + "step": 221 + }, + { + "epoch": 2.53, + "logps_train/chosen": -191.64077758789062, + "logps_train/ref_chosen": -173.0, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -155.56088256835938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.883876919746399, + "rewards_train/margins": 4.743695616722107, + "rewards_train/rejected": -6.627572536468506, + "step": 221 + }, + { + "epoch": 2.54, + "learning_rate": 2.8747068321856556e-06, + "loss": 0.0509, + "step": 222 + }, + { + "epoch": 2.54, + "logps_train/chosen": -175.29031372070312, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -152.0, + "logps_train/rejected": -223.1751708984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.350759983062744, + "rewards_train/margins": 4.803475379943848, + "rewards_train/rejected": -7.154235363006592, + "step": 222 + }, + { + "epoch": 2.54, + "logps_train/chosen": -191.7686767578125, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -201.64723205566406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.020545721054077, + "rewards_train/margins": 4.550428628921509, + "rewards_train/rejected": -6.570974349975586, + "step": 222 + }, + { + "epoch": 2.54, + "logps_train/chosen": -204.0155029296875, + "logps_train/ref_chosen": -178.0, + "logps_train/ref_rejected": -144.0, + "logps_train/rejected": -215.03477478027344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.5512094497680664, + "rewards_train/margins": 4.560812950134277, + "rewards_train/rejected": -7.112022399902344, + "step": 222 + }, + { + "epoch": 2.54, + "logps_train/chosen": -166.68942260742188, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -186.78805541992188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.4038538932800293, + "rewards_train/margins": 4.059913635253906, + "rewards_train/rejected": -6.4637675285339355, + "step": 222 + }, + { + "epoch": 2.55, + "learning_rate": 2.731887893748242e-06, + "loss": 0.0351, + "step": 223 + }, + { + "epoch": 2.55, + "logps_train/chosen": -181.3331298828125, + "logps_train/ref_chosen": -157.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -190.67776489257812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.4753060340881348, + "rewards_train/margins": 3.9983296394348145, + "rewards_train/rejected": -6.473635673522949, + "step": 223 + }, + { + "epoch": 2.55, + "logps_train/chosen": -164.18081665039062, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -196.8584747314453, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.398452043533325, + "rewards_train/margins": 4.512004613876343, + "rewards_train/rejected": -6.910456657409668, + "step": 223 + }, + { + "epoch": 2.55, + "logps_train/chosen": -180.45425415039062, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -187.70980834960938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.8418128490448, + "rewards_train/margins": 4.628679037094116, + "rewards_train/rejected": -7.470491886138916, + "step": 223 + }, + { + "epoch": 2.55, + "logps_train/chosen": -182.337890625, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -192.41957092285156, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.7076172828674316, + "rewards_train/margins": 4.944886684417725, + "rewards_train/rejected": -6.652503967285156, + "step": 223 + }, + { + "epoch": 2.56, + "learning_rate": 2.5925024511206207e-06, + "loss": 0.0606, + "step": 224 + }, + { + "epoch": 2.56, + "logps_train/chosen": -193.09463500976562, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -207.1976776123047, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.4825100898742676, + "rewards_train/margins": 4.484133243560791, + "rewards_train/rejected": -6.966643333435059, + "step": 224 + }, + { + "epoch": 2.56, + "logps_train/chosen": -160.484619140625, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -107.5, + "logps_train/rejected": -165.8876190185547, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.7858647108078003, + "rewards_train/margins": 4.05990469455719, + "rewards_train/rejected": -5.84576940536499, + "step": 224 + }, + { + "epoch": 2.56, + "logps_train/chosen": -203.03643798828125, + "logps_train/ref_chosen": -188.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -178.2891082763672, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.5149726867675781, + "rewards_train/margins": 4.285617828369141, + "rewards_train/rejected": -5.800590515136719, + "step": 224 + }, + { + "epoch": 2.56, + "logps_train/chosen": -177.3469696044922, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -200.88308715820312, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.568193197250366, + "rewards_train/margins": 4.182811975479126, + "rewards_train/rejected": -6.751005172729492, + "step": 224 + }, + { + "epoch": 2.57, + "learning_rate": 2.45657199599148e-06, + "loss": 0.0759, + "step": 225 + }, + { + "epoch": 2.57, + "logps_train/chosen": -173.48208618164062, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -146.0, + "logps_train/rejected": -207.01780700683594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8036763668060303, + "rewards_train/margins": 4.3025963306427, + "rewards_train/rejected": -6.1062726974487305, + "step": 225 + }, + { + "epoch": 2.57, + "logps_train/chosen": -188.08656311035156, + "logps_train/ref_chosen": -175.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -201.78933715820312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3506485223770142, + "rewards_train/margins": 4.938831686973572, + "rewards_train/rejected": -6.289480209350586, + "step": 225 + }, + { + "epoch": 2.57, + "logps_train/chosen": -116.22421264648438, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -100.0, + "logps_train/rejected": -164.86378479003906, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.71127986907959, + "rewards_train/margins": 3.7973580360412598, + "rewards_train/rejected": -6.50863790512085, + "step": 225 + }, + { + "epoch": 2.57, + "logps_train/chosen": -160.8203125, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -185.1500244140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3180701732635498, + "rewards_train/margins": 4.57881760597229, + "rewards_train/rejected": -5.89688777923584, + "step": 225 + }, + { + "epoch": 2.58, + "learning_rate": 2.324117487328789e-06, + "loss": 0.0696, + "step": 226 + }, + { + "epoch": 2.58, + "logps_train/chosen": -172.0726776123047, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -118.0, + "logps_train/rejected": -184.76156616210938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2580490112304688, + "rewards_train/margins": 4.4107346534729, + "rewards_train/rejected": -6.668783664703369, + "step": 226 + }, + { + "epoch": 2.58, + "logps_train/chosen": -170.86233520507812, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -213.73233032226562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.606351375579834, + "rewards_train/margins": 5.066394329071045, + "rewards_train/rejected": -7.672745704650879, + "step": 226 + }, + { + "epoch": 2.58, + "logps_train/chosen": -155.60812377929688, + "logps_train/ref_chosen": -135.0, + "logps_train/ref_rejected": -110.5, + "logps_train/rejected": -177.47715759277344, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.057589054107666, + "rewards_train/margins": 4.667080402374268, + "rewards_train/rejected": -6.724669456481934, + "step": 226 + }, + { + "epoch": 2.58, + "logps_train/chosen": -188.93655395507812, + "logps_train/ref_chosen": -173.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -175.76377868652344, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.6039090156555176, + "rewards_train/margins": 3.646785259246826, + "rewards_train/rejected": -5.250694274902344, + "step": 226 + }, + { + "epoch": 2.59, + "learning_rate": 2.1951593481481237e-06, + "loss": 0.0553, + "step": 227 + }, + { + "epoch": 2.59, + "logps_train/chosen": -165.5153350830078, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -106.5, + "logps_train/rejected": -174.52703857421875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.7658891677856445, + "rewards_train/margins": 5.043651580810547, + "rewards_train/rejected": -6.809540748596191, + "step": 227 + }, + { + "epoch": 2.59, + "logps_train/chosen": -160.86737060546875, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -181.62149047851562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.4635443687438965, + "rewards_train/margins": 4.161396026611328, + "rewards_train/rejected": -6.624940395355225, + "step": 227 + }, + { + "epoch": 2.59, + "logps_train/chosen": -167.77220153808594, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -166.71109008789062, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.4367904663085938, + "rewards_train/margins": 4.332364082336426, + "rewards_train/rejected": -6.7691545486450195, + "step": 227 + }, + { + "epoch": 2.59, + "logps_train/chosen": -157.61546325683594, + "logps_train/ref_chosen": -141.0, + "logps_train/ref_rejected": -115.0, + "logps_train/rejected": -174.14193725585938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6726795434951782, + "rewards_train/margins": 4.217974066734314, + "rewards_train/rejected": -5.890653610229492, + "step": 227 + }, + { + "epoch": 2.61, + "learning_rate": 2.0697174623636794e-06, + "loss": 0.0618, + "step": 228 + }, + { + "epoch": 2.61, + "logps_train/chosen": -177.1771697998047, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -124.0, + "logps_train/rejected": -185.34759521484375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.171818733215332, + "rewards_train/margins": 3.963038444519043, + "rewards_train/rejected": -6.134857177734375, + "step": 228 + }, + { + "epoch": 2.61, + "logps_train/chosen": -184.7438507080078, + "logps_train/ref_chosen": -165.0, + "logps_train/ref_rejected": -120.5, + "logps_train/rejected": -185.59344482421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.953291416168213, + "rewards_train/margins": 4.560252666473389, + "rewards_train/rejected": -6.513544082641602, + "step": 228 + }, + { + "epoch": 2.61, + "logps_train/chosen": -173.85379028320312, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -112.5, + "logps_train/rejected": -178.25070190429688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.407742500305176, + "rewards_train/margins": 4.145502090454102, + "rewards_train/rejected": -6.553244590759277, + "step": 228 + }, + { + "epoch": 2.61, + "logps_train/chosen": -158.80514526367188, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -188.6862335205078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4136208295822144, + "rewards_train/margins": 4.584104657173157, + "rewards_train/rejected": -5.997725486755371, + "step": 228 + }, + { + "epoch": 2.62, + "learning_rate": 1.947811171722397e-06, + "loss": 0.0588, + "step": 229 + }, + { + "epoch": 2.62, + "logps_train/chosen": -171.05941772460938, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -162.3818817138672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7870943546295166, + "rewards_train/margins": 4.338154077529907, + "rewards_train/rejected": -6.125248432159424, + "step": 229 + }, + { + "epoch": 2.62, + "logps_train/chosen": -142.09471130371094, + "logps_train/ref_chosen": -114.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -192.31854248046875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.8323230743408203, + "rewards_train/margins": 4.283173084259033, + "rewards_train/rejected": -7.1154961585998535, + "step": 229 + }, + { + "epoch": 2.62, + "logps_train/chosen": -179.9519500732422, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -205.62069702148438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.694608449935913, + "rewards_train/margins": 4.932012319564819, + "rewards_train/rejected": -6.626620769500732, + "step": 229 + }, + { + "epoch": 2.62, + "logps_train/chosen": -160.87034606933594, + "logps_train/ref_chosen": -139.0, + "logps_train/ref_rejected": -112.5, + "logps_train/rejected": -182.16702270507812, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.20851993560791, + "rewards_train/margins": 4.76931619644165, + "rewards_train/rejected": -6.9778361320495605, + "step": 229 + }, + { + "epoch": 2.63, + "learning_rate": 1.8294592728216765e-06, + "loss": 0.0485, + "step": 230 + }, + { + "epoch": 2.63, + "logps_train/chosen": -157.534423828125, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -108.5, + "logps_train/rejected": -174.2003173828125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9986941814422607, + "rewards_train/margins": 4.557275056838989, + "rewards_train/rejected": -6.55596923828125, + "step": 230 + }, + { + "epoch": 2.63, + "logps_train/chosen": -168.04296875, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -107.5, + "logps_train/rejected": -171.24734497070312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8662118911743164, + "rewards_train/margins": 4.531960487365723, + "rewards_train/rejected": -6.398172378540039, + "step": 230 + }, + { + "epoch": 2.63, + "logps_train/chosen": -185.285400390625, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -170.35165405273438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4988040924072266, + "rewards_train/margins": 4.609604835510254, + "rewards_train/rejected": -6.1084089279174805, + "step": 230 + }, + { + "epoch": 2.63, + "logps_train/chosen": -173.4306640625, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -119.5, + "logps_train/rejected": -188.17437744140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2563962936401367, + "rewards_train/margins": 4.628278732299805, + "rewards_train/rejected": -6.884675025939941, + "step": 230 + }, + { + "epoch": 2.64, + "learning_rate": 1.7146800142111535e-06, + "loss": 0.0496, + "step": 231 + }, + { + "epoch": 2.64, + "logps_train/chosen": -171.4075469970703, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -121.5, + "logps_train/rejected": -183.84478759765625, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.4142889976501465, + "rewards_train/margins": 3.8182373046875, + "rewards_train/rejected": -6.2325263023376465, + "step": 231 + }, + { + "epoch": 2.64, + "logps_train/chosen": -164.04263305664062, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -173.68544006347656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1521155834198, + "rewards_train/margins": 4.001340627670288, + "rewards_train/rejected": -6.153456211090088, + "step": 231 + }, + { + "epoch": 2.64, + "logps_train/chosen": -179.4583282470703, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -120.5, + "logps_train/rejected": -180.95101928710938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6806962490081787, + "rewards_train/margins": 4.379248857498169, + "rewards_train/rejected": -6.059945106506348, + "step": 231 + }, + { + "epoch": 2.64, + "logps_train/chosen": -181.32591247558594, + "logps_train/ref_chosen": -160.0, + "logps_train/ref_rejected": -135.0, + "logps_train/rejected": -201.79678344726562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1640372276306152, + "rewards_train/margins": 4.5218915939331055, + "rewards_train/rejected": -6.685928821563721, + "step": 231 + }, + { + "epoch": 2.65, + "learning_rate": 1.6034910935789627e-06, + "loss": 0.0576, + "step": 232 + }, + { + "epoch": 2.65, + "logps_train/chosen": -177.09873962402344, + "logps_train/ref_chosen": -157.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -197.26693725585938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.002061605453491, + "rewards_train/margins": 4.639084577560425, + "rewards_train/rejected": -6.641146183013916, + "step": 232 + }, + { + "epoch": 2.65, + "logps_train/chosen": -167.2610321044922, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -206.99166870117188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.846024990081787, + "rewards_train/margins": 5.2769694328308105, + "rewards_train/rejected": -7.122994422912598, + "step": 232 + }, + { + "epoch": 2.65, + "logps_train/chosen": -135.89576721191406, + "logps_train/ref_chosen": -107.5, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -161.03927612304688, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -2.8300557136535645, + "rewards_train/margins": 3.8118605613708496, + "rewards_train/rejected": -6.641916275024414, + "step": 232 + }, + { + "epoch": 2.65, + "logps_train/chosen": -162.01553344726562, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -202.14938354492188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1653234958648682, + "rewards_train/margins": 5.1688525676727295, + "rewards_train/rejected": -6.334176063537598, + "step": 232 + }, + { + "epoch": 2.66, + "learning_rate": 1.4959096550229645e-06, + "loss": 0.0669, + "step": 233 + }, + { + "epoch": 2.66, + "logps_train/chosen": -180.497314453125, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -137.0, + "logps_train/rejected": -197.39334106445312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7840571403503418, + "rewards_train/margins": 4.3093791007995605, + "rewards_train/rejected": -6.093436241149902, + "step": 233 + }, + { + "epoch": 2.66, + "logps_train/chosen": -181.78704833984375, + "logps_train/ref_chosen": -161.0, + "logps_train/ref_rejected": -119.5, + "logps_train/rejected": -179.69802856445312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.05507230758667, + "rewards_train/margins": 3.9480795860290527, + "rewards_train/rejected": -6.003151893615723, + "step": 233 + }, + { + "epoch": 2.66, + "logps_train/chosen": -177.9119110107422, + "logps_train/ref_chosen": -152.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -173.1656951904297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.5379676818847656, + "rewards_train/margins": 4.04774284362793, + "rewards_train/rejected": -6.585710525512695, + "step": 233 + }, + { + "epoch": 2.66, + "logps_train/chosen": -191.3437042236328, + "logps_train/ref_chosen": -174.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -195.78623962402344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7578561305999756, + "rewards_train/margins": 4.763178110122681, + "rewards_train/rejected": -6.521034240722656, + "step": 233 + }, + { + "epoch": 2.67, + "learning_rate": 1.391952286407311e-06, + "loss": 0.0564, + "step": 234 + }, + { + "epoch": 2.67, + "logps_train/chosen": -164.625244140625, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -191.79592895507812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1351804733276367, + "rewards_train/margins": 4.474490165710449, + "rewards_train/rejected": -6.609670639038086, + "step": 234 + }, + { + "epoch": 2.67, + "logps_train/chosen": -192.8685302734375, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -180.1429443359375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.744861364364624, + "rewards_train/margins": 4.417285203933716, + "rewards_train/rejected": -6.16214656829834, + "step": 234 + }, + { + "epoch": 2.67, + "logps_train/chosen": -172.70458984375, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -175.58883666992188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7502193450927734, + "rewards_train/margins": 4.853781700134277, + "rewards_train/rejected": -6.604001045227051, + "step": 234 + }, + { + "epoch": 2.67, + "logps_train/chosen": -179.72857666015625, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -183.2381134033203, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.149763584136963, + "rewards_train/margins": 4.196936130523682, + "rewards_train/rejected": -6.3466997146606445, + "step": 234 + }, + { + "epoch": 2.69, + "learning_rate": 1.2916350168047681e-06, + "loss": 0.0664, + "step": 235 + }, + { + "epoch": 2.69, + "logps_train/chosen": -172.16232299804688, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -124.0, + "logps_train/rejected": -190.31521606445312, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.4478726387023926, + "rewards_train/margins": 4.224077224731445, + "rewards_train/rejected": -6.671949863433838, + "step": 235 + }, + { + "epoch": 2.69, + "logps_train/chosen": -183.98605346679688, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -184.5833740234375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.0531957149505615, + "rewards_train/margins": 4.704164743423462, + "rewards_train/rejected": -6.757360458374023, + "step": 235 + }, + { + "epoch": 2.69, + "logps_train/chosen": -150.15863037109375, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -99.5, + "logps_train/rejected": -157.22695922851562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0300607681274414, + "rewards_train/margins": 3.7460880279541016, + "rewards_train/rejected": -5.776148796081543, + "step": 235 + }, + { + "epoch": 2.69, + "logps_train/chosen": -198.83621215820312, + "logps_train/ref_chosen": -183.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -208.21908569335938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5560832023620605, + "rewards_train/margins": 5.394535064697266, + "rewards_train/rejected": -6.950618267059326, + "step": 235 + }, + { + "epoch": 2.7, + "learning_rate": 1.1949733140252466e-06, + "loss": 0.0635, + "step": 236 + }, + { + "epoch": 2.7, + "logps_train/chosen": -161.04502868652344, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -163.6453857421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8402457237243652, + "rewards_train/margins": 4.072827339172363, + "rewards_train/rejected": -5.9130730628967285, + "step": 236 + }, + { + "epoch": 2.7, + "logps_train/chosen": -184.62203979492188, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -143.0, + "logps_train/rejected": -216.21878051757812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.894137382507324, + "rewards_train/margins": 4.415827751159668, + "rewards_train/rejected": -7.309965133666992, + "step": 236 + }, + { + "epoch": 2.7, + "logps_train/chosen": -194.06765747070312, + "logps_train/ref_chosen": -182.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -192.44342041015625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2655562162399292, + "rewards_train/margins": 4.643270134925842, + "rewards_train/rejected": -5.9088263511657715, + "step": 236 + }, + { + "epoch": 2.7, + "logps_train/chosen": -214.92184448242188, + "logps_train/ref_chosen": -190.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -202.62722778320312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.510640859603882, + "rewards_train/margins": 3.877863645553589, + "rewards_train/rejected": -6.388504505157471, + "step": 236 + }, + { + "epoch": 2.71, + "learning_rate": 1.1019820822307985e-06, + "loss": 0.0649, + "step": 237 + }, + { + "epoch": 2.71, + "logps_train/chosen": -126.3829574584961, + "logps_train/ref_chosen": -105.5, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -177.629638671875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.102309465408325, + "rewards_train/margins": 4.272714376449585, + "rewards_train/rejected": -6.37502384185791, + "step": 237 + }, + { + "epoch": 2.71, + "logps_train/chosen": -138.21041870117188, + "logps_train/ref_chosen": -118.0, + "logps_train/ref_rejected": -110.0, + "logps_train/rejected": -169.5449981689453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0431365966796875, + "rewards_train/margins": 3.923570156097412, + "rewards_train/rejected": -5.9667067527771, + "step": 237 + }, + { + "epoch": 2.71, + "logps_train/chosen": -161.8151092529297, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -169.7493438720703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1523118019104004, + "rewards_train/margins": 4.343618869781494, + "rewards_train/rejected": -6.4959306716918945, + "step": 237 + }, + { + "epoch": 2.71, + "logps_train/chosen": -216.87832641601562, + "logps_train/ref_chosen": -204.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -197.9646453857422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3093161582946777, + "rewards_train/margins": 5.389882564544678, + "rewards_train/rejected": -6.6991987228393555, + "step": 237 + }, + { + "epoch": 2.72, + "learning_rate": 1.0126756596375686e-06, + "loss": 0.0598, + "step": 238 + }, + { + "epoch": 2.72, + "logps_train/chosen": -166.60894775390625, + "logps_train/ref_chosen": -147.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -183.59124755859375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.9279100894927979, + "rewards_train/margins": 4.252699136734009, + "rewards_train/rejected": -6.180609226226807, + "step": 238 + }, + { + "epoch": 2.72, + "logps_train/chosen": -139.39071655273438, + "logps_train/ref_chosen": -116.5, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -172.66722106933594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.307724952697754, + "rewards_train/margins": 4.875207901000977, + "rewards_train/rejected": -7.1829328536987305, + "step": 238 + }, + { + "epoch": 2.72, + "logps_train/chosen": -171.77474975585938, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -163.83929443359375, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -2.127732276916504, + "rewards_train/margins": 3.6769256591796875, + "rewards_train/rejected": -5.804657936096191, + "step": 238 + }, + { + "epoch": 2.72, + "logps_train/chosen": -172.7614288330078, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -157.0, + "logps_train/rejected": -227.40777587890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8960649967193604, + "rewards_train/margins": 5.144322156906128, + "rewards_train/rejected": -7.040387153625488, + "step": 238 + }, + { + "epoch": 2.73, + "learning_rate": 9.270678163050217e-07, + "loss": 0.0751, + "step": 239 + }, + { + "epoch": 2.73, + "logps_train/chosen": -163.55027770996094, + "logps_train/ref_chosen": -137.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -145.41427612304688, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.624851703643799, + "rewards_train/margins": 3.6589112281799316, + "rewards_train/rejected": -6.2837629318237305, + "step": 239 + }, + { + "epoch": 2.73, + "logps_train/chosen": -203.0478515625, + "logps_train/ref_chosen": -189.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -199.60626220703125, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.3887698650360107, + "rewards_train/margins": 5.202104806900024, + "rewards_train/rejected": -6.590874671936035, + "step": 239 + }, + { + "epoch": 2.73, + "logps_train/chosen": -178.8606719970703, + "logps_train/ref_chosen": -158.0, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -203.78274536132812, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.051692008972168, + "rewards_train/margins": 5.257539749145508, + "rewards_train/rejected": -7.309231758117676, + "step": 239 + }, + { + "epoch": 2.73, + "logps_train/chosen": -162.5720977783203, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -175.08572387695312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2999839782714844, + "rewards_train/margins": 4.601556777954102, + "rewards_train/rejected": -6.901540756225586, + "step": 239 + }, + { + "epoch": 2.74, + "learning_rate": 8.451717520127273e-07, + "loss": 0.0755, + "step": 240 + }, + { + "epoch": 2.74, + "logps_train/chosen": -180.50900268554688, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -180.9222412109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7348852157592773, + "rewards_train/margins": 4.948159217834473, + "rewards_train/rejected": -6.68304443359375, + "step": 240 + }, + { + "epoch": 2.74, + "logps_train/chosen": -189.4424285888672, + "logps_train/ref_chosen": -167.0, + "logps_train/ref_rejected": -140.0, + "logps_train/rejected": -208.51254272460938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2249059677124023, + "rewards_train/margins": 4.5786919593811035, + "rewards_train/rejected": -6.803597927093506, + "step": 240 + }, + { + "epoch": 2.74, + "logps_train/chosen": -154.6390380859375, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -124.5, + "logps_train/rejected": -186.6710205078125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.6452138423919678, + "rewards_train/margins": 4.573157072067261, + "rewards_train/rejected": -6.2183709144592285, + "step": 240 + }, + { + "epoch": 2.74, + "logps_train/chosen": -167.7239990234375, + "logps_train/ref_chosen": -143.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -192.18130493164062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.4205942153930664, + "rewards_train/margins": 5.258059501647949, + "rewards_train/rejected": -7.678653717041016, + "step": 240 + }, + { + "epoch": 2.75, + "learning_rate": 7.670000942251287e-07, + "loss": 0.0483, + "step": 241 + }, + { + "epoch": 2.75, + "logps_train/chosen": -177.83676147460938, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -197.16542053222656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9257657527923584, + "rewards_train/margins": 4.465288400650024, + "rewards_train/rejected": -6.391054153442383, + "step": 241 + }, + { + "epoch": 2.75, + "logps_train/chosen": -186.260498046875, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -185.99197387695312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.358275890350342, + "rewards_train/margins": 4.20889139175415, + "rewards_train/rejected": -6.567167282104492, + "step": 241 + }, + { + "epoch": 2.75, + "logps_train/chosen": -159.35272216796875, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -167.78614807128906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.3801445960998535, + "rewards_train/margins": 4.245735168457031, + "rewards_train/rejected": -6.625879764556885, + "step": 241 + }, + { + "epoch": 2.75, + "logps_train/chosen": -162.38101196289062, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -118.0, + "logps_train/rejected": -181.56993103027344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7641749382019043, + "rewards_train/margins": 4.599165439605713, + "rewards_train/rejected": -6.363340377807617, + "step": 241 + }, + { + "epoch": 2.77, + "learning_rate": 6.92564896144493e-07, + "loss": 0.0347, + "step": 242 + }, + { + "epoch": 2.77, + "logps_train/chosen": -169.59744262695312, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -183.14720153808594, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.599441409111023, + "rewards_train/margins": 4.840278744697571, + "rewards_train/rejected": -6.439720153808594, + "step": 242 + }, + { + "epoch": 2.77, + "logps_train/chosen": -156.12826538085938, + "logps_train/ref_chosen": -133.0, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -175.92604064941406, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.272787570953369, + "rewards_train/margins": 4.405375957489014, + "rewards_train/rejected": -6.678163528442383, + "step": 242 + }, + { + "epoch": 2.77, + "logps_train/chosen": -196.60855102539062, + "logps_train/ref_chosen": -170.0, + "logps_train/ref_rejected": -127.0, + "logps_train/rejected": -193.12551879882812, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.7350728511810303, + "rewards_train/margins": 3.8505260944366455, + "rewards_train/rejected": -6.585598945617676, + "step": 242 + }, + { + "epoch": 2.77, + "logps_train/chosen": -161.61679077148438, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -195.109130859375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.331406354904175, + "rewards_train/margins": 4.148061990737915, + "rewards_train/rejected": -6.47946834564209, + "step": 242 + }, + { + "epoch": 2.78, + "learning_rate": 6.218776348524663e-07, + "loss": 0.0655, + "step": 243 + }, + { + "epoch": 2.78, + "logps_train/chosen": -182.87452697753906, + "logps_train/ref_chosen": -163.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -194.0487060546875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9816908836364746, + "rewards_train/margins": 5.13365364074707, + "rewards_train/rejected": -7.115344524383545, + "step": 243 + }, + { + "epoch": 2.78, + "logps_train/chosen": -156.08993530273438, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -178.7113037109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.6412432193756104, + "rewards_train/margins": 4.133403539657593, + "rewards_train/rejected": -6.774646759033203, + "step": 243 + }, + { + "epoch": 2.78, + "logps_train/chosen": -211.40530395507812, + "logps_train/ref_chosen": -194.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -185.21580505371094, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.7184962034225464, + "rewards_train/margins": 4.194722294807434, + "rewards_train/rejected": -5.9132184982299805, + "step": 243 + }, + { + "epoch": 2.78, + "logps_train/chosen": -179.36248779296875, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -175.9487762451172, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.673431396484375, + "rewards_train/margins": 4.250351905822754, + "rewards_train/rejected": -6.923783302307129, + "step": 243 + }, + { + "epoch": 2.79, + "learning_rate": 5.549492095404202e-07, + "loss": 0.0632, + "step": 244 + }, + { + "epoch": 2.79, + "logps_train/chosen": -172.94845581054688, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -193.4479217529297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8069548606872559, + "rewards_train/margins": 5.433931350708008, + "rewards_train/rejected": -7.240886211395264, + "step": 244 + }, + { + "epoch": 2.79, + "logps_train/chosen": -161.23968505859375, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -179.7657470703125, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.9360785484313965, + "rewards_train/margins": 3.958073139190674, + "rewards_train/rejected": -5.89415168762207, + "step": 244 + }, + { + "epoch": 2.79, + "logps_train/chosen": -160.264892578125, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -171.85986328125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.4065914154052734, + "rewards_train/margins": 4.195216178894043, + "rewards_train/rejected": -6.601807594299316, + "step": 244 + }, + { + "epoch": 2.79, + "logps_train/chosen": -235.1175079345703, + "logps_train/ref_chosen": -219.0, + "logps_train/ref_rejected": -176.0, + "logps_train/rejected": -239.72584533691406, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.5730059146881104, + "rewards_train/margins": 4.821647882461548, + "rewards_train/rejected": -6.394653797149658, + "step": 244 + }, + { + "epoch": 2.8, + "learning_rate": 4.917899398289377e-07, + "loss": 0.0628, + "step": 245 + }, + { + "epoch": 2.8, + "logps_train/chosen": -171.4566650390625, + "logps_train/ref_chosen": -150.0, + "logps_train/ref_rejected": -100.0, + "logps_train/rejected": -161.31747436523438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.139416217803955, + "rewards_train/margins": 3.9931116104125977, + "rewards_train/rejected": -6.132527828216553, + "step": 245 + }, + { + "epoch": 2.8, + "logps_train/chosen": -160.1334228515625, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -197.39840698242188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1715452671051025, + "rewards_train/margins": 4.618248701095581, + "rewards_train/rejected": -6.789793968200684, + "step": 245 + }, + { + "epoch": 2.8, + "logps_train/chosen": -176.39114379882812, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -195.78604125976562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1625027656555176, + "rewards_train/margins": 4.600754737854004, + "rewards_train/rejected": -6.7632575035095215, + "step": 245 + }, + { + "epoch": 2.8, + "logps_train/chosen": -156.68441772460938, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -100.0, + "logps_train/rejected": -160.67849731445312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8903658390045166, + "rewards_train/margins": 4.183562994003296, + "rewards_train/rejected": -6.0739288330078125, + "step": 245 + }, + { + "epoch": 2.81, + "learning_rate": 4.324095641766168e-07, + "loss": 0.0534, + "step": 246 + }, + { + "epoch": 2.81, + "logps_train/chosen": -177.7391357421875, + "logps_train/ref_chosen": -159.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -191.41738891601562, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.8221569061279297, + "rewards_train/margins": 5.090432167053223, + "rewards_train/rejected": -6.912589073181152, + "step": 246 + }, + { + "epoch": 2.81, + "logps_train/chosen": -133.89585876464844, + "logps_train/ref_chosen": -111.5, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -163.14993286132812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2561392784118652, + "rewards_train/margins": 3.9409842491149902, + "rewards_train/rejected": -6.1971235275268555, + "step": 246 + }, + { + "epoch": 2.81, + "logps_train/chosen": -184.8391876220703, + "logps_train/ref_chosen": -168.0, + "logps_train/ref_rejected": -164.0, + "logps_train/rejected": -227.5182342529297, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.6263017654418945, + "rewards_train/margins": 4.725424289703369, + "rewards_train/rejected": -6.351726055145264, + "step": 246 + }, + { + "epoch": 2.81, + "logps_train/chosen": -177.89773559570312, + "logps_train/ref_chosen": -157.0, + "logps_train/ref_rejected": -138.0, + "logps_train/rejected": -202.14389038085938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0483689308166504, + "rewards_train/margins": 4.4206223487854, + "rewards_train/rejected": -6.468991279602051, + "step": 246 + }, + { + "epoch": 2.82, + "learning_rate": 3.768172383785268e-07, + "loss": 0.0606, + "step": 247 + }, + { + "epoch": 2.82, + "logps_train/chosen": -166.12185668945312, + "logps_train/ref_chosen": -145.0, + "logps_train/ref_rejected": -100.0, + "logps_train/rejected": -160.4178466796875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.0884547233581543, + "rewards_train/margins": 3.920907974243164, + "rewards_train/rejected": -6.009362697601318, + "step": 247 + }, + { + "epoch": 2.82, + "logps_train/chosen": -172.78741455078125, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -119.0, + "logps_train/rejected": -189.86375427246094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.8326473236083984, + "rewards_train/margins": 4.249040126800537, + "rewards_train/rejected": -7.0816874504089355, + "step": 247 + }, + { + "epoch": 2.82, + "logps_train/chosen": -185.13803100585938, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -178.22286987304688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.280600070953369, + "rewards_train/margins": 4.309672832489014, + "rewards_train/rejected": -6.590272903442383, + "step": 247 + }, + { + "epoch": 2.82, + "logps_train/chosen": -153.21363830566406, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -171.15322875976562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.703980803489685, + "rewards_train/margins": 4.131362557411194, + "rewards_train/rejected": -5.835343360900879, + "step": 247 + }, + { + "epoch": 2.83, + "learning_rate": 3.2502153415447656e-07, + "loss": 0.0635, + "step": 248 + }, + { + "epoch": 2.83, + "logps_train/chosen": -163.0394744873047, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -140.48609924316406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.749880313873291, + "rewards_train/margins": 3.7217764854431152, + "rewards_train/rejected": -6.471656799316406, + "step": 248 + }, + { + "epoch": 2.83, + "logps_train/chosen": -166.8461151123047, + "logps_train/ref_chosen": -141.0, + "logps_train/ref_rejected": -129.0, + "logps_train/rejected": -198.37579345703125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.565225601196289, + "rewards_train/margins": 4.396816253662109, + "rewards_train/rejected": -6.962041854858398, + "step": 248 + }, + { + "epoch": 2.83, + "logps_train/chosen": -207.8005828857422, + "logps_train/ref_chosen": -190.0, + "logps_train/ref_rejected": -153.0, + "logps_train/rejected": -223.14645385742188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7702926397323608, + "rewards_train/margins": 5.2681804895401, + "rewards_train/rejected": -7.038473129272461, + "step": 248 + }, + { + "epoch": 2.83, + "logps_train/chosen": -154.43231201171875, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -154.94419860839844, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.8259937763214111, + "rewards_train/margins": 4.078387022018433, + "rewards_train/rejected": -5.904380798339844, + "step": 248 + }, + { + "epoch": 2.85, + "learning_rate": 2.770304378273553e-07, + "loss": 0.0647, + "step": 249 + }, + { + "epoch": 2.85, + "logps_train/chosen": -145.6243438720703, + "logps_train/ref_chosen": -127.5, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -160.12098693847656, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.8032550811767578, + "rewards_train/margins": 3.8635311126708984, + "rewards_train/rejected": -5.666786193847656, + "step": 249 + }, + { + "epoch": 2.85, + "logps_train/chosen": -179.10397338867188, + "logps_train/ref_chosen": -157.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -195.87139892578125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2270493507385254, + "rewards_train/margins": 4.725861549377441, + "rewards_train/rejected": -6.952910900115967, + "step": 249 + }, + { + "epoch": 2.85, + "logps_train/chosen": -160.42538452148438, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -105.5, + "logps_train/rejected": -172.30838012695312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.4366796016693115, + "rewards_train/margins": 4.239910364151001, + "rewards_train/rejected": -6.6765899658203125, + "step": 249 + }, + { + "epoch": 2.85, + "logps_train/chosen": -163.18991088867188, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -120.5, + "logps_train/rejected": -182.28509521484375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.6743369102478027, + "rewards_train/margins": 4.484737873077393, + "rewards_train/rejected": -6.159074783325195, + "step": 249 + }, + { + "epoch": 2.86, + "learning_rate": 2.3285134909173112e-07, + "loss": 0.0667, + "step": 250 + }, + { + "epoch": 2.86, + "logps_train/chosen": -194.73187255859375, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -118.0, + "logps_train/rejected": -188.28573608398438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9335401058197021, + "rewards_train/margins": 5.130775690078735, + "rewards_train/rejected": -7.0643157958984375, + "step": 250 + }, + { + "epoch": 2.86, + "logps_train/chosen": -167.18817138671875, + "logps_train/ref_chosen": -142.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -181.6185302734375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.518524169921875, + "rewards_train/margins": 4.051629066467285, + "rewards_train/rejected": -6.57015323638916, + "step": 250 + }, + { + "epoch": 2.86, + "logps_train/chosen": -194.84786987304688, + "logps_train/ref_chosen": -176.0, + "logps_train/ref_rejected": -162.0, + "logps_train/rejected": -225.637451171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.871652364730835, + "rewards_train/margins": 4.4969751834869385, + "rewards_train/rejected": -6.368627548217773, + "step": 250 + }, + { + "epoch": 2.86, + "logps_train/chosen": -161.91610717773438, + "logps_train/ref_chosen": -140.0, + "logps_train/ref_rejected": -124.5, + "logps_train/rejected": -195.36077880859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1696693897247314, + "rewards_train/margins": 4.929694890975952, + "rewards_train/rejected": -7.099364280700684, + "step": 250 + }, + { + "epoch": 2.87, + "learning_rate": 1.924910798728946e-07, + "loss": 0.0365, + "step": 251 + }, + { + "epoch": 2.87, + "logps_train/chosen": -176.8163299560547, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -183.77633666992188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.261936664581299, + "rewards_train/margins": 4.28908634185791, + "rewards_train/rejected": -6.551023006439209, + "step": 251 + }, + { + "epoch": 2.87, + "logps_train/chosen": -173.4698486328125, + "logps_train/ref_chosen": -157.0, + "logps_train/ref_rejected": -122.0, + "logps_train/rejected": -184.99172973632812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6786491870880127, + "rewards_train/margins": 4.63419508934021, + "rewards_train/rejected": -6.312844276428223, + "step": 251 + }, + { + "epoch": 2.87, + "logps_train/chosen": -168.40023803710938, + "logps_train/ref_chosen": -146.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -183.943603515625, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -2.20320725440979, + "rewards_train/margins": 4.175772428512573, + "rewards_train/rejected": -6.378979682922363, + "step": 251 + }, + { + "epoch": 2.87, + "logps_train/chosen": -184.357666015625, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -185.84925842285156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7965086698532104, + "rewards_train/margins": 4.701600193977356, + "rewards_train/rejected": -6.498108863830566, + "step": 251 + }, + { + "epoch": 2.88, + "learning_rate": 1.559558532765404e-07, + "loss": 0.0579, + "step": 252 + }, + { + "epoch": 2.88, + "logps_train/chosen": -133.98130798339844, + "logps_train/ref_chosen": -114.0, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -166.95147705078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0248889923095703, + "rewards_train/margins": 4.195259094238281, + "rewards_train/rejected": -6.220148086547852, + "step": 252 + }, + { + "epoch": 2.88, + "logps_train/chosen": -198.66932678222656, + "logps_train/ref_chosen": -179.0, + "logps_train/ref_rejected": -142.0, + "logps_train/rejected": -210.49432373046875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9354877471923828, + "rewards_train/margins": 4.8838677406311035, + "rewards_train/rejected": -6.819355487823486, + "step": 252 + }, + { + "epoch": 2.88, + "logps_train/chosen": -173.53561401367188, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -179.3866729736328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9231053590774536, + "rewards_train/margins": 4.311460375785828, + "rewards_train/rejected": -6.234565734863281, + "step": 252 + }, + { + "epoch": 2.88, + "logps_train/chosen": -192.16253662109375, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -188.32643127441406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.32460355758667, + "rewards_train/margins": 4.6487135887146, + "rewards_train/rejected": -6.9733171463012695, + "step": 252 + }, + { + "epoch": 2.89, + "learning_rate": 1.23251302629232e-07, + "loss": 0.0477, + "step": 253 + }, + { + "epoch": 2.89, + "logps_train/chosen": -164.695068359375, + "logps_train/ref_chosen": -149.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -183.24545288085938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.560620665550232, + "rewards_train/margins": 4.226998925209045, + "rewards_train/rejected": -5.787619590759277, + "step": 253 + }, + { + "epoch": 2.89, + "logps_train/chosen": -164.260986328125, + "logps_train/ref_chosen": -139.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -173.06460571289062, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.5073485374450684, + "rewards_train/margins": 4.087198734283447, + "rewards_train/rejected": -6.594547271728516, + "step": 253 + }, + { + "epoch": 2.89, + "logps_train/chosen": -151.60467529296875, + "logps_train/ref_chosen": -128.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -179.28170776367188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.3181824684143066, + "rewards_train/margins": 3.9910435676574707, + "rewards_train/rejected": -6.309226036071777, + "step": 253 + }, + { + "epoch": 2.89, + "logps_train/chosen": -191.03347778320312, + "logps_train/ref_chosen": -166.0, + "logps_train/ref_rejected": -124.0, + "logps_train/rejected": -191.88462829589844, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.450467109680176, + "rewards_train/margins": 4.3493733406066895, + "rewards_train/rejected": -6.799840450286865, + "step": 253 + }, + { + "epoch": 2.9, + "learning_rate": 9.438247060979955e-08, + "loss": 0.0777, + "step": 254 + }, + { + "epoch": 2.9, + "logps_train/chosen": -172.36068725585938, + "logps_train/ref_chosen": -153.0, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -175.3718719482422, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.9357753992080688, + "rewards_train/margins": 4.9021934270858765, + "rewards_train/rejected": -6.837968826293945, + "step": 254 + }, + { + "epoch": 2.9, + "logps_train/chosen": -179.45162963867188, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -117.5, + "logps_train/rejected": -178.16188049316406, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.3817834854125977, + "rewards_train/margins": 3.6969776153564453, + "rewards_train/rejected": -6.078761100769043, + "step": 254 + }, + { + "epoch": 2.9, + "logps_train/chosen": -167.01654052734375, + "logps_train/ref_chosen": -141.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -179.34791564941406, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -2.5837817192077637, + "rewards_train/margins": 4.215170383453369, + "rewards_train/rejected": -6.798952102661133, + "step": 254 + }, + { + "epoch": 2.9, + "logps_train/chosen": -170.49327087402344, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -165.78018188476562, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -2.2194879055023193, + "rewards_train/margins": 4.055551767349243, + "rewards_train/rejected": -6.2750396728515625, + "step": 254 + }, + { + "epoch": 2.91, + "learning_rate": 6.935380847182815e-08, + "loss": 0.0963, + "step": 255 + }, + { + "epoch": 2.91, + "logps_train/chosen": -184.44064331054688, + "logps_train/ref_chosen": -162.0, + "logps_train/ref_rejected": -143.0, + "logps_train/rejected": -211.0648956298828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2682833671569824, + "rewards_train/margins": 4.527073860168457, + "rewards_train/rejected": -6.7953572273254395, + "step": 255 + }, + { + "epoch": 2.91, + "logps_train/chosen": -171.735595703125, + "logps_train/ref_chosen": -148.0, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -186.7560272216797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.400291919708252, + "rewards_train/margins": 4.154388904571533, + "rewards_train/rejected": -6.554680824279785, + "step": 255 + }, + { + "epoch": 2.91, + "logps_train/chosen": -175.62689208984375, + "logps_train/ref_chosen": -151.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -202.15704345703125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.4636664390563965, + "rewards_train/margins": 4.4941277503967285, + "rewards_train/rejected": -6.957794189453125, + "step": 255 + }, + { + "epoch": 2.91, + "logps_train/chosen": -186.03994750976562, + "logps_train/ref_chosen": -169.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -196.6202392578125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.696962833404541, + "rewards_train/margins": 4.931369781494141, + "rewards_train/rejected": -6.628332614898682, + "step": 255 + }, + { + "epoch": 2.93, + "learning_rate": 4.816917535731547e-08, + "loss": 0.0631, + "step": 256 + }, + { + "epoch": 2.93, + "logps_train/chosen": -153.82058715820312, + "logps_train/ref_chosen": -135.0, + "logps_train/ref_rejected": -124.5, + "logps_train/rejected": -184.61001586914062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.905739426612854, + "rewards_train/margins": 4.107263922691345, + "rewards_train/rejected": -6.013003349304199, + "step": 256 + }, + { + "epoch": 2.93, + "logps_train/chosen": -152.47775268554688, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -147.11019897460938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.2787318229675293, + "rewards_train/margins": 3.5291028022766113, + "rewards_train/rejected": -5.807834625244141, + "step": 256 + }, + { + "epoch": 2.93, + "logps_train/chosen": -163.71652221679688, + "logps_train/ref_chosen": -138.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -201.04147338867188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.5193088054656982, + "rewards_train/margins": 4.136402368545532, + "rewards_train/rejected": -6.6557111740112305, + "step": 256 + }, + { + "epoch": 2.93, + "logps_train/chosen": -191.13845825195312, + "logps_train/ref_chosen": -172.0, + "logps_train/ref_rejected": -122.5, + "logps_train/rejected": -182.69961547851562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8818151950836182, + "rewards_train/margins": 4.146349668502808, + "rewards_train/rejected": -6.028164863586426, + "step": 256 + }, + { + "epoch": 2.94, + "learning_rate": 3.083183770162812e-08, + "loss": 0.0762, + "step": 257 + }, + { + "epoch": 2.94, + "logps_train/chosen": -165.29769897460938, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -193.4224395751953, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.089618444442749, + "rewards_train/margins": 3.9063212871551514, + "rewards_train/rejected": -5.9959397315979, + "step": 257 + }, + { + "epoch": 2.94, + "logps_train/chosen": -175.6675567626953, + "logps_train/ref_chosen": -154.0, + "logps_train/ref_rejected": -142.0, + "logps_train/rejected": -212.16360473632812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1210525035858154, + "rewards_train/margins": 4.94413685798645, + "rewards_train/rejected": -7.065189361572266, + "step": 257 + }, + { + "epoch": 2.94, + "logps_train/chosen": -146.52691650390625, + "logps_train/ref_chosen": -129.0, + "logps_train/ref_rejected": -102.5, + "logps_train/rejected": -156.18231201171875, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.743121862411499, + "rewards_train/margins": 3.6365344524383545, + "rewards_train/rejected": -5.3796563148498535, + "step": 257 + }, + { + "epoch": 2.94, + "logps_train/chosen": -187.83929443359375, + "logps_train/ref_chosen": -167.0, + "logps_train/ref_rejected": -122.5, + "logps_train/rejected": -187.14642333984375, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.0775322914123535, + "rewards_train/margins": 4.39111328125, + "rewards_train/rejected": -6.4686455726623535, + "step": 257 + }, + { + "epoch": 2.95, + "learning_rate": 1.73444687298685e-08, + "loss": 0.077, + "step": 258 + }, + { + "epoch": 2.95, + "logps_train/chosen": -177.5012969970703, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -168.67471313476562, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.313605308532715, + "rewards_train/margins": 4.108870983123779, + "rewards_train/rejected": -6.422476291656494, + "step": 258 + }, + { + "epoch": 2.95, + "logps_train/chosen": -150.55606079101562, + "logps_train/ref_chosen": -134.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -195.2097930908203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6292637586593628, + "rewards_train/margins": 4.666714787483215, + "rewards_train/rejected": -6.295978546142578, + "step": 258 + }, + { + "epoch": 2.95, + "logps_train/chosen": -166.3050537109375, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -125.0, + "logps_train/rejected": -192.82681274414062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.15824031829834, + "rewards_train/margins": 4.585964679718018, + "rewards_train/rejected": -6.744204998016357, + "step": 258 + }, + { + "epoch": 2.95, + "logps_train/chosen": -156.59942626953125, + "logps_train/ref_chosen": -133.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -170.30538940429688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.3700997829437256, + "rewards_train/margins": 4.345986604690552, + "rewards_train/rejected": -6.716086387634277, + "step": 258 + }, + { + "epoch": 2.96, + "learning_rate": 7.709148044679481e-09, + "loss": 0.0518, + "step": 259 + }, + { + "epoch": 2.96, + "logps_train/chosen": -173.42506408691406, + "logps_train/ref_chosen": -155.0, + "logps_train/ref_rejected": -125.5, + "logps_train/rejected": -185.6907196044922, + "rewards_train/accuracies": 0.9375, + "rewards_train/chosen": -1.8300071954727173, + "rewards_train/margins": 4.202877402305603, + "rewards_train/rejected": -6.03288459777832, + "step": 259 + }, + { + "epoch": 2.96, + "logps_train/chosen": -176.98544311523438, + "logps_train/ref_chosen": -156.0, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -179.13150024414062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.050905704498291, + "rewards_train/margins": 4.533289432525635, + "rewards_train/rejected": -6.584195137023926, + "step": 259 + }, + { + "epoch": 2.96, + "logps_train/chosen": -193.09963989257812, + "logps_train/ref_chosen": -171.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -172.74990844726562, + "rewards_train/accuracies": 0.90625, + "rewards_train/chosen": -2.1755151748657227, + "rewards_train/margins": 3.523548126220703, + "rewards_train/rejected": -5.699063301086426, + "step": 259 + }, + { + "epoch": 2.96, + "logps_train/chosen": -161.4818115234375, + "logps_train/ref_chosen": -136.0, + "logps_train/ref_rejected": -126.0, + "logps_train/rejected": -198.05450439453125, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.5415897369384766, + "rewards_train/margins": 4.642866134643555, + "rewards_train/rejected": -7.184455871582031, + "step": 259 + }, + { + "epoch": 2.97, + "learning_rate": 1.9273613056008944e-09, + "loss": 0.0788, + "step": 260 + }, + { + "epoch": 2.97, + "logps_train/chosen": -185.74722290039062, + "logps_train/ref_chosen": -164.0, + "logps_train/ref_rejected": -123.0, + "logps_train/rejected": -188.15093994140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.189370632171631, + "rewards_train/margins": 4.313614845275879, + "rewards_train/rejected": -6.50298547744751, + "step": 260 + }, + { + "epoch": 2.97, + "logps_train/chosen": -200.0399932861328, + "logps_train/ref_chosen": -180.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -180.8087615966797, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -2.0358355045318604, + "rewards_train/margins": 4.2098352909088135, + "rewards_train/rejected": -6.245670795440674, + "step": 260 + }, + { + "epoch": 2.97, + "logps_train/chosen": -152.52565002441406, + "logps_train/ref_chosen": -133.0, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -153.40609741210938, + "rewards_train/accuracies": 0.96875, + "rewards_train/chosen": -1.9625990390777588, + "rewards_train/margins": 3.9296958446502686, + "rewards_train/rejected": -5.892294883728027, + "step": 260 + }, + { + "epoch": 2.97, + "logps_train/chosen": -162.84246826171875, + "logps_train/ref_chosen": -144.0, + "logps_train/ref_rejected": -133.0, + "logps_train/rejected": -196.57708740234375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8697924613952637, + "rewards_train/margins": 4.492800235748291, + "rewards_train/rejected": -6.362592697143555, + "step": 260 + }, + { + "epoch": 2.98, + "learning_rate": 0.0, + "loss": 0.06, + "step": 261 + }, + { + "epoch": 2.98, + "step": 261, + "total_flos": 0.0, + "train_loss": 0.26605752715662523, + "train_runtime": 15002.1853, + "train_samples_per_second": 2.236, + "train_steps_per_second": 0.017 + } + ], + "max_steps": 261, + "num_train_epochs": 3, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}