{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.982857142857143, "global_step": 261, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "logps_train/chosen": -124.03406524658203, "logps_train/ref_chosen": -124.5, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -103.7885971069336, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.021984513849020004, "rewards_train/margins": 0.03294864948838949, "rewards_train/rejected": -0.010964135639369488, "step": 0 }, { "epoch": 0, "logps_train/chosen": -189.20074462890625, "logps_train/ref_chosen": -189.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -143.92283630371094, "rewards_train/accuracies": 0.46875, "rewards_train/chosen": 0.01742708310484886, "rewards_train/margins": -0.006499961018562317, "rewards_train/rejected": 0.02392704412341118, "step": 0 }, { "epoch": 0, "logps_train/chosen": -108.28680419921875, "logps_train/ref_chosen": -108.0, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -86.16234588623047, "rewards_train/accuracies": 0.46875, "rewards_train/chosen": -0.0033381874673068523, "rewards_train/margins": -0.008782926481217146, "rewards_train/rejected": 0.005444739013910294, "step": 0 }, { "epoch": 0, "logps_train/chosen": -159.1072235107422, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -137.1709747314453, "rewards_train/accuracies": 0.4375, "rewards_train/chosen": -0.1611127108335495, "rewards_train/margins": -0.18815578520298004, "rewards_train/rejected": 0.027043074369430542, "step": 0 }, { "epoch": 0.01, "learning_rate": 6.25e-06, "loss": 0.7329, "step": 1 }, { "epoch": 0.01, "logps_train/chosen": -152.52149963378906, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -146.0, "logps_train/rejected": -145.44671630859375, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.3556646704673767, "rewards_train/margins": -0.34302489552646875, "rewards_train/rejected": -0.012639774940907955, "step": 1 }, { "epoch": 0.01, "logps_train/chosen": -162.546875, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -115.53173065185547, "rewards_train/accuracies": 0.46875, "rewards_train/chosen": -0.0027348892763257027, "rewards_train/margins": -0.006935393437743187, "rewards_train/rejected": 0.004200504161417484, "step": 1 }, { "epoch": 0.01, "logps_train/chosen": -166.0098876953125, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -137.22930908203125, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.009777255356311798, "rewards_train/margins": 0.0004109693691134453, "rewards_train/rejected": -0.010188224725425243, "step": 1 }, { "epoch": 0.01, "logps_train/chosen": -136.130859375, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -112.5, "logps_train/rejected": -112.58856201171875, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.009960669092833996, "rewards_train/margins": 0.021551736630499363, "rewards_train/rejected": -0.011591067537665367, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.25e-05, "loss": 0.7722, "step": 2 }, { "epoch": 0.02, "logps_train/chosen": -124.05975341796875, "logps_train/ref_chosen": -124.0, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -109.16248321533203, "rewards_train/accuracies": 0.34375, "rewards_train/chosen": 0.018145941197872162, "rewards_train/margins": -0.004668369889259338, "rewards_train/rejected": 0.0228143110871315, "step": 2 }, { "epoch": 0.02, "logps_train/chosen": -165.04600524902344, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -138.3419952392578, "rewards_train/accuracies": 0.46875, "rewards_train/chosen": 0.011854463256895542, "rewards_train/margins": -0.011222056113183498, "rewards_train/rejected": 0.02307651937007904, "step": 2 }, { "epoch": 0.02, "logps_train/chosen": -142.4432373046875, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -124.0, "logps_train/rejected": -123.91960144042969, "rewards_train/accuracies": 0.53125, "rewards_train/chosen": 0.008557641878724098, "rewards_train/margins": 0.01167575130239129, "rewards_train/rejected": -0.0031181094236671925, "step": 2 }, { "epoch": 0.02, "logps_train/chosen": -165.10171508789062, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -143.83792114257812, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.02361709624528885, "rewards_train/margins": 0.017565627116709948, "rewards_train/rejected": 0.006051469128578901, "step": 2 }, { "epoch": 0.03, "learning_rate": 1.8750000000000002e-05, "loss": 0.6922, "step": 3 }, { "epoch": 0.03, "logps_train/chosen": -170.82505798339844, "logps_train/ref_chosen": -171.0, "logps_train/ref_rejected": -152.0, "logps_train/rejected": -151.93087768554688, "rewards_train/accuracies": 0.53125, "rewards_train/chosen": 0.013978248462080956, "rewards_train/margins": -0.005433313548564911, "rewards_train/rejected": 0.019411562010645866, "step": 3 }, { "epoch": 0.03, "logps_train/chosen": -139.39083862304688, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -129.07003784179688, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.020486466586589813, "rewards_train/margins": 0.003638278692960739, "rewards_train/rejected": 0.016848187893629074, "step": 3 }, { "epoch": 0.03, "logps_train/chosen": -176.51419067382812, "logps_train/ref_chosen": -177.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -132.66586303710938, "rewards_train/accuracies": 0.4375, "rewards_train/chosen": 0.009908124804496765, "rewards_train/margins": -0.014521919190883636, "rewards_train/rejected": 0.0244300439953804, "step": 3 }, { "epoch": 0.03, "logps_train/chosen": -156.88722229003906, "logps_train/ref_chosen": -157.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -119.69668579101562, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.019480425864458084, "rewards_train/margins": -0.0017685145139694214, "rewards_train/rejected": 0.021248940378427505, "step": 3 }, { "epoch": 0.05, "learning_rate": 2.5e-05, "loss": 0.6956, "step": 4 }, { "epoch": 0.05, "logps_train/chosen": -150.92123413085938, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -118.0, "logps_train/rejected": -117.88097381591797, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.034377966076135635, "rewards_train/margins": 0.017336303368210793, "rewards_train/rejected": 0.017041662707924843, "step": 4 }, { "epoch": 0.05, "logps_train/chosen": -180.184814453125, "logps_train/ref_chosen": -181.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -128.09719848632812, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.0534917414188385, "rewards_train/margins": 0.01155165582895279, "rewards_train/rejected": 0.04194008558988571, "step": 4 }, { "epoch": 0.05, "logps_train/chosen": -143.22642517089844, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -111.45314025878906, "rewards_train/accuracies": 0.46875, "rewards_train/chosen": 0.04542411118745804, "rewards_train/margins": 0.008365228772163391, "rewards_train/rejected": 0.03705888241529465, "step": 4 }, { "epoch": 0.05, "logps_train/chosen": -149.19285583496094, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -132.03118896484375, "rewards_train/accuracies": 0.40625, "rewards_train/chosen": 0.05766720697283745, "rewards_train/margins": 0.0012159161269664764, "rewards_train/rejected": 0.05645129084587097, "step": 4 }, { "epoch": 0.06, "learning_rate": 3.125e-05, "loss": 0.6893, "step": 5 }, { "epoch": 0.06, "logps_train/chosen": -147.8840789794922, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -122.57456970214844, "rewards_train/accuracies": 0.40625, "rewards_train/chosen": 0.05192326009273529, "rewards_train/margins": -0.010029420256614685, "rewards_train/rejected": 0.061952680349349976, "step": 5 }, { "epoch": 0.06, "logps_train/chosen": -170.50418090820312, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -136.78695678710938, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.11383962631225586, "rewards_train/margins": 0.027300000190734863, "rewards_train/rejected": 0.086539626121521, "step": 5 }, { "epoch": 0.06, "logps_train/chosen": -155.12435913085938, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -116.24176025390625, "rewards_train/accuracies": 0.53125, "rewards_train/chosen": 0.08434212952852249, "rewards_train/margins": 0.012327082455158234, "rewards_train/rejected": 0.07201504707336426, "step": 5 }, { "epoch": 0.06, "logps_train/chosen": -140.93975830078125, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -96.0169677734375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.09296239912509918, "rewards_train/margins": 0.042803697288036346, "rewards_train/rejected": 0.050158701837062836, "step": 5 }, { "epoch": 0.07, "learning_rate": 3.7500000000000003e-05, "loss": 0.6848, "step": 6 }, { "epoch": 0.07, "logps_train/chosen": -140.3257293701172, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -110.98545837402344, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.1716260313987732, "rewards_train/margins": 0.07232072949409485, "rewards_train/rejected": 0.09930530190467834, "step": 6 }, { "epoch": 0.07, "logps_train/chosen": -154.02883911132812, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -129.6768798828125, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.20046155154705048, "rewards_train/margins": 0.06691007316112518, "rewards_train/rejected": 0.1335514783859253, "step": 6 }, { "epoch": 0.07, "logps_train/chosen": -148.28170776367188, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -124.03239440917969, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.17491726577281952, "rewards_train/margins": 0.04614986479282379, "rewards_train/rejected": 0.12876740097999573, "step": 6 }, { "epoch": 0.07, "logps_train/chosen": -147.0029296875, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -109.11970520019531, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.14702154695987701, "rewards_train/margins": 0.04444076120853424, "rewards_train/rejected": 0.10258078575134277, "step": 6 }, { "epoch": 0.08, "learning_rate": 4.375e-05, "loss": 0.667, "step": 7 }, { "epoch": 0.08, "logps_train/chosen": -132.00901794433594, "logps_train/ref_chosen": -134.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -109.18241882324219, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.226222425699234, "rewards_train/margins": 0.0596490353345871, "rewards_train/rejected": 0.1665733903646469, "step": 7 }, { "epoch": 0.08, "logps_train/chosen": -176.80740356445312, "logps_train/ref_chosen": -180.0, "logps_train/ref_rejected": -171.0, "logps_train/rejected": -168.07736206054688, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.302267462015152, "rewards_train/margins": 0.03226765990257263, "rewards_train/rejected": 0.26999980211257935, "step": 7 }, { "epoch": 0.08, "logps_train/chosen": -161.03793334960938, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -107.19043731689453, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.2758948802947998, "rewards_train/margins": 0.10362982749938965, "rewards_train/rejected": 0.17226505279541016, "step": 7 }, { "epoch": 0.08, "logps_train/chosen": -165.82919311523438, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -145.0, "logps_train/rejected": -142.60577392578125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.31333446502685547, "rewards_train/margins": 0.09588496387004852, "rewards_train/rejected": 0.21744950115680695, "step": 7 }, { "epoch": 0.09, "learning_rate": 5e-05, "loss": 0.6623, "step": 8 }, { "epoch": 0.09, "logps_train/chosen": -156.38870239257812, "logps_train/ref_chosen": -161.0, "logps_train/ref_rejected": -149.0, "logps_train/rejected": -145.6343231201172, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.423044353723526, "rewards_train/margins": 0.11528569459915161, "rewards_train/rejected": 0.3077586591243744, "step": 8 }, { "epoch": 0.09, "logps_train/chosen": -166.0334014892578, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -127.0, "logps_train/rejected": -124.54536437988281, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.41580069065093994, "rewards_train/margins": 0.15119647979736328, "rewards_train/rejected": 0.26460421085357666, "step": 8 }, { "epoch": 0.09, "logps_train/chosen": -111.77043151855469, "logps_train/ref_chosen": -115.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -76.39224243164062, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.2940497100353241, "rewards_train/margins": 0.1374729573726654, "rewards_train/rejected": 0.1565767526626587, "step": 8 }, { "epoch": 0.09, "logps_train/chosen": -148.72055053710938, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -107.92207336425781, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.42247551679611206, "rewards_train/margins": 0.2193703055381775, "rewards_train/rejected": 0.20310521125793457, "step": 8 }, { "epoch": 0.1, "learning_rate": 4.99980726386944e-05, "loss": 0.6258, "step": 9 }, { "epoch": 0.1, "logps_train/chosen": -171.09103393554688, "logps_train/ref_chosen": -177.0, "logps_train/ref_rejected": -115.0, "logps_train/rejected": -112.07714080810547, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.5837191343307495, "rewards_train/margins": 0.2688255310058594, "rewards_train/rejected": 0.31489360332489014, "step": 9 }, { "epoch": 0.1, "logps_train/chosen": -134.94949340820312, "logps_train/ref_chosen": -139.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -132.9979705810547, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.4236791431903839, "rewards_train/margins": 0.02884700894355774, "rewards_train/rejected": 0.39483213424682617, "step": 9 }, { "epoch": 0.1, "logps_train/chosen": -154.36537170410156, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -127.57532501220703, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.5209816694259644, "rewards_train/margins": 0.10341629385948181, "rewards_train/rejected": 0.41756537556648254, "step": 9 }, { "epoch": 0.1, "logps_train/chosen": -149.526611328125, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -134.63084411621094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.48503243923187256, "rewards_train/margins": 0.11374184489250183, "rewards_train/rejected": 0.3712905943393707, "step": 9 }, { "epoch": 0.11, "learning_rate": 4.9992290851955325e-05, "loss": 0.6503, "step": 10 }, { "epoch": 0.11, "logps_train/chosen": -153.533935546875, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -126.86518859863281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.5384516716003418, "rewards_train/margins": 0.16266614198684692, "rewards_train/rejected": 0.3757855296134949, "step": 10 }, { "epoch": 0.11, "logps_train/chosen": -167.98931884765625, "logps_train/ref_chosen": -175.0, "logps_train/ref_rejected": -149.0, "logps_train/rejected": -144.03042602539062, "rewards_train/accuracies": 0.53125, "rewards_train/chosen": 0.6893491744995117, "rewards_train/margins": 0.16035926342010498, "rewards_train/rejected": 0.5289899110794067, "step": 10 }, { "epoch": 0.11, "logps_train/chosen": -155.04034423828125, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -130.8607177734375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.6690115332603455, "rewards_train/margins": 0.2056686282157898, "rewards_train/rejected": 0.46334290504455566, "step": 10 }, { "epoch": 0.11, "logps_train/chosen": -130.7728271484375, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -109.23792266845703, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.6114521622657776, "rewards_train/margins": 0.13563531637191772, "rewards_train/rejected": 0.47581684589385986, "step": 10 }, { "epoch": 0.13, "learning_rate": 4.998265553127013e-05, "loss": 0.6345, "step": 11 }, { "epoch": 0.13, "logps_train/chosen": -160.4620819091797, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -125.24628448486328, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.8416832685470581, "rewards_train/margins": 0.36943644285202026, "rewards_train/rejected": 0.47224682569503784, "step": 11 }, { "epoch": 0.13, "logps_train/chosen": -145.121826171875, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -132.90176391601562, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.646020233631134, "rewards_train/margins": 0.17330646514892578, "rewards_train/rejected": 0.47271376848220825, "step": 11 }, { "epoch": 0.13, "logps_train/chosen": -151.66098022460938, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -153.0, "logps_train/rejected": -147.03488159179688, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.7320475578308105, "rewards_train/margins": 0.1371973752975464, "rewards_train/rejected": 0.5948501825332642, "step": 11 }, { "epoch": 0.13, "logps_train/chosen": -142.59877014160156, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -114.5, "logps_train/rejected": -109.92486572265625, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.6653189659118652, "rewards_train/margins": 0.20673072338104248, "rewards_train/rejected": 0.45858824253082275, "step": 11 }, { "epoch": 0.14, "learning_rate": 4.996916816229837e-05, "loss": 0.6218, "step": 12 }, { "epoch": 0.14, "logps_train/chosen": -129.76406860351562, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -104.38932800292969, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.7065520882606506, "rewards_train/margins": 0.4883071482181549, "rewards_train/rejected": 0.21824494004249573, "step": 12 }, { "epoch": 0.14, "logps_train/chosen": -149.8843994140625, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -126.18021392822266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.9509168863296509, "rewards_train/margins": 0.3822464942932129, "rewards_train/rejected": 0.568670392036438, "step": 12 }, { "epoch": 0.14, "logps_train/chosen": -141.76194763183594, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -141.0, "logps_train/rejected": -134.5399169921875, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.6832780838012695, "rewards_train/margins": 0.04976832866668701, "rewards_train/rejected": 0.6335097551345825, "step": 12 }, { "epoch": 0.14, "logps_train/chosen": -135.6645965576172, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -115.1833724975586, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.6000440716743469, "rewards_train/margins": 0.09338122606277466, "rewards_train/rejected": 0.5066628456115723, "step": 12 }, { "epoch": 0.15, "learning_rate": 4.995183082464269e-05, "loss": 0.6331, "step": 13 }, { "epoch": 0.15, "logps_train/chosen": -156.60064697265625, "logps_train/ref_chosen": -167.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -118.9285888671875, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": 1.034368634223938, "rewards_train/margins": 0.5694149434566498, "rewards_train/rejected": 0.4649536907672882, "step": 13 }, { "epoch": 0.15, "logps_train/chosen": -173.77572631835938, "logps_train/ref_chosen": -184.0, "logps_train/ref_rejected": -163.0, "logps_train/rejected": -156.2445526123047, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.9569660425186157, "rewards_train/margins": 0.2895270586013794, "rewards_train/rejected": 0.6674389839172363, "step": 13 }, { "epoch": 0.15, "logps_train/chosen": -151.65478515625, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -121.5, "logps_train/rejected": -115.70185852050781, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 1.0789567232131958, "rewards_train/margins": 0.48078322410583496, "rewards_train/rejected": 0.5981734991073608, "step": 13 }, { "epoch": 0.15, "logps_train/chosen": -110.07494354248047, "logps_train/ref_chosen": -118.5, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -97.74917602539062, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.8415290117263794, "rewards_train/margins": 0.2806065082550049, "rewards_train/rejected": 0.5609225034713745, "step": 13 }, { "epoch": 0.16, "learning_rate": 4.9930646191528175e-05, "loss": 0.5759, "step": 14 }, { "epoch": 0.16, "logps_train/chosen": -177.0446319580078, "logps_train/ref_chosen": -187.0, "logps_train/ref_rejected": -176.0, "logps_train/rejected": -168.82325744628906, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 1.0113569498062134, "rewards_train/margins": 0.356975793838501, "rewards_train/rejected": 0.6543811559677124, "step": 14 }, { "epoch": 0.16, "logps_train/chosen": -151.45489501953125, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -143.0, "logps_train/rejected": -136.52796936035156, "rewards_train/accuracies": 0.46875, "rewards_train/chosen": 0.8385694026947021, "rewards_train/margins": 0.13999921083450317, "rewards_train/rejected": 0.698570191860199, "step": 14 }, { "epoch": 0.16, "logps_train/chosen": -135.15379333496094, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -125.3636703491211, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.8960466980934143, "rewards_train/margins": 0.2092685103416443, "rewards_train/rejected": 0.68677818775177, "step": 14 }, { "epoch": 0.16, "logps_train/chosen": -146.58248901367188, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -127.0, "logps_train/rejected": -120.76148223876953, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.9701685905456543, "rewards_train/margins": 0.3346949815750122, "rewards_train/rejected": 0.6354736089706421, "step": 14 }, { "epoch": 0.17, "learning_rate": 4.9905617529390203e-05, "loss": 0.6141, "step": 15 }, { "epoch": 0.17, "logps_train/chosen": -105.37283325195312, "logps_train/ref_chosen": -112.5, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -107.2294921875, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.717697262763977, "rewards_train/margins": 0.21457242965698242, "rewards_train/rejected": 0.5031248331069946, "step": 15 }, { "epoch": 0.17, "logps_train/chosen": -140.62977600097656, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -97.55780792236328, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.7388781309127808, "rewards_train/margins": 0.35042065382003784, "rewards_train/rejected": 0.3884574770927429, "step": 15 }, { "epoch": 0.17, "logps_train/chosen": -150.58285522460938, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -99.97172546386719, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.8648346662521362, "rewards_train/margins": 0.408088356256485, "rewards_train/rejected": 0.45674630999565125, "step": 15 }, { "epoch": 0.17, "logps_train/chosen": -141.79176330566406, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -125.314453125, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.8631088733673096, "rewards_train/margins": 0.32648766040802, "rewards_train/rejected": 0.5366212129592896, "step": 15 }, { "epoch": 0.18, "learning_rate": 4.987674869737077e-05, "loss": 0.5859, "step": 16 }, { "epoch": 0.18, "logps_train/chosen": -136.44541931152344, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -105.30747985839844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.7658092975616455, "rewards_train/margins": 0.42277851700782776, "rewards_train/rejected": 0.34303078055381775, "step": 16 }, { "epoch": 0.18, "logps_train/chosen": -142.18093872070312, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -122.5, "logps_train/rejected": -115.7771987915039, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.7697968482971191, "rewards_train/margins": 0.11177456378936768, "rewards_train/rejected": 0.6580222845077515, "step": 16 }, { "epoch": 0.18, "logps_train/chosen": -154.24090576171875, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -126.07774353027344, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 1.0580410957336426, "rewards_train/margins": 0.3286939859390259, "rewards_train/rejected": 0.7293471097946167, "step": 16 }, { "epoch": 0.18, "logps_train/chosen": -152.1510467529297, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -104.72694396972656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.752070426940918, "rewards_train/margins": 0.3003508448600769, "rewards_train/rejected": 0.45171958208084106, "step": 16 }, { "epoch": 0.19, "learning_rate": 4.984404414672346e-05, "loss": 0.6126, "step": 17 }, { "epoch": 0.19, "logps_train/chosen": -142.31809997558594, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -126.29586791992188, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.7054945230484009, "rewards_train/margins": 0.10695618391036987, "rewards_train/rejected": 0.598538339138031, "step": 17 }, { "epoch": 0.19, "logps_train/chosen": -89.5583724975586, "logps_train/ref_chosen": -94.5, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -88.00601959228516, "rewards_train/accuracies": 0.46875, "rewards_train/chosen": 0.49502912163734436, "rewards_train/margins": 0.10490813851356506, "rewards_train/rejected": 0.3901209831237793, "step": 17 }, { "epoch": 0.19, "logps_train/chosen": -141.4034881591797, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -95.42372131347656, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.9821124076843262, "rewards_train/margins": 0.6551483273506165, "rewards_train/rejected": 0.3269640803337097, "step": 17 }, { "epoch": 0.19, "logps_train/chosen": -126.50776672363281, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -115.35845947265625, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.8917524814605713, "rewards_train/margins": 0.34312617778778076, "rewards_train/rejected": 0.5486263036727905, "step": 17 }, { "epoch": 0.21, "learning_rate": 4.980750892012711e-05, "loss": 0.616, "step": 18 }, { "epoch": 0.21, "logps_train/chosen": -161.98281860351562, "logps_train/ref_chosen": -171.0, "logps_train/ref_rejected": -135.0, "logps_train/rejected": -129.00392150878906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.9355075359344482, "rewards_train/margins": 0.31050950288772583, "rewards_train/rejected": 0.6249980330467224, "step": 18 }, { "epoch": 0.21, "logps_train/chosen": -109.2407455444336, "logps_train/ref_chosen": -115.5, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -87.75328063964844, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.634067952632904, "rewards_train/margins": 0.21584081649780273, "rewards_train/rejected": 0.4182271361351013, "step": 18 }, { "epoch": 0.21, "logps_train/chosen": -125.10116577148438, "logps_train/ref_chosen": -133.0, "logps_train/ref_rejected": -113.5, "logps_train/rejected": -109.39839172363281, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.7695701122283936, "rewards_train/margins": 0.3268895447254181, "rewards_train/rejected": 0.44268056750297546, "step": 18 }, { "epoch": 0.21, "logps_train/chosen": -154.5267791748047, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -111.5, "logps_train/rejected": -105.82466125488281, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.8263745307922363, "rewards_train/margins": 0.2686058282852173, "rewards_train/rejected": 0.557768702507019, "step": 18 }, { "epoch": 0.22, "learning_rate": 4.976714865090827e-05, "loss": 0.6399, "step": 19 }, { "epoch": 0.22, "logps_train/chosen": -170.05242919921875, "logps_train/ref_chosen": -180.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -133.59002685546875, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 1.0042303800582886, "rewards_train/margins": 0.6750620007514954, "rewards_train/rejected": 0.3291683793067932, "step": 19 }, { "epoch": 0.22, "logps_train/chosen": -138.31671142578125, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -90.01516723632812, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.5841480493545532, "rewards_train/margins": 0.30216851830482483, "rewards_train/rejected": 0.2819795310497284, "step": 19 }, { "epoch": 0.22, "logps_train/chosen": -127.0712890625, "logps_train/ref_chosen": -134.0, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -87.40908813476562, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.6961421966552734, "rewards_train/margins": 0.25760751962661743, "rewards_train/rejected": 0.438534677028656, "step": 19 }, { "epoch": 0.22, "logps_train/chosen": -152.77476501464844, "logps_train/ref_chosen": -161.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -118.90397644042969, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.7926408648490906, "rewards_train/margins": 0.37669146060943604, "rewards_train/rejected": 0.41594940423965454, "step": 19 }, { "epoch": 0.23, "learning_rate": 4.972296956217265e-05, "loss": 0.5838, "step": 20 }, { "epoch": 0.23, "logps_train/chosen": -148.97178649902344, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -102.91510009765625, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.6525766849517822, "rewards_train/margins": 0.4128361642360687, "rewards_train/rejected": 0.2397405207157135, "step": 20 }, { "epoch": 0.23, "logps_train/chosen": -188.7393035888672, "logps_train/ref_chosen": -201.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -136.55224609375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 1.208785057067871, "rewards_train/margins": 1.0652238130569458, "rewards_train/rejected": 0.1435612440109253, "step": 20 }, { "epoch": 0.23, "logps_train/chosen": -136.43887329101562, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -106.124755859375, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.8320896625518799, "rewards_train/margins": 0.5457369983196259, "rewards_train/rejected": 0.28635266423225403, "step": 20 }, { "epoch": 0.23, "logps_train/chosen": -137.87374877929688, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -103.64527893066406, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.6388455629348755, "rewards_train/margins": 0.2835492193698883, "rewards_train/rejected": 0.3552963435649872, "step": 20 }, { "epoch": 0.24, "learning_rate": 4.967497846584552e-05, "loss": 0.5307, "step": 21 }, { "epoch": 0.24, "logps_train/chosen": -127.54042053222656, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -93.16413116455078, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.4815409779548645, "rewards_train/margins": 0.36279795318841934, "rewards_train/rejected": 0.11874302476644516, "step": 21 }, { "epoch": 0.24, "logps_train/chosen": -145.63665771484375, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -127.99638366699219, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.3527398109436035, "rewards_train/margins": 0.19827693700790405, "rewards_train/rejected": 0.15446287393569946, "step": 21 }, { "epoch": 0.24, "logps_train/chosen": -143.23275756835938, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -99.59611511230469, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.8170070648193359, "rewards_train/margins": 0.7010326832532883, "rewards_train/rejected": 0.11597438156604767, "step": 21 }, { "epoch": 0.24, "logps_train/chosen": -134.1798095703125, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -103.10169982910156, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.8344590663909912, "rewards_train/margins": 0.4115239083766937, "rewards_train/rejected": 0.4229351580142975, "step": 21 }, { "epoch": 0.25, "learning_rate": 4.962318276162148e-05, "loss": 0.5787, "step": 22 }, { "epoch": 0.25, "logps_train/chosen": -169.95230102539062, "logps_train/ref_chosen": -180.0, "logps_train/ref_rejected": -152.0, "logps_train/rejected": -147.85079956054688, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 1.0285433530807495, "rewards_train/margins": 0.5781982243061066, "rewards_train/rejected": 0.45034512877464294, "step": 22 }, { "epoch": 0.25, "logps_train/chosen": -159.36582946777344, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -119.0, "logps_train/rejected": -116.03822326660156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.9459124803543091, "rewards_train/margins": 0.6589144468307495, "rewards_train/rejected": 0.28699803352355957, "step": 22 }, { "epoch": 0.25, "logps_train/chosen": -140.4551544189453, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -113.26671600341797, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.6787030696868896, "rewards_train/margins": 0.34983301162719727, "rewards_train/rejected": 0.3288700580596924, "step": 22 }, { "epoch": 0.25, "logps_train/chosen": -182.3084259033203, "logps_train/ref_chosen": -191.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -122.35601806640625, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.8974780440330505, "rewards_train/margins": 0.5072007179260254, "rewards_train/rejected": 0.39027732610702515, "step": 22 }, { "epoch": 0.26, "learning_rate": 4.9567590435823383e-05, "loss": 0.5552, "step": 23 }, { "epoch": 0.26, "logps_train/chosen": -123.2986068725586, "logps_train/ref_chosen": -128.0, "logps_train/ref_rejected": -114.5, "logps_train/rejected": -111.90111541748047, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.4740452170372009, "rewards_train/margins": 0.20946910977363586, "rewards_train/rejected": 0.26457610726356506, "step": 23 }, { "epoch": 0.26, "logps_train/chosen": -143.50405883789062, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -114.5, "logps_train/rejected": -110.6120376586914, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.8933444023132324, "rewards_train/margins": 0.49614936113357544, "rewards_train/rejected": 0.397195041179657, "step": 23 }, { "epoch": 0.26, "logps_train/chosen": -155.1194610595703, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -121.85575866699219, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 1.0275070667266846, "rewards_train/margins": 0.6742154359817505, "rewards_train/rejected": 0.3532916307449341, "step": 23 }, { "epoch": 0.26, "logps_train/chosen": -140.42703247070312, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -124.0, "logps_train/rejected": -122.62283325195312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.627119779586792, "rewards_train/margins": 0.5090075731277466, "rewards_train/rejected": 0.11811220645904541, "step": 23 }, { "epoch": 0.27, "learning_rate": 4.950821006017107e-05, "loss": 0.5787, "step": 24 }, { "epoch": 0.27, "logps_train/chosen": -136.32363891601562, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -101.22764587402344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.6326753497123718, "rewards_train/margins": 0.4577834904193878, "rewards_train/rejected": 0.174891859292984, "step": 24 }, { "epoch": 0.27, "logps_train/chosen": -157.54037475585938, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -126.21077728271484, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.776432454586029, "rewards_train/margins": 0.4170653820037842, "rewards_train/rejected": 0.3593670725822449, "step": 24 }, { "epoch": 0.27, "logps_train/chosen": -107.36483001708984, "logps_train/ref_chosen": -112.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -82.24800109863281, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.5118322372436523, "rewards_train/margins": 0.4391593262553215, "rewards_train/rejected": 0.07267291098833084, "step": 24 }, { "epoch": 0.27, "logps_train/chosen": -137.41708374023438, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -112.7099609375, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.5886627435684204, "rewards_train/margins": 0.302530437707901, "rewards_train/rejected": 0.2861323058605194, "step": 24 }, { "epoch": 0.29, "learning_rate": 4.944505079045958e-05, "loss": 0.6016, "step": 25 }, { "epoch": 0.29, "logps_train/chosen": -157.17239379882812, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -100.15357971191406, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.8933084011077881, "rewards_train/margins": 0.4814213514328003, "rewards_train/rejected": 0.4118870496749878, "step": 25 }, { "epoch": 0.29, "logps_train/chosen": -128.82952880859375, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -114.12107849121094, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.7554748058319092, "rewards_train/margins": 0.49004340171813965, "rewards_train/rejected": 0.26543140411376953, "step": 25 }, { "epoch": 0.29, "logps_train/chosen": -174.43142700195312, "logps_train/ref_chosen": -181.0, "logps_train/ref_rejected": -161.0, "logps_train/rejected": -156.95753479003906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.6013880968093872, "rewards_train/margins": 0.15554097294807434, "rewards_train/rejected": 0.44584712386131287, "step": 25 }, { "epoch": 0.29, "logps_train/chosen": -179.6756591796875, "logps_train/ref_chosen": -190.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -136.37588500976562, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": 0.9904413819313049, "rewards_train/margins": 0.6243195533752441, "rewards_train/rejected": 0.3661218285560608, "step": 25 }, { "epoch": 0.3, "learning_rate": 4.9378122365147536e-05, "loss": 0.5856, "step": 26 }, { "epoch": 0.3, "logps_train/chosen": -160.62625122070312, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -113.5, "logps_train/rejected": -112.54586029052734, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.568233847618103, "rewards_train/margins": 0.46276117116212845, "rewards_train/rejected": 0.10547267645597458, "step": 26 }, { "epoch": 0.3, "logps_train/chosen": -180.06460571289062, "logps_train/ref_chosen": -188.0, "logps_train/ref_rejected": -170.0, "logps_train/rejected": -165.44969177246094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.8118870854377747, "rewards_train/margins": 0.4134972393512726, "rewards_train/rejected": 0.3983898460865021, "step": 26 }, { "epoch": 0.3, "logps_train/chosen": -165.16346740722656, "logps_train/ref_chosen": -173.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -125.14801025390625, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.733361005783081, "rewards_train/margins": 0.4440605640411377, "rewards_train/rejected": 0.28930044174194336, "step": 26 }, { "epoch": 0.3, "logps_train/chosen": -149.80445861816406, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -172.0, "logps_train/rejected": -168.4322967529297, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.5533437728881836, "rewards_train/margins": 0.24852675199508667, "rewards_train/rejected": 0.3048170208930969, "step": 26 }, { "epoch": 0.31, "learning_rate": 4.9307435103855507e-05, "loss": 0.6072, "step": 27 }, { "epoch": 0.31, "logps_train/chosen": -140.88067626953125, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -111.68878173828125, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.4423023760318756, "rewards_train/margins": 0.2377922534942627, "rewards_train/rejected": 0.20451012253761292, "step": 27 }, { "epoch": 0.31, "logps_train/chosen": -127.85845947265625, "logps_train/ref_chosen": -131.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -104.07398986816406, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.3409123420715332, "rewards_train/margins": 0.4412309601902962, "rewards_train/rejected": -0.10031861811876297, "step": 27 }, { "epoch": 0.31, "logps_train/chosen": -124.84896087646484, "logps_train/ref_chosen": -129.0, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -100.35652160644531, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.36427372694015503, "rewards_train/margins": 0.3226316273212433, "rewards_train/rejected": 0.04164209961891174, "step": 27 }, { "epoch": 0.31, "logps_train/chosen": -147.45672607421875, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -115.0, "logps_train/rejected": -114.71719360351562, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.6011910438537598, "rewards_train/margins": 0.5735943987965584, "rewards_train/rejected": 0.027596645057201385, "step": 27 }, { "epoch": 0.32, "learning_rate": 4.923299990577488e-05, "loss": 0.617, "step": 28 }, { "epoch": 0.32, "logps_train/chosen": -169.756103515625, "logps_train/ref_chosen": -174.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -124.39460754394531, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.40368735790252686, "rewards_train/margins": 0.5618972331285477, "rewards_train/rejected": -0.1582098752260208, "step": 28 }, { "epoch": 0.32, "logps_train/chosen": -123.26806640625, "logps_train/ref_chosen": -127.0, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -93.16129302978516, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.394532173871994, "rewards_train/margins": 0.6385668218135834, "rewards_train/rejected": -0.24403464794158936, "step": 28 }, { "epoch": 0.32, "logps_train/chosen": -162.7403106689453, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -110.5, "logps_train/rejected": -112.55574035644531, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": 0.8793867826461792, "rewards_train/margins": 1.0994132608175278, "rewards_train/rejected": -0.22002647817134857, "step": 28 }, { "epoch": 0.32, "logps_train/chosen": -108.07917785644531, "logps_train/ref_chosen": -112.5, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -100.18183135986328, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.4343189299106598, "rewards_train/margins": 0.35391509532928467, "rewards_train/rejected": 0.08040383458137512, "step": 28 }, { "epoch": 0.33, "learning_rate": 4.9154828247987275e-05, "loss": 0.5549, "step": 29 }, { "epoch": 0.33, "logps_train/chosen": -152.16064453125, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -132.60389709472656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.7116701602935791, "rewards_train/margins": 0.6406141370534897, "rewards_train/rejected": 0.07105602324008942, "step": 29 }, { "epoch": 0.33, "logps_train/chosen": -150.0108642578125, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -124.35476684570312, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.49451807141304016, "rewards_train/margins": 0.45050277933478355, "rewards_train/rejected": 0.04401529207825661, "step": 29 }, { "epoch": 0.33, "logps_train/chosen": -139.51673889160156, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -123.01943969726562, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.6350452899932861, "rewards_train/margins": 0.6675560399889946, "rewards_train/rejected": -0.032510749995708466, "step": 29 }, { "epoch": 0.33, "logps_train/chosen": -151.99301147460938, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -137.28713989257812, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.34091004729270935, "rewards_train/margins": 0.29519854485988617, "rewards_train/rejected": 0.04571150243282318, "step": 29 }, { "epoch": 0.34, "learning_rate": 4.907293218369499e-05, "loss": 0.5685, "step": 30 }, { "epoch": 0.34, "logps_train/chosen": -171.75759887695312, "logps_train/ref_chosen": -179.0, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -101.5311508178711, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.736739993095398, "rewards_train/margins": 0.8000111803412437, "rewards_train/rejected": -0.0632711872458458, "step": 30 }, { "epoch": 0.34, "logps_train/chosen": -150.02069091796875, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -147.0, "logps_train/rejected": -146.40304565429688, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.6478447914123535, "rewards_train/margins": 0.6080716103315353, "rewards_train/rejected": 0.039773181080818176, "step": 30 }, { "epoch": 0.34, "logps_train/chosen": -138.49688720703125, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -118.42485809326172, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.20158207416534424, "rewards_train/margins": 0.424081951379776, "rewards_train/rejected": -0.22249987721443176, "step": 30 }, { "epoch": 0.34, "logps_train/chosen": -153.21910095214844, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -131.95118713378906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.5112931728363037, "rewards_train/margins": 0.5308257788419724, "rewards_train/rejected": -0.01953260600566864, "step": 30 }, { "epoch": 0.35, "learning_rate": 4.898732434036244e-05, "loss": 0.5647, "step": 31 }, { "epoch": 0.35, "logps_train/chosen": -110.5118408203125, "logps_train/ref_chosen": -111.0, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -105.46853637695312, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.026843346655368805, "rewards_train/margins": 0.23717240244150162, "rewards_train/rejected": -0.2103290557861328, "step": 31 }, { "epoch": 0.35, "logps_train/chosen": -170.99050903320312, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -158.0, "logps_train/rejected": -157.1632080078125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.5195033550262451, "rewards_train/margins": 0.4441733881831169, "rewards_train/rejected": 0.0753299668431282, "step": 31 }, { "epoch": 0.35, "logps_train/chosen": -159.28607177734375, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -138.23997497558594, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.43447864055633545, "rewards_train/margins": 0.6162890195846558, "rewards_train/rejected": -0.1818103790283203, "step": 31 }, { "epoch": 0.35, "logps_train/chosen": -134.58811950683594, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -132.0167236328125, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.36775100231170654, "rewards_train/margins": 0.45528803020715714, "rewards_train/rejected": -0.08753702789545059, "step": 31 }, { "epoch": 0.37, "learning_rate": 4.889801791776921e-05, "loss": 0.6137, "step": 32 }, { "epoch": 0.37, "logps_train/chosen": -135.58267211914062, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -115.02655029296875, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.2093113362789154, "rewards_train/margins": 0.3869669735431671, "rewards_train/rejected": -0.1776556372642517, "step": 32 }, { "epoch": 0.37, "logps_train/chosen": -169.59982299804688, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -144.22299194335938, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.26462656259536743, "rewards_train/margins": 0.3123156949877739, "rewards_train/rejected": -0.047689132392406464, "step": 32 }, { "epoch": 0.37, "logps_train/chosen": -146.15602111816406, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -115.34562683105469, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.4857647120952606, "rewards_train/margins": 0.7841946184635162, "rewards_train/rejected": -0.2984299063682556, "step": 32 }, { "epoch": 0.37, "logps_train/chosen": -139.1509552001953, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -147.0, "logps_train/rejected": -151.7754364013672, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.40150654315948486, "rewards_train/margins": 0.8612766265869141, "rewards_train/rejected": -0.4597700834274292, "step": 32 }, { "epoch": 0.38, "learning_rate": 4.880502668597475e-05, "loss": 0.6015, "step": 33 }, { "epoch": 0.38, "logps_train/chosen": -164.56207275390625, "logps_train/ref_chosen": -171.0, "logps_train/ref_rejected": -149.0, "logps_train/rejected": -149.7305145263672, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.6187927722930908, "rewards_train/margins": 0.6844227313995361, "rewards_train/rejected": -0.06562995910644531, "step": 33 }, { "epoch": 0.38, "logps_train/chosen": -157.14566040039062, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -111.04812622070312, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": 0.4366801977157593, "rewards_train/margins": 0.8013320863246918, "rewards_train/rejected": -0.3646518886089325, "step": 33 }, { "epoch": 0.38, "logps_train/chosen": -159.72222900390625, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -166.0, "logps_train/rejected": -166.43585205078125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.3263116478919983, "rewards_train/margins": 0.37236351892352104, "rewards_train/rejected": -0.04605187103152275, "step": 33 }, { "epoch": 0.38, "logps_train/chosen": -149.86741638183594, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -103.66566467285156, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.577516496181488, "rewards_train/margins": 1.0174227356910706, "rewards_train/rejected": -0.4399062395095825, "step": 33 }, { "epoch": 0.39, "learning_rate": 4.870836498319523e-05, "loss": 0.5305, "step": 34 }, { "epoch": 0.39, "logps_train/chosen": -161.78330993652344, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -130.53228759765625, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.26034095883369446, "rewards_train/margins": 0.532904863357544, "rewards_train/rejected": -0.2725639045238495, "step": 34 }, { "epoch": 0.39, "logps_train/chosen": -167.86843872070312, "logps_train/ref_chosen": -173.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -139.84494018554688, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": 0.5324914455413818, "rewards_train/margins": 0.7851487994194031, "rewards_train/rejected": -0.25265735387802124, "step": 34 }, { "epoch": 0.39, "logps_train/chosen": -147.22430419921875, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -123.09727478027344, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.3912162780761719, "rewards_train/margins": 0.3929597958922386, "rewards_train/rejected": -0.001743517816066742, "step": 34 }, { "epoch": 0.39, "logps_train/chosen": -147.94500732421875, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -137.91192626953125, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.15667131543159485, "rewards_train/margins": 0.4005984365940094, "rewards_train/rejected": -0.24392712116241455, "step": 34 }, { "epoch": 0.4, "learning_rate": 4.86080477135927e-05, "loss": 0.5872, "step": 35 }, { "epoch": 0.4, "logps_train/chosen": -149.14828491210938, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -131.72792053222656, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.3583168387413025, "rewards_train/margins": 0.661382257938385, "rewards_train/rejected": -0.3030654191970825, "step": 35 }, { "epoch": 0.4, "logps_train/chosen": -173.38485717773438, "logps_train/ref_chosen": -179.0, "logps_train/ref_rejected": -127.5, "logps_train/rejected": -130.51498413085938, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.5212795734405518, "rewards_train/margins": 0.8276603519916534, "rewards_train/rejected": -0.3063807785511017, "step": 35 }, { "epoch": 0.4, "logps_train/chosen": -153.11077880859375, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -110.5, "logps_train/rejected": -115.9320068359375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.3146057426929474, "rewards_train/margins": 0.8578799068927765, "rewards_train/rejected": -0.5432741641998291, "step": 35 }, { "epoch": 0.4, "logps_train/chosen": -172.79013061523438, "logps_train/ref_chosen": -175.0, "logps_train/ref_rejected": -155.0, "logps_train/rejected": -157.91104125976562, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.24540139734745026, "rewards_train/margins": 0.5402161329984665, "rewards_train/rejected": -0.29481473565101624, "step": 35 }, { "epoch": 0.41, "learning_rate": 4.850409034497704e-05, "loss": 0.5427, "step": 36 }, { "epoch": 0.41, "logps_train/chosen": -147.08950805664062, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -127.0, "logps_train/rejected": -132.83718872070312, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.3379243314266205, "rewards_train/margins": 0.9198374450206757, "rewards_train/rejected": -0.5819131135940552, "step": 36 }, { "epoch": 0.41, "logps_train/chosen": -161.61907958984375, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -119.5, "logps_train/rejected": -127.36285400390625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.3235408067703247, "rewards_train/margins": 1.1210931539535522, "rewards_train/rejected": -0.7975523471832275, "step": 36 }, { "epoch": 0.41, "logps_train/chosen": -131.70205688476562, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -119.81352996826172, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.042293842881917953, "rewards_train/margins": 0.5976700149476528, "rewards_train/rejected": -0.5553761720657349, "step": 36 }, { "epoch": 0.41, "logps_train/chosen": -125.20751953125, "logps_train/ref_chosen": -126.0, "logps_train/ref_rejected": -93.5, "logps_train/rejected": -101.25257873535156, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": 0.11919025331735611, "rewards_train/margins": 0.896144725382328, "rewards_train/rejected": -0.7769544720649719, "step": 36 }, { "epoch": 0.42, "learning_rate": 4.839650890642104e-05, "loss": 0.5169, "step": 37 }, { "epoch": 0.42, "logps_train/chosen": -121.39013671875, "logps_train/ref_chosen": -120.0, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -102.62464141845703, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.1606450080871582, "rewards_train/margins": 0.42508548498153687, "rewards_train/rejected": -0.5857304930686951, "step": 37 }, { "epoch": 0.42, "logps_train/chosen": -125.78103637695312, "logps_train/ref_chosen": -126.5, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -111.85375213623047, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.0980682522058487, "rewards_train/margins": 0.46781833469867706, "rewards_train/rejected": -0.36975008249282837, "step": 37 }, { "epoch": 0.42, "logps_train/chosen": -145.85195922851562, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -128.85755920410156, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.18382151424884796, "rewards_train/margins": 0.7501192837953568, "rewards_train/rejected": -0.5662977695465088, "step": 37 }, { "epoch": 0.42, "logps_train/chosen": -120.22520446777344, "logps_train/ref_chosen": -122.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -94.60220336914062, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": 0.18724526464939117, "rewards_train/margins": 0.8372600227594376, "rewards_train/rejected": -0.6500147581100464, "step": 37 }, { "epoch": 0.43, "learning_rate": 4.828531998578885e-05, "loss": 0.5599, "step": 38 }, { "epoch": 0.43, "logps_train/chosen": -182.186279296875, "logps_train/ref_chosen": -186.0, "logps_train/ref_rejected": -153.0, "logps_train/rejected": -157.68670654296875, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.349829763174057, "rewards_train/margins": 0.827485203742981, "rewards_train/rejected": -0.47765544056892395, "step": 38 }, { "epoch": 0.43, "logps_train/chosen": -148.2696075439453, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -132.3610382080078, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.011784523725509644, "rewards_train/margins": 0.44945111870765686, "rewards_train/rejected": -0.4376665949821472, "step": 38 }, { "epoch": 0.43, "logps_train/chosen": -148.4195556640625, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -126.39881134033203, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.09119848161935806, "rewards_train/margins": 0.628833569586277, "rewards_train/rejected": -0.537635087966919, "step": 38 }, { "epoch": 0.43, "logps_train/chosen": -154.6384735107422, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -131.3975067138672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4014357924461365, "rewards_train/margins": 0.9915284514427185, "rewards_train/rejected": -0.590092658996582, "step": 38 }, { "epoch": 0.45, "learning_rate": 4.8170540727178326e-05, "loss": 0.5291, "step": 39 }, { "epoch": 0.45, "logps_train/chosen": -151.14736938476562, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -128.07752990722656, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.21706223487854004, "rewards_train/margins": 0.8728617429733276, "rewards_train/rejected": -0.6557995080947876, "step": 39 }, { "epoch": 0.45, "logps_train/chosen": -175.7981719970703, "logps_train/ref_chosen": -178.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -152.55126953125, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.25172606110572815, "rewards_train/margins": 1.094938188791275, "rewards_train/rejected": -0.8432121276855469, "step": 39 }, { "epoch": 0.45, "logps_train/chosen": -142.94915771484375, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -126.44970703125, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.24932152032852173, "rewards_train/margins": 0.8067926168441772, "rewards_train/rejected": -0.5574710965156555, "step": 39 }, { "epoch": 0.45, "logps_train/chosen": -175.91030883789062, "logps_train/ref_chosen": -179.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -129.6051025390625, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.29153841733932495, "rewards_train/margins": 0.695236325263977, "rewards_train/rejected": -0.4036979079246521, "step": 39 }, { "epoch": 0.46, "learning_rate": 4.805218882827761e-05, "loss": 0.5147, "step": 40 }, { "epoch": 0.46, "logps_train/chosen": -139.74484252929688, "logps_train/ref_chosen": -139.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -110.22254180908203, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.10924912989139557, "rewards_train/margins": 0.5274574607610703, "rewards_train/rejected": -0.6367065906524658, "step": 40 }, { "epoch": 0.46, "logps_train/chosen": -160.38890075683594, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -146.0457000732422, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": -0.15876291692256927, "rewards_train/margins": 0.4254945069551468, "rewards_train/rejected": -0.5842574238777161, "step": 40 }, { "epoch": 0.46, "logps_train/chosen": -153.27549743652344, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -135.46401977539062, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.12430575489997864, "rewards_train/margins": 0.5440479218959808, "rewards_train/rejected": -0.4197421669960022, "step": 40 }, { "epoch": 0.46, "logps_train/chosen": -176.70468139648438, "logps_train/ref_chosen": -178.0, "logps_train/ref_rejected": -119.0, "logps_train/rejected": -124.0498275756836, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": 0.16253966093063354, "rewards_train/margins": 0.6728933453559875, "rewards_train/rejected": -0.510353684425354, "step": 40 }, { "epoch": 0.47, "learning_rate": 4.793028253763633e-05, "loss": 0.5968, "step": 41 }, { "epoch": 0.47, "logps_train/chosen": -171.22650146484375, "logps_train/ref_chosen": -174.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -146.68588256835938, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.2716864347457886, "rewards_train/margins": 0.5631259977817535, "rewards_train/rejected": -0.29143956303596497, "step": 41 }, { "epoch": 0.47, "logps_train/chosen": -125.95775604248047, "logps_train/ref_chosen": -124.5, "logps_train/ref_rejected": -99.5, "logps_train/rejected": -107.48272705078125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.14445701241493225, "rewards_train/margins": 0.6441479623317719, "rewards_train/rejected": -0.7886049747467041, "step": 41 }, { "epoch": 0.47, "logps_train/chosen": -126.57373046875, "logps_train/ref_chosen": -128.0, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -87.17964935302734, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.19477389752864838, "rewards_train/margins": 0.7732878178358078, "rewards_train/rejected": -0.5785139203071594, "step": 41 }, { "epoch": 0.47, "logps_train/chosen": -148.91766357421875, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -95.6549072265625, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.44352418184280396, "rewards_train/margins": 0.24266964197158813, "rewards_train/rejected": -0.6861938238143921, "step": 41 }, { "epoch": 0.48, "learning_rate": 4.780484065185188e-05, "loss": 0.6466, "step": 42 }, { "epoch": 0.48, "logps_train/chosen": -115.9168930053711, "logps_train/ref_chosen": -114.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -87.66254425048828, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.21981437504291534, "rewards_train/margins": 0.599864050745964, "rewards_train/rejected": -0.8196784257888794, "step": 42 }, { "epoch": 0.48, "logps_train/chosen": -184.0168914794922, "logps_train/ref_chosen": -186.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -145.45132446289062, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.2588578462600708, "rewards_train/margins": 0.7897325158119202, "rewards_train/rejected": -0.5308746695518494, "step": 42 }, { "epoch": 0.48, "logps_train/chosen": -124.75608825683594, "logps_train/ref_chosen": -125.5, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -101.11264038085938, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": 0.04958619177341461, "rewards_train/margins": 0.9850691705942154, "rewards_train/rejected": -0.9354829788208008, "step": 42 }, { "epoch": 0.48, "logps_train/chosen": -163.1077880859375, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -129.63858032226562, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.006554238498210907, "rewards_train/margins": 0.6211441680788994, "rewards_train/rejected": -0.6145899295806885, "step": 42 }, { "epoch": 0.49, "learning_rate": 4.767588251267121e-05, "loss": 0.5648, "step": 43 }, { "epoch": 0.49, "logps_train/chosen": -151.1636962890625, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -102.26740264892578, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.17907707393169403, "rewards_train/margins": 1.0967354625463486, "rewards_train/rejected": -0.9176583886146545, "step": 43 }, { "epoch": 0.49, "logps_train/chosen": -154.32647705078125, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -118.2194595336914, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.08923967182636261, "rewards_train/margins": 0.5269937962293625, "rewards_train/rejected": -0.6162334680557251, "step": 43 }, { "epoch": 0.49, "logps_train/chosen": -132.7676544189453, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -116.9314193725586, "rewards_train/accuracies": 0.53125, "rewards_train/chosen": -0.3089926540851593, "rewards_train/margins": 0.21969613432884216, "rewards_train/rejected": -0.5286887884140015, "step": 43 }, { "epoch": 0.49, "logps_train/chosen": -158.78457641601562, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -104.1181640625, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": -0.05724158138036728, "rewards_train/margins": 0.9021327868103981, "rewards_train/rejected": -0.9593743681907654, "step": 43 }, { "epoch": 0.5, "learning_rate": 4.754342800400852e-05, "loss": 0.5647, "step": 44 }, { "epoch": 0.5, "logps_train/chosen": -145.93472290039062, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -119.5, "logps_train/rejected": -123.93000030517578, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.3344568610191345, "rewards_train/margins": 0.7847806811332703, "rewards_train/rejected": -0.45032382011413574, "step": 44 }, { "epoch": 0.5, "logps_train/chosen": -140.81419372558594, "logps_train/ref_chosen": -141.0, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -109.78305053710938, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.007155388593673706, "rewards_train/margins": 0.5099476277828217, "rewards_train/rejected": -0.502792239189148, "step": 44 }, { "epoch": 0.5, "logps_train/chosen": -151.26019287109375, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -121.91029357910156, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.026507869362831116, "rewards_train/margins": 0.7881536334753036, "rewards_train/rejected": -0.8146615028381348, "step": 44 }, { "epoch": 0.5, "logps_train/chosen": -157.45396423339844, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -120.49565887451172, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.14910393953323364, "rewards_train/margins": 0.5894901752471924, "rewards_train/rejected": -0.44038623571395874, "step": 44 }, { "epoch": 0.51, "learning_rate": 4.7407497548879384e-05, "loss": 0.5792, "step": 45 }, { "epoch": 0.51, "logps_train/chosen": -113.65975952148438, "logps_train/ref_chosen": -113.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -76.96720886230469, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.006112739443778992, "rewards_train/margins": 0.6379715949296951, "rewards_train/rejected": -0.6440843343734741, "step": 45 }, { "epoch": 0.51, "logps_train/chosen": -154.4318389892578, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -128.56004333496094, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.1915566474199295, "rewards_train/margins": 0.7697056084871292, "rewards_train/rejected": -0.5781489610671997, "step": 45 }, { "epoch": 0.51, "logps_train/chosen": -164.68267822265625, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -113.5, "logps_train/rejected": -124.21025848388672, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": -0.11348249763250351, "rewards_train/margins": 0.9802481904625893, "rewards_train/rejected": -1.0937306880950928, "step": 45 }, { "epoch": 0.51, "logps_train/chosen": -148.53305053710938, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -122.53600311279297, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.3209740221500397, "rewards_train/margins": 0.7358295619487762, "rewards_train/rejected": -0.4148555397987366, "step": 45 }, { "epoch": 0.53, "learning_rate": 4.726811210625176e-05, "loss": 0.5427, "step": 46 }, { "epoch": 0.53, "logps_train/chosen": -169.46038818359375, "logps_train/ref_chosen": -173.0, "logps_train/ref_rejected": -112.5, "logps_train/rejected": -117.86236572265625, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.38960567116737366, "rewards_train/margins": 0.9380492866039276, "rewards_train/rejected": -0.548443615436554, "step": 46 }, { "epoch": 0.53, "logps_train/chosen": -109.06082916259766, "logps_train/ref_chosen": -110.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -86.27485656738281, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": 0.10861498862504959, "rewards_train/margins": 0.8819746449589729, "rewards_train/rejected": -0.7733596563339233, "step": 46 }, { "epoch": 0.53, "logps_train/chosen": -149.625732421875, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -111.72176361083984, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.3020744025707245, "rewards_train/margins": 1.0752268731594086, "rewards_train/rejected": -0.7731524705886841, "step": 46 }, { "epoch": 0.53, "logps_train/chosen": -160.92486572265625, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -140.57608032226562, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.2807552218437195, "rewards_train/margins": 0.5618008971214294, "rewards_train/rejected": -0.28104567527770996, "step": 46 }, { "epoch": 0.54, "learning_rate": 4.7125293167814345e-05, "loss": 0.5141, "step": 47 }, { "epoch": 0.54, "logps_train/chosen": -174.048828125, "logps_train/ref_chosen": -177.0, "logps_train/ref_rejected": -146.0, "logps_train/rejected": -150.9656219482422, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.28046780824661255, "rewards_train/margins": 0.7596479058265686, "rewards_train/rejected": -0.47918009757995605, "step": 47 }, { "epoch": 0.54, "logps_train/chosen": -134.04562377929688, "logps_train/ref_chosen": -133.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -144.9838409423828, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.08473768830299377, "rewards_train/margins": 0.6593497097492218, "rewards_train/rejected": -0.7440873980522156, "step": 47 }, { "epoch": 0.54, "logps_train/chosen": -153.0111083984375, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -124.59916687011719, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.3758174479007721, "rewards_train/margins": 0.956462949514389, "rewards_train/rejected": -1.3322803974151611, "step": 47 }, { "epoch": 0.54, "logps_train/chosen": -139.15069580078125, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -112.43024444580078, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": 0.30602389574050903, "rewards_train/margins": 0.9506352543830872, "rewards_train/rejected": -0.6446113586425781, "step": 47 }, { "epoch": 0.55, "learning_rate": 4.697906275466279e-05, "loss": 0.5201, "step": 48 }, { "epoch": 0.55, "logps_train/chosen": -129.7398681640625, "logps_train/ref_chosen": -126.5, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -114.27020263671875, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.321838915348053, "rewards_train/margins": 0.39028793573379517, "rewards_train/rejected": -0.7121268510818481, "step": 48 }, { "epoch": 0.55, "logps_train/chosen": -170.0889434814453, "logps_train/ref_chosen": -174.0, "logps_train/ref_rejected": -177.0, "logps_train/rejected": -180.01104736328125, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.4118082821369171, "rewards_train/margins": 0.7699443101882935, "rewards_train/rejected": -0.35813602805137634, "step": 48 }, { "epoch": 0.55, "logps_train/chosen": -166.91763305664062, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -129.64866638183594, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.2073570191860199, "rewards_train/margins": 0.8548417389392853, "rewards_train/rejected": -0.6474847197532654, "step": 48 }, { "epoch": 0.55, "logps_train/chosen": -160.1295166015625, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -138.43392944335938, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.05831342190504074, "rewards_train/margins": 0.5032433345913887, "rewards_train/rejected": -0.5615567564964294, "step": 48 }, { "epoch": 0.56, "learning_rate": 4.68294434139043e-05, "loss": 0.6017, "step": 49 }, { "epoch": 0.56, "logps_train/chosen": -149.0851593017578, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -115.5216064453125, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": 0.5192182064056396, "rewards_train/margins": 1.6972576379776, "rewards_train/rejected": -1.1780394315719604, "step": 49 }, { "epoch": 0.56, "logps_train/chosen": -125.34487915039062, "logps_train/ref_chosen": -126.5, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -108.78245544433594, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.09363719820976257, "rewards_train/margins": 0.7584063112735748, "rewards_train/rejected": -0.6647691130638123, "step": 49 }, { "epoch": 0.56, "logps_train/chosen": -141.67967224121094, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -124.4493408203125, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.10556776821613312, "rewards_train/margins": 0.5667611807584763, "rewards_train/rejected": -0.46119341254234314, "step": 49 }, { "epoch": 0.56, "logps_train/chosen": -124.61006164550781, "logps_train/ref_chosen": -125.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -95.22357177734375, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.030595704913139343, "rewards_train/margins": 0.7752193659543991, "rewards_train/rejected": -0.7446236610412598, "step": 49 }, { "epoch": 0.57, "learning_rate": 4.667645821518111e-05, "loss": 0.4851, "step": 50 }, { "epoch": 0.57, "logps_train/chosen": -176.50062561035156, "logps_train/ref_chosen": -178.0, "logps_train/ref_rejected": -152.0, "logps_train/rejected": -155.60519409179688, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.17474240064620972, "rewards_train/margins": 0.5643627047538757, "rewards_train/rejected": -0.389620304107666, "step": 50 }, { "epoch": 0.57, "logps_train/chosen": -165.77471923828125, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -145.21810913085938, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.4006544351577759, "rewards_train/margins": 0.8921915590763092, "rewards_train/rejected": -0.4915371239185333, "step": 50 }, { "epoch": 0.57, "logps_train/chosen": -142.41026306152344, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -121.59886169433594, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.2132701277732849, "rewards_train/margins": 0.9618284106254578, "rewards_train/rejected": -0.7485582828521729, "step": 50 }, { "epoch": 0.57, "logps_train/chosen": -150.48184204101562, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -128.40724182128906, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.04735386371612549, "rewards_train/margins": 0.7860215902328491, "rewards_train/rejected": -0.8333754539489746, "step": 50 }, { "epoch": 0.58, "learning_rate": 4.65201307471134e-05, "loss": 0.5391, "step": 51 }, { "epoch": 0.58, "logps_train/chosen": -158.06637573242188, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -139.3981170654297, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.11431728303432465, "rewards_train/margins": 0.3441133350133896, "rewards_train/rejected": -0.22979605197906494, "step": 51 }, { "epoch": 0.58, "logps_train/chosen": -132.14430236816406, "logps_train/ref_chosen": -131.0, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -121.54273986816406, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.13733020424842834, "rewards_train/margins": 1.0312997996807098, "rewards_train/rejected": -1.1686300039291382, "step": 51 }, { "epoch": 0.58, "logps_train/chosen": -145.66217041015625, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -144.03201293945312, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.26250535249710083, "rewards_train/margins": 0.5208224654197693, "rewards_train/rejected": -0.7833278179168701, "step": 51 }, { "epoch": 0.58, "logps_train/chosen": -169.46548461914062, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -143.0, "logps_train/rejected": -150.265869140625, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.2237648218870163, "rewards_train/margins": 0.9571881741285324, "rewards_train/rejected": -0.7334233522415161, "step": 51 }, { "epoch": 0.59, "learning_rate": 4.6360485113662216e-05, "loss": 0.6187, "step": 52 }, { "epoch": 0.59, "logps_train/chosen": -163.33822631835938, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -145.1938018798828, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.24283689260482788, "rewards_train/margins": 0.8202248811721802, "rewards_train/rejected": -0.5773879885673523, "step": 52 }, { "epoch": 0.59, "logps_train/chosen": -181.57000732421875, "logps_train/ref_chosen": -183.0, "logps_train/ref_rejected": -127.5, "logps_train/rejected": -133.94961547851562, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.14456185698509216, "rewards_train/margins": 0.8153055608272552, "rewards_train/rejected": -0.6707437038421631, "step": 52 }, { "epoch": 0.59, "logps_train/chosen": -160.7414093017578, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -131.62620544433594, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": 0.27234378457069397, "rewards_train/margins": 1.1264202892780304, "rewards_train/rejected": -0.8540765047073364, "step": 52 }, { "epoch": 0.59, "logps_train/chosen": -129.25555419921875, "logps_train/ref_chosen": -127.5, "logps_train/ref_rejected": -124.0, "logps_train/rejected": -129.04898071289062, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": -0.16788795590400696, "rewards_train/margins": 0.3479476869106293, "rewards_train/rejected": -0.5158356428146362, "step": 52 }, { "epoch": 0.61, "learning_rate": 4.6197545930412874e-05, "loss": 0.5332, "step": 53 }, { "epoch": 0.61, "logps_train/chosen": -153.34988403320312, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -160.0, "logps_train/rejected": -164.3514862060547, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": -0.12678614258766174, "rewards_train/margins": 0.33057892322540283, "rewards_train/rejected": -0.4573650658130646, "step": 53 }, { "epoch": 0.61, "logps_train/chosen": -162.29542541503906, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -130.75132751464844, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.25033989548683167, "rewards_train/margins": 1.0030124485492706, "rewards_train/rejected": -0.752672553062439, "step": 53 }, { "epoch": 0.61, "logps_train/chosen": -138.71487426757812, "logps_train/ref_chosen": -139.0, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -101.21212768554688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.04726358875632286, "rewards_train/margins": 1.0392774604260921, "rewards_train/rejected": -0.9920138716697693, "step": 53 }, { "epoch": 0.61, "logps_train/chosen": -159.54571533203125, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -129.88558959960938, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": -0.34285280108451843, "rewards_train/margins": 0.4222678244113922, "rewards_train/rejected": -0.7651206254959106, "step": 53 }, { "epoch": 0.62, "learning_rate": 4.6031338320779534e-05, "loss": 0.6483, "step": 54 }, { "epoch": 0.62, "logps_train/chosen": -173.44290161132812, "logps_train/ref_chosen": -175.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -133.28878784179688, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.12924429774284363, "rewards_train/margins": 0.9989194571971893, "rewards_train/rejected": -0.8696751594543457, "step": 54 }, { "epoch": 0.62, "logps_train/chosen": -148.29855346679688, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -115.97592163085938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.009714528918266296, "rewards_train/margins": 0.9764523059129715, "rewards_train/rejected": -0.9861668348312378, "step": 54 }, { "epoch": 0.62, "logps_train/chosen": -163.42010498046875, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -132.7950439453125, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": 0.16990332305431366, "rewards_train/margins": 1.1158150881528854, "rewards_train/rejected": -0.9459117650985718, "step": 54 }, { "epoch": 0.62, "logps_train/chosen": -169.2532958984375, "logps_train/ref_chosen": -167.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -126.75450134277344, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.2445676475763321, "rewards_train/margins": 0.5689687579870224, "rewards_train/rejected": -0.8135364055633545, "step": 54 }, { "epoch": 0.63, "learning_rate": 4.586188791213143e-05, "loss": 0.5139, "step": 55 }, { "epoch": 0.63, "logps_train/chosen": -128.13194274902344, "logps_train/ref_chosen": -124.5, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -120.18585205078125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3469837009906769, "rewards_train/margins": 1.0525586307048798, "rewards_train/rejected": -1.3995423316955566, "step": 55 }, { "epoch": 0.63, "logps_train/chosen": -137.48464965820312, "logps_train/ref_chosen": -134.0, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -109.7799072265625, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.32219475507736206, "rewards_train/margins": 0.4786474108695984, "rewards_train/rejected": -0.8008421659469604, "step": 55 }, { "epoch": 0.63, "logps_train/chosen": -155.82305908203125, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -140.88726806640625, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.3568549156188965, "rewards_train/margins": 1.018384873867035, "rewards_train/rejected": -0.6615299582481384, "step": 55 }, { "epoch": 0.63, "logps_train/chosen": -137.8286590576172, "logps_train/ref_chosen": -135.0, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -101.40814208984375, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.3273978531360626, "rewards_train/margins": 0.5737676322460175, "rewards_train/rejected": -0.9011654853820801, "step": 55 }, { "epoch": 0.64, "learning_rate": 4.568922083184144e-05, "loss": 0.5308, "step": 56 }, { "epoch": 0.64, "logps_train/chosen": -201.31527709960938, "logps_train/ref_chosen": -204.0, "logps_train/ref_rejected": -166.0, "logps_train/rejected": -172.15939331054688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.23741737008094788, "rewards_train/margins": 0.8567750751972198, "rewards_train/rejected": -0.619357705116272, "step": 56 }, { "epoch": 0.64, "logps_train/chosen": -173.6513671875, "logps_train/ref_chosen": -174.0, "logps_train/ref_rejected": -154.0, "logps_train/rejected": -163.62249755859375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.11093807220458984, "rewards_train/margins": 1.0794376134872437, "rewards_train/rejected": -0.9684995412826538, "step": 56 }, { "epoch": 0.64, "logps_train/chosen": -170.75677490234375, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -153.0, "logps_train/rejected": -160.66265869140625, "rewards_train/accuracies": 0.53125, "rewards_train/chosen": -0.1634213924407959, "rewards_train/margins": 0.5591435432434082, "rewards_train/rejected": -0.7225649356842041, "step": 56 }, { "epoch": 0.64, "logps_train/chosen": -176.91485595703125, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -158.0, "logps_train/rejected": -165.61184692382812, "rewards_train/accuracies": 0.53125, "rewards_train/chosen": -0.07683837413787842, "rewards_train/margins": 0.6816108822822571, "rewards_train/rejected": -0.7584492564201355, "step": 56 }, { "epoch": 0.65, "learning_rate": 4.5513363703257496e-05, "loss": 0.5886, "step": 57 }, { "epoch": 0.65, "logps_train/chosen": -147.32388305664062, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -118.71624755859375, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": -0.22408804297447205, "rewards_train/margins": 0.9452424347400665, "rewards_train/rejected": -1.1693304777145386, "step": 57 }, { "epoch": 0.65, "logps_train/chosen": -149.79339599609375, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -138.41961669921875, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.2672303020954132, "rewards_train/margins": 0.8092033565044403, "rewards_train/rejected": -1.0764336585998535, "step": 57 }, { "epoch": 0.65, "logps_train/chosen": -108.96585083007812, "logps_train/ref_chosen": -104.5, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -111.37809753417969, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.4704621434211731, "rewards_train/margins": 0.4902119040489197, "rewards_train/rejected": -0.9606740474700928, "step": 57 }, { "epoch": 0.65, "logps_train/chosen": -176.698486328125, "logps_train/ref_chosen": -177.0, "logps_train/ref_rejected": -146.0, "logps_train/rejected": -154.18609619140625, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": 0.010303527116775513, "rewards_train/margins": 0.8293034136295319, "rewards_train/rejected": -0.8189998865127563, "step": 57 }, { "epoch": 0.66, "learning_rate": 4.533434364159761e-05, "loss": 0.5843, "step": 58 }, { "epoch": 0.66, "logps_train/chosen": -136.48675537109375, "logps_train/ref_chosen": -135.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -142.8067626953125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.18441638350486755, "rewards_train/margins": 0.39918962121009827, "rewards_train/rejected": -0.5836060047149658, "step": 58 }, { "epoch": 0.66, "logps_train/chosen": -184.3641357421875, "logps_train/ref_chosen": -183.0, "logps_train/ref_rejected": -146.0, "logps_train/rejected": -153.0880584716797, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.08172591775655746, "rewards_train/margins": 0.6115528717637062, "rewards_train/rejected": -0.6932787895202637, "step": 58 }, { "epoch": 0.66, "logps_train/chosen": -175.23277282714844, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -117.5, "logps_train/rejected": -124.69281005859375, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": 0.07252401113510132, "rewards_train/margins": 0.7904370427131653, "rewards_train/rejected": -0.717913031578064, "step": 58 }, { "epoch": 0.66, "logps_train/chosen": -144.44598388671875, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -113.03366088867188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.09381596744060516, "rewards_train/margins": 1.031522586941719, "rewards_train/rejected": -1.1253385543823242, "step": 58 }, { "epoch": 0.67, "learning_rate": 4.515218824976895e-05, "loss": 0.5473, "step": 59 }, { "epoch": 0.67, "logps_train/chosen": -135.3798065185547, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -120.29751586914062, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": -0.3549787104129791, "rewards_train/margins": 1.1142815053462982, "rewards_train/rejected": -1.4692602157592773, "step": 59 }, { "epoch": 0.67, "logps_train/chosen": -140.24740600585938, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -117.92471313476562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.19534514844417572, "rewards_train/margins": 1.0404373556375504, "rewards_train/rejected": -1.235782504081726, "step": 59 }, { "epoch": 0.67, "logps_train/chosen": -170.62718200683594, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -122.5, "logps_train/rejected": -133.84637451171875, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": 0.15193051099777222, "rewards_train/margins": 1.3069776892662048, "rewards_train/rejected": -1.1550471782684326, "step": 59 }, { "epoch": 0.67, "logps_train/chosen": -171.68701171875, "logps_train/ref_chosen": -173.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -140.77157592773438, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.14379817247390747, "rewards_train/margins": 0.9716156721115112, "rewards_train/rejected": -0.8278174996376038, "step": 59 }, { "epoch": 0.69, "learning_rate": 4.496692561411182e-05, "loss": 0.4245, "step": 60 }, { "epoch": 0.69, "logps_train/chosen": -145.77418518066406, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -127.12965393066406, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": -0.3304464817047119, "rewards_train/margins": 0.8365238904953003, "rewards_train/rejected": -1.1669703722000122, "step": 60 }, { "epoch": 0.69, "logps_train/chosen": -173.94161987304688, "logps_train/ref_chosen": -171.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -132.8108673095703, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": -0.3185763955116272, "rewards_train/margins": 0.8773545622825623, "rewards_train/rejected": -1.1959309577941895, "step": 60 }, { "epoch": 0.69, "logps_train/chosen": -190.071044921875, "logps_train/ref_chosen": -189.0, "logps_train/ref_rejected": -159.0, "logps_train/rejected": -168.48336791992188, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.10808217525482178, "rewards_train/margins": 0.8708211779594421, "rewards_train/rejected": -0.9789033532142639, "step": 60 }, { "epoch": 0.69, "logps_train/chosen": -121.50714111328125, "logps_train/ref_chosen": -116.5, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -117.76773071289062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5074518322944641, "rewards_train/margins": 1.0132667422294617, "rewards_train/rejected": -1.5207185745239258, "step": 60 }, { "epoch": 0.7, "learning_rate": 4.477858430006906e-05, "loss": 0.5009, "step": 61 }, { "epoch": 0.7, "logps_train/chosen": -148.4321746826172, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -122.11833953857422, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.6903851628303528, "rewards_train/margins": 0.9600227475166321, "rewards_train/rejected": -1.6504079103469849, "step": 61 }, { "epoch": 0.7, "logps_train/chosen": -167.27960205078125, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -140.42367553710938, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.3914361596107483, "rewards_train/margins": 0.738186776638031, "rewards_train/rejected": -1.1296229362487793, "step": 61 }, { "epoch": 0.7, "logps_train/chosen": -175.978271484375, "logps_train/ref_chosen": -174.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -129.72894287109375, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": -0.16933763027191162, "rewards_train/margins": 1.182854413986206, "rewards_train/rejected": -1.3521920442581177, "step": 61 }, { "epoch": 0.7, "logps_train/chosen": -153.08892822265625, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -137.57305908203125, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.2504955530166626, "rewards_train/margins": 0.9743890762329102, "rewards_train/rejected": -1.2248846292495728, "step": 61 }, { "epoch": 0.71, "learning_rate": 4.458719334778153e-05, "loss": 0.5142, "step": 62 }, { "epoch": 0.71, "logps_train/chosen": -143.49911499023438, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -147.0, "logps_train/rejected": -156.21580505371094, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": -0.044589295983314514, "rewards_train/margins": 0.8543341010808945, "rewards_train/rejected": -0.898923397064209, "step": 62 }, { "epoch": 0.71, "logps_train/chosen": -156.5689239501953, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -125.41449737548828, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.5022292137145996, "rewards_train/margins": 0.6355087757110596, "rewards_train/rejected": -1.1377379894256592, "step": 62 }, { "epoch": 0.71, "logps_train/chosen": -201.266357421875, "logps_train/ref_chosen": -196.0, "logps_train/ref_rejected": -159.0, "logps_train/rejected": -173.8751220703125, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": -0.43591195344924927, "rewards_train/margins": 1.0374639630317688, "rewards_train/rejected": -1.473375916481018, "step": 62 }, { "epoch": 0.71, "logps_train/chosen": -149.0179901123047, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -148.0, "logps_train/rejected": -170.43136596679688, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -1.208733081817627, "rewards_train/margins": 0.9972443580627441, "rewards_train/rejected": -2.205977439880371, "step": 62 }, { "epoch": 0.72, "learning_rate": 4.43927822676105e-05, "loss": 0.5598, "step": 63 }, { "epoch": 0.72, "logps_train/chosen": -126.84237670898438, "logps_train/ref_chosen": -117.0, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -115.09578704833984, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.9945892691612244, "rewards_train/margins": 0.3996577858924866, "rewards_train/rejected": -1.394247055053711, "step": 63 }, { "epoch": 0.72, "logps_train/chosen": -142.31930541992188, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -117.5, "logps_train/rejected": -130.88372802734375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.607906699180603, "rewards_train/margins": 0.7457003593444824, "rewards_train/rejected": -1.3536070585250854, "step": 63 }, { "epoch": 0.72, "logps_train/chosen": -124.56082916259766, "logps_train/ref_chosen": -115.5, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -110.76518249511719, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.923465371131897, "rewards_train/margins": 0.618140697479248, "rewards_train/rejected": -1.541606068611145, "step": 63 }, { "epoch": 0.72, "logps_train/chosen": -179.1411895751953, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -157.0, "logps_train/rejected": -171.82464599609375, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.33872923254966736, "rewards_train/margins": 1.1219574511051178, "rewards_train/rejected": -1.4606866836547852, "step": 63 }, { "epoch": 0.73, "learning_rate": 4.419538103558742e-05, "loss": 0.5848, "step": 64 }, { "epoch": 0.73, "logps_train/chosen": -168.5423583984375, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -135.0, "logps_train/rejected": -148.78189086914062, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.5333375334739685, "rewards_train/margins": 0.8630149960517883, "rewards_train/rejected": -1.3963525295257568, "step": 64 }, { "epoch": 0.73, "logps_train/chosen": -140.47198486328125, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -124.3873291015625, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -1.0393857955932617, "rewards_train/margins": 0.7289131879806519, "rewards_train/rejected": -1.7682989835739136, "step": 64 }, { "epoch": 0.73, "logps_train/chosen": -174.19720458984375, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -125.15985870361328, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": -0.5634703040122986, "rewards_train/margins": 1.0730233788490295, "rewards_train/rejected": -1.6364936828613281, "step": 64 }, { "epoch": 0.73, "logps_train/chosen": -160.61813354492188, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -143.4553680419922, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.6700178384780884, "rewards_train/margins": 0.5729802846908569, "rewards_train/rejected": -1.2429981231689453, "step": 64 }, { "epoch": 0.74, "learning_rate": 4.3995020088792e-05, "loss": 0.5701, "step": 65 }, { "epoch": 0.74, "logps_train/chosen": -188.96270751953125, "logps_train/ref_chosen": -177.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -148.6373291015625, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -1.192512035369873, "rewards_train/margins": 0.6878228187561035, "rewards_train/rejected": -1.8803348541259766, "step": 65 }, { "epoch": 0.74, "logps_train/chosen": -173.8163299560547, "logps_train/ref_chosen": -167.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -141.50582885742188, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.7177664041519165, "rewards_train/margins": 0.6095980405807495, "rewards_train/rejected": -1.327364444732666, "step": 65 }, { "epoch": 0.74, "logps_train/chosen": -156.86846923828125, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -131.81948852539062, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": -0.514484167098999, "rewards_train/margins": 1.3581879138946533, "rewards_train/rejected": -1.8726720809936523, "step": 65 }, { "epoch": 0.74, "logps_train/chosen": -153.33334350585938, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -140.69818115234375, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.8610811233520508, "rewards_train/margins": 0.6778771877288818, "rewards_train/rejected": -1.5389583110809326, "step": 65 }, { "epoch": 0.75, "learning_rate": 4.379173032065912e-05, "loss": 0.571, "step": 66 }, { "epoch": 0.75, "logps_train/chosen": -123.39452362060547, "logps_train/ref_chosen": -111.5, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -110.48931121826172, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.208837628364563, "rewards_train/margins": 0.7541805505752563, "rewards_train/rejected": -1.9630181789398193, "step": 66 }, { "epoch": 0.75, "logps_train/chosen": -149.31439208984375, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -124.77264404296875, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.7231384515762329, "rewards_train/margins": 1.0160391330718994, "rewards_train/rejected": -1.7391775846481323, "step": 66 }, { "epoch": 0.75, "logps_train/chosen": -189.39576721191406, "logps_train/ref_chosen": -180.0, "logps_train/ref_rejected": -149.0, "logps_train/rejected": -165.7838134765625, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": -0.8770769834518433, "rewards_train/margins": 0.7735692262649536, "rewards_train/rejected": -1.6506462097167969, "step": 66 }, { "epoch": 0.75, "logps_train/chosen": -172.92323303222656, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -151.0, "logps_train/rejected": -169.48965454101562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6337293982505798, "rewards_train/margins": 1.203711450099945, "rewards_train/rejected": -1.837440848350525, "step": 66 }, { "epoch": 0.77, "learning_rate": 4.358554307621541e-05, "loss": 0.519, "step": 67 }, { "epoch": 0.77, "logps_train/chosen": -106.08445739746094, "logps_train/ref_chosen": -95.5, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -104.09698486328125, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -1.045597791671753, "rewards_train/margins": 0.9242256879806519, "rewards_train/rejected": -1.9698234796524048, "step": 67 }, { "epoch": 0.77, "logps_train/chosen": -157.91793823242188, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -118.0, "logps_train/rejected": -139.233154296875, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -1.0456013679504395, "rewards_train/margins": 1.0714645385742188, "rewards_train/rejected": -2.117065906524658, "step": 67 }, { "epoch": 0.77, "logps_train/chosen": -191.77606201171875, "logps_train/ref_chosen": -187.0, "logps_train/ref_rejected": -145.0, "logps_train/rejected": -159.0693359375, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": -0.44342678785324097, "rewards_train/margins": 1.00628000497818, "rewards_train/rejected": -1.449706792831421, "step": 67 }, { "epoch": 0.77, "logps_train/chosen": -136.33993530273438, "logps_train/ref_chosen": -125.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -121.84136199951172, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": -1.1351652145385742, "rewards_train/margins": 0.6773405075073242, "rewards_train/rejected": -1.8125057220458984, "step": 67 }, { "epoch": 0.78, "learning_rate": 4.337649014724621e-05, "loss": 0.5137, "step": 68 }, { "epoch": 0.78, "logps_train/chosen": -163.60760498046875, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -143.341064453125, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.8888216018676758, "rewards_train/margins": 0.49487829208374023, "rewards_train/rejected": -1.383699893951416, "step": 68 }, { "epoch": 0.78, "logps_train/chosen": -172.0780029296875, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -118.73085021972656, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": -0.7458871603012085, "rewards_train/margins": 0.9197756052017212, "rewards_train/rejected": -1.6656627655029297, "step": 68 }, { "epoch": 0.78, "logps_train/chosen": -135.83432006835938, "logps_train/ref_chosen": -126.5, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -148.3021697998047, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.9420253038406372, "rewards_train/margins": 0.8683685064315796, "rewards_train/rejected": -1.8103938102722168, "step": 68 }, { "epoch": 0.78, "logps_train/chosen": -156.50640869140625, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -119.85636138916016, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.0331604480743408, "rewards_train/margins": 0.8918311595916748, "rewards_train/rejected": -1.9249916076660156, "step": 68 }, { "epoch": 0.79, "learning_rate": 4.31646037673936e-05, "loss": 0.6014, "step": 69 }, { "epoch": 0.79, "logps_train/chosen": -109.36697387695312, "logps_train/ref_chosen": -99.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -90.85383605957031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0384798049926758, "rewards_train/margins": 1.0995402336120605, "rewards_train/rejected": -2.1380200386047363, "step": 69 }, { "epoch": 0.79, "logps_train/chosen": -164.5358428955078, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -157.33303833007812, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -1.1899994611740112, "rewards_train/margins": 0.6788498163223267, "rewards_train/rejected": -1.868849277496338, "step": 69 }, { "epoch": 0.79, "logps_train/chosen": -172.2188720703125, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -148.18170166015625, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.46329355239868164, "rewards_train/margins": 1.061615228652954, "rewards_train/rejected": -1.5249087810516357, "step": 69 }, { "epoch": 0.79, "logps_train/chosen": -157.74252319335938, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -144.2315673828125, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.8678563833236694, "rewards_train/margins": 0.7282001972198486, "rewards_train/rejected": -1.596056580543518, "step": 69 }, { "epoch": 0.8, "learning_rate": 4.2949916607186357e-05, "loss": 0.5289, "step": 70 }, { "epoch": 0.8, "logps_train/chosen": -151.2384033203125, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -117.5, "logps_train/rejected": -136.65538024902344, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": -0.9337508678436279, "rewards_train/margins": 0.9835443496704102, "rewards_train/rejected": -1.917295217514038, "step": 70 }, { "epoch": 0.8, "logps_train/chosen": -164.0899658203125, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -119.64618682861328, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": -0.9898558855056763, "rewards_train/margins": 1.0959542989730835, "rewards_train/rejected": -2.0858101844787598, "step": 70 }, { "epoch": 0.8, "logps_train/chosen": -158.28138732910156, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -107.51951599121094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7542130947113037, "rewards_train/margins": 1.254964828491211, "rewards_train/rejected": -2.0091779232025146, "step": 70 }, { "epoch": 0.8, "logps_train/chosen": -158.1058349609375, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -135.90130615234375, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -1.465502142906189, "rewards_train/margins": 0.7091010808944702, "rewards_train/rejected": -2.174603223800659, "step": 70 }, { "epoch": 0.81, "learning_rate": 4.273246176900252e-05, "loss": 0.5182, "step": 71 }, { "epoch": 0.81, "logps_train/chosen": -180.446533203125, "logps_train/ref_chosen": -173.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -157.4761962890625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.7675294876098633, "rewards_train/margins": 0.56602942943573, "rewards_train/rejected": -1.3335589170455933, "step": 71 }, { "epoch": 0.81, "logps_train/chosen": -166.73056030273438, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -143.46798706054688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6558671593666077, "rewards_train/margins": 1.4000133872032166, "rewards_train/rejected": -2.055880546569824, "step": 71 }, { "epoch": 0.81, "logps_train/chosen": -188.02003479003906, "logps_train/ref_chosen": -181.0, "logps_train/ref_rejected": -149.0, "logps_train/rejected": -167.78900146484375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7188003063201904, "rewards_train/margins": 1.1408617496490479, "rewards_train/rejected": -1.8596620559692383, "step": 71 }, { "epoch": 0.81, "logps_train/chosen": -144.71981811523438, "logps_train/ref_chosen": -135.0, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -128.44407653808594, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -1.008699893951416, "rewards_train/margins": 0.8348284959793091, "rewards_train/rejected": -1.843528389930725, "step": 71 }, { "epoch": 0.82, "learning_rate": 4.251227278196536e-05, "loss": 0.4895, "step": 72 }, { "epoch": 0.82, "logps_train/chosen": -132.26991271972656, "logps_train/ref_chosen": -126.5, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -113.90031433105469, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": -0.6027978658676147, "rewards_train/margins": 1.070241093635559, "rewards_train/rejected": -1.6730389595031738, "step": 72 }, { "epoch": 0.82, "logps_train/chosen": -156.4181671142578, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -129.92678833007812, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.6840531826019287, "rewards_train/margins": 0.7702467441558838, "rewards_train/rejected": -1.4542999267578125, "step": 72 }, { "epoch": 0.82, "logps_train/chosen": -137.28082275390625, "logps_train/ref_chosen": -128.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -150.2662353515625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9548407793045044, "rewards_train/margins": 0.8424848318099976, "rewards_train/rejected": -1.797325611114502, "step": 72 }, { "epoch": 0.82, "logps_train/chosen": -159.76388549804688, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -127.27428436279297, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3386008143424988, "rewards_train/margins": 1.3924408555030823, "rewards_train/rejected": -1.731041669845581, "step": 72 }, { "epoch": 0.83, "learning_rate": 4.228938359677354e-05, "loss": 0.4813, "step": 73 }, { "epoch": 0.83, "logps_train/chosen": -158.0514373779297, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -126.70331573486328, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8166680335998535, "rewards_train/margins": 0.6750500202178955, "rewards_train/rejected": -1.491718053817749, "step": 73 }, { "epoch": 0.83, "logps_train/chosen": -167.51153564453125, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -142.0, "logps_train/rejected": -154.432861328125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.33699363470077515, "rewards_train/margins": 0.9434502720832825, "rewards_train/rejected": -1.2804439067840576, "step": 73 }, { "epoch": 0.83, "logps_train/chosen": -197.72607421875, "logps_train/ref_chosen": -196.0, "logps_train/ref_rejected": -149.0, "logps_train/rejected": -160.88992309570312, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.23745089769363403, "rewards_train/margins": 0.9035444855690002, "rewards_train/rejected": -1.1409953832626343, "step": 73 }, { "epoch": 0.83, "logps_train/chosen": -149.27923583984375, "logps_train/ref_chosen": -141.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -128.9501953125, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": -0.7999448180198669, "rewards_train/margins": 1.0314993262290955, "rewards_train/rejected": -1.8314441442489624, "step": 73 }, { "epoch": 0.85, "learning_rate": 4.206382858046636e-05, "loss": 0.5071, "step": 74 }, { "epoch": 0.85, "logps_train/chosen": -176.96607971191406, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -150.25148010253906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6526632308959961, "rewards_train/margins": 0.5287351608276367, "rewards_train/rejected": -1.1813983917236328, "step": 74 }, { "epoch": 0.85, "logps_train/chosen": -202.90512084960938, "logps_train/ref_chosen": -201.0, "logps_train/ref_rejected": -142.0, "logps_train/rejected": -153.5736083984375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.19715136289596558, "rewards_train/margins": 0.9527884125709534, "rewards_train/rejected": -1.149939775466919, "step": 74 }, { "epoch": 0.85, "logps_train/chosen": -149.58419799804688, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -117.5, "logps_train/rejected": -133.56790161132812, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.5854701995849609, "rewards_train/margins": 1.0398268699645996, "rewards_train/rejected": -1.6252970695495605, "step": 74 }, { "epoch": 0.85, "logps_train/chosen": -132.50787353515625, "logps_train/ref_chosen": -124.0, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -119.30223083496094, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.8568595051765442, "rewards_train/margins": 0.9427974820137024, "rewards_train/rejected": -1.7996569871902466, "step": 74 }, { "epoch": 0.86, "learning_rate": 4.1835642511124656e-05, "loss": 0.5643, "step": 75 }, { "epoch": 0.86, "logps_train/chosen": -158.19993591308594, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -112.5, "logps_train/rejected": -128.560546875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4262433648109436, "rewards_train/margins": 1.189382255077362, "rewards_train/rejected": -1.6156256198883057, "step": 75 }, { "epoch": 0.86, "logps_train/chosen": -188.24725341796875, "logps_train/ref_chosen": -187.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -144.20211791992188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.1420358419418335, "rewards_train/margins": 0.9777122735977173, "rewards_train/rejected": -1.1197481155395508, "step": 75 }, { "epoch": 0.86, "logps_train/chosen": -137.14773559570312, "logps_train/ref_chosen": -128.0, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -112.96627807617188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8436791896820068, "rewards_train/margins": 0.7864446640014648, "rewards_train/rejected": -1.6301238536834717, "step": 75 }, { "epoch": 0.86, "logps_train/chosen": -174.13377380371094, "logps_train/ref_chosen": -171.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -139.75009155273438, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.310349702835083, "rewards_train/margins": 1.0647573471069336, "rewards_train/rejected": -1.3751070499420166, "step": 75 }, { "epoch": 0.87, "learning_rate": 4.160486057250849e-05, "loss": 0.4979, "step": 76 }, { "epoch": 0.87, "logps_train/chosen": -142.59698486328125, "logps_train/ref_chosen": -139.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -129.82534790039062, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.3968072235584259, "rewards_train/margins": 0.733596533536911, "rewards_train/rejected": -1.130403757095337, "step": 76 }, { "epoch": 0.87, "logps_train/chosen": -161.75323486328125, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -125.47662353515625, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": -1.1422175168991089, "rewards_train/margins": 0.11257338523864746, "rewards_train/rejected": -1.2547909021377563, "step": 76 }, { "epoch": 0.87, "logps_train/chosen": -178.9916229248047, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -132.39141845703125, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.31464019417762756, "rewards_train/margins": 0.715956062078476, "rewards_train/rejected": -1.0305962562561035, "step": 76 }, { "epoch": 0.87, "logps_train/chosen": -152.24346923828125, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -127.48640441894531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2509094476699829, "rewards_train/margins": 1.095436692237854, "rewards_train/rejected": -1.346346139907837, "step": 76 }, { "epoch": 0.88, "learning_rate": 4.137151834863213e-05, "loss": 0.6958, "step": 77 }, { "epoch": 0.88, "logps_train/chosen": -148.580078125, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -148.0, "logps_train/rejected": -157.56333923339844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.48818454146385193, "rewards_train/margins": 0.4652206003665924, "rewards_train/rejected": -0.9534051418304443, "step": 77 }, { "epoch": 0.88, "logps_train/chosen": -166.72164916992188, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -147.3184356689453, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.47060221433639526, "rewards_train/margins": 0.5743276476860046, "rewards_train/rejected": -1.0449298620224, "step": 77 }, { "epoch": 0.88, "logps_train/chosen": -143.37298583984375, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -130.98110961914062, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": -0.5442638397216797, "rewards_train/margins": 1.0199611186981201, "rewards_train/rejected": -1.5642249584197998, "step": 77 }, { "epoch": 0.88, "logps_train/chosen": -155.43408203125, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -129.516845703125, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": -0.3463389575481415, "rewards_train/margins": 0.9465561211109161, "rewards_train/rejected": -1.2928950786590576, "step": 77 }, { "epoch": 0.89, "learning_rate": 4.1135651818277445e-05, "loss": 0.5414, "step": 78 }, { "epoch": 0.89, "logps_train/chosen": -154.96820068359375, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -119.0, "logps_train/rejected": -132.89260864257812, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": -0.20873412489891052, "rewards_train/margins": 1.1599203646183014, "rewards_train/rejected": -1.368654489517212, "step": 78 }, { "epoch": 0.89, "logps_train/chosen": -144.24415588378906, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -130.75668334960938, "rewards_train/accuracies": 0.53125, "rewards_train/chosen": -0.38398557901382446, "rewards_train/margins": 0.327619731426239, "rewards_train/rejected": -0.7116053104400635, "step": 78 }, { "epoch": 0.89, "logps_train/chosen": -139.5198974609375, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -116.1756591796875, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.28363025188446045, "rewards_train/margins": 0.8827880620956421, "rewards_train/rejected": -1.1664183139801025, "step": 78 }, { "epoch": 0.89, "logps_train/chosen": -180.26397705078125, "logps_train/ref_chosen": -178.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -149.57452392578125, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.1939750611782074, "rewards_train/margins": 1.027538686990738, "rewards_train/rejected": -1.2215137481689453, "step": 78 }, { "epoch": 0.9, "learning_rate": 4.089729734944634e-05, "loss": 0.544, "step": 79 }, { "epoch": 0.9, "logps_train/chosen": -143.99681091308594, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -153.2803497314453, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": -0.678196370601654, "rewards_train/margins": 0.220004141330719, "rewards_train/rejected": -0.898200511932373, "step": 79 }, { "epoch": 0.9, "logps_train/chosen": -158.53390502929688, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -146.01658630371094, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.03825424611568451, "rewards_train/margins": 0.9192156940698624, "rewards_train/rejected": -0.9574699401855469, "step": 79 }, { "epoch": 0.9, "logps_train/chosen": -156.74154663085938, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -150.85284423828125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.48094165325164795, "rewards_train/margins": 0.6565214395523071, "rewards_train/rejected": -1.137463092803955, "step": 79 }, { "epoch": 0.9, "logps_train/chosen": -183.3664093017578, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -156.0, "logps_train/rejected": -164.84950256347656, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.10578218847513199, "rewards_train/margins": 0.8176443204283714, "rewards_train/rejected": -0.9234265089035034, "step": 79 }, { "epoch": 0.91, "learning_rate": 4.065649169375324e-05, "loss": 0.6275, "step": 80 }, { "epoch": 0.91, "logps_train/chosen": -161.34866333007812, "logps_train/ref_chosen": -161.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -130.3014678955078, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": -0.00708414614200592, "rewards_train/margins": 1.3033236116170883, "rewards_train/rejected": -1.3104077577590942, "step": 80 }, { "epoch": 0.91, "logps_train/chosen": -196.28811645507812, "logps_train/ref_chosen": -194.0, "logps_train/ref_rejected": -152.0, "logps_train/rejected": -164.7366943359375, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.1925809383392334, "rewards_train/margins": 1.0608729124069214, "rewards_train/rejected": -1.2534538507461548, "step": 80 }, { "epoch": 0.91, "logps_train/chosen": -143.91098022460938, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -137.1011962890625, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.027426350861787796, "rewards_train/margins": 0.7220006696879864, "rewards_train/rejected": -0.7494270205497742, "step": 80 }, { "epoch": 0.91, "logps_train/chosen": -186.50070190429688, "logps_train/ref_chosen": -188.0, "logps_train/ref_rejected": -127.0, "logps_train/rejected": -138.23883056640625, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": 0.17532047629356384, "rewards_train/margins": 1.30330428481102, "rewards_train/rejected": -1.127983808517456, "step": 80 }, { "epoch": 0.93, "learning_rate": 4.041327198075838e-05, "loss": 0.4788, "step": 81 }, { "epoch": 0.93, "logps_train/chosen": -128.11949157714844, "logps_train/ref_chosen": -128.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -107.81585693359375, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": -0.03633883595466614, "rewards_train/margins": 1.1293288171291351, "rewards_train/rejected": -1.1656676530838013, "step": 81 }, { "epoch": 0.93, "logps_train/chosen": -165.39813232421875, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -141.9514617919922, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": 0.31048035621643066, "rewards_train/margins": 1.1549919247627258, "rewards_train/rejected": -0.8445115685462952, "step": 81 }, { "epoch": 0.93, "logps_train/chosen": -126.2926025390625, "logps_train/ref_chosen": -124.5, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -113.79829406738281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.19713124632835388, "rewards_train/margins": 0.9133626520633698, "rewards_train/rejected": -1.1104938983917236, "step": 81 }, { "epoch": 0.93, "logps_train/chosen": -187.25137329101562, "logps_train/ref_chosen": -187.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -142.0508575439453, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": -0.020840942859649658, "rewards_train/margins": 1.0018226504325867, "rewards_train/rejected": -1.0226635932922363, "step": 81 }, { "epoch": 0.94, "learning_rate": 4.016767571224284e-05, "loss": 0.4871, "step": 82 }, { "epoch": 0.94, "logps_train/chosen": -136.57688903808594, "logps_train/ref_chosen": -131.0, "logps_train/ref_rejected": -91.5, "logps_train/rejected": -102.67213439941406, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": -0.5452377796173096, "rewards_train/margins": 0.577444314956665, "rewards_train/rejected": -1.1226820945739746, "step": 82 }, { "epoch": 0.94, "logps_train/chosen": -162.8528289794922, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -123.56939697265625, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": -0.4730757772922516, "rewards_train/margins": 0.4601331651210785, "rewards_train/rejected": -0.9332089424133301, "step": 82 }, { "epoch": 0.94, "logps_train/chosen": -152.0038299560547, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -115.0, "logps_train/rejected": -127.97988891601562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.0006765499711036682, "rewards_train/margins": 1.3138957843184471, "rewards_train/rejected": -1.3145723342895508, "step": 82 }, { "epoch": 0.94, "logps_train/chosen": -141.26974487304688, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -124.5, "logps_train/rejected": -132.22300720214844, "rewards_train/accuracies": 0.46875, "rewards_train/chosen": -0.452718585729599, "rewards_train/margins": 0.32729652523994446, "rewards_train/rejected": -0.7800151109695435, "step": 82 }, { "epoch": 0.95, "learning_rate": 3.991974075642621e-05, "loss": 0.5842, "step": 83 }, { "epoch": 0.95, "logps_train/chosen": -144.98745727539062, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -119.5, "logps_train/rejected": -131.68789672851562, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.15765535831451416, "rewards_train/margins": 1.060597538948059, "rewards_train/rejected": -1.2182528972625732, "step": 83 }, { "epoch": 0.95, "logps_train/chosen": -159.57968139648438, "logps_train/ref_chosen": -157.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -131.57052612304688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.22476573288440704, "rewards_train/margins": 1.0797476023435593, "rewards_train/rejected": -1.3045133352279663, "step": 83 }, { "epoch": 0.95, "logps_train/chosen": -138.2335968017578, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -133.75772094726562, "rewards_train/accuracies": 0.8125, "rewards_train/chosen": -0.14119389653205872, "rewards_train/margins": 1.2634479701519012, "rewards_train/rejected": -1.40464186668396, "step": 83 }, { "epoch": 0.95, "logps_train/chosen": -145.84381103515625, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -147.16595458984375, "rewards_train/accuracies": 0.65625, "rewards_train/chosen": -0.2642488479614258, "rewards_train/margins": 0.4790076017379761, "rewards_train/rejected": -0.7432564496994019, "step": 83 }, { "epoch": 0.96, "learning_rate": 3.96695053421277e-05, "loss": 0.4781, "step": 84 }, { "epoch": 0.96, "logps_train/chosen": -183.54966735839844, "logps_train/ref_chosen": -177.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -143.7237548828125, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.6588726043701172, "rewards_train/margins": 0.7602804899215698, "rewards_train/rejected": -1.419153094291687, "step": 84 }, { "epoch": 0.96, "logps_train/chosen": -118.30413055419922, "logps_train/ref_chosen": -113.5, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -104.0281982421875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.46830376982688904, "rewards_train/margins": 0.7527767717838287, "rewards_train/rejected": -1.2210805416107178, "step": 84 }, { "epoch": 0.96, "logps_train/chosen": -142.31927490234375, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -121.17881774902344, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.44811493158340454, "rewards_train/margins": 0.5898844599723816, "rewards_train/rejected": -1.0379993915557861, "step": 84 }, { "epoch": 0.96, "logps_train/chosen": -162.7830047607422, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -145.0, "logps_train/rejected": -154.27557373046875, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.023749448359012604, "rewards_train/margins": 0.9083372130990028, "rewards_train/rejected": -0.8845877647399902, "step": 84 }, { "epoch": 0.97, "learning_rate": 3.941700805287168e-05, "loss": 0.5488, "step": 85 }, { "epoch": 0.97, "logps_train/chosen": -145.57501220703125, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -146.69969177246094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.0080266073346138, "rewards_train/margins": 0.8595626428723335, "rewards_train/rejected": -0.8515360355377197, "step": 85 }, { "epoch": 0.97, "logps_train/chosen": -157.25, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -135.0, "logps_train/rejected": -141.33470153808594, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.25292864441871643, "rewards_train/margins": 0.8953824937343597, "rewards_train/rejected": -0.6424538493156433, "step": 85 }, { "epoch": 0.97, "logps_train/chosen": -156.912109375, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -123.60111999511719, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.2099611759185791, "rewards_train/margins": 0.9257863759994507, "rewards_train/rejected": -0.7158252000808716, "step": 85 }, { "epoch": 0.97, "logps_train/chosen": -148.09971618652344, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -110.16642761230469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.032508380711078644, "rewards_train/margins": 1.2118464782834053, "rewards_train/rejected": -1.1793380975723267, "step": 85 }, { "epoch": 0.98, "learning_rate": 3.916228782093857e-05, "loss": 0.4959, "step": 86 }, { "epoch": 0.98, "logps_train/chosen": -134.76730346679688, "logps_train/ref_chosen": -134.0, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -116.19819641113281, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": -0.11325030773878098, "rewards_train/margins": 0.5541764572262764, "rewards_train/rejected": -0.6674267649650574, "step": 86 }, { "epoch": 0.98, "logps_train/chosen": -177.84475708007812, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -145.89642333984375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.3474574685096741, "rewards_train/margins": 1.1841707825660706, "rewards_train/rejected": -0.8367133140563965, "step": 86 }, { "epoch": 0.98, "logps_train/chosen": -162.54910278320312, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -118.0, "logps_train/rejected": -128.0146484375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.29553040862083435, "rewards_train/margins": 1.3141840398311615, "rewards_train/rejected": -1.0186536312103271, "step": 86 }, { "epoch": 0.98, "logps_train/chosen": -159.21815490722656, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -137.9909210205078, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.2795528173446655, "rewards_train/margins": 0.6489570438861847, "rewards_train/rejected": -0.36940422654151917, "step": 86 }, { "epoch": 0.99, "learning_rate": 3.890538392136188e-05, "loss": 0.5114, "step": 87 }, { "epoch": 0.99, "logps_train/chosen": -156.0184326171875, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -113.9042739868164, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": -0.060827575623989105, "rewards_train/margins": 0.9319678917527199, "rewards_train/rejected": -0.992795467376709, "step": 87 }, { "epoch": 0.99, "logps_train/chosen": -153.46629333496094, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -147.35433959960938, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": 0.40214991569519043, "rewards_train/margins": 1.7021600008010864, "rewards_train/rejected": -1.300010085105896, "step": 87 }, { "epoch": 0.99, "logps_train/chosen": -137.97434997558594, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -142.22134399414062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.3973763883113861, "rewards_train/margins": 1.8008342683315277, "rewards_train/rejected": -1.4034578800201416, "step": 87 }, { "epoch": 0.99, "logps_train/chosen": -149.7021026611328, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -150.05059814453125, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": 0.41885271668434143, "rewards_train/margins": 2.0571157038211823, "rewards_train/rejected": -1.6382629871368408, "step": 87 }, { "epoch": 1.01, "learning_rate": 3.8646335965872414e-05, "loss": 0.3438, "step": 88 }, { "epoch": 1.01, "logps_train/chosen": -133.6719207763672, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -138.8072509765625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.3787682056427002, "rewards_train/margins": 1.741170048713684, "rewards_train/rejected": -1.3624018430709839, "step": 88 }, { "epoch": 1.01, "logps_train/chosen": -175.55409240722656, "logps_train/ref_chosen": -185.0, "logps_train/ref_rejected": -160.0, "logps_train/rejected": -170.5850067138672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9660758972167969, "rewards_train/margins": 2.0684245824813843, "rewards_train/rejected": -1.1023486852645874, "step": 88 }, { "epoch": 1.01, "logps_train/chosen": -130.45492553710938, "logps_train/ref_chosen": -135.0, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -122.48587799072266, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": 0.4400542378425598, "rewards_train/margins": 2.027607262134552, "rewards_train/rejected": -1.5875530242919922, "step": 88 }, { "epoch": 1.01, "logps_train/chosen": -139.69839477539062, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -145.28656005859375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": 0.48426252603530884, "rewards_train/margins": 2.070731222629547, "rewards_train/rejected": -1.5864686965942383, "step": 88 }, { "epoch": 1.02, "learning_rate": 3.838518389679065e-05, "loss": 0.223, "step": 89 }, { "epoch": 1.02, "logps_train/chosen": -135.4624786376953, "logps_train/ref_chosen": -131.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -144.20643615722656, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -0.40640440583229065, "rewards_train/margins": 1.82263645529747, "rewards_train/rejected": -2.2290408611297607, "step": 89 }, { "epoch": 1.02, "logps_train/chosen": -151.22457885742188, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -158.74038696289062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1002472639083862, "rewards_train/margins": 2.5626660585403442, "rewards_train/rejected": -1.462418794631958, "step": 89 }, { "epoch": 1.02, "logps_train/chosen": -152.0464324951172, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -123.42019653320312, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": 1.0309046506881714, "rewards_train/margins": 2.756957173347473, "rewards_train/rejected": -1.7260525226593018, "step": 89 }, { "epoch": 1.02, "logps_train/chosen": -142.5914764404297, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -130.45376586914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0303058624267578, "rewards_train/margins": 2.694628357887268, "rewards_train/rejected": -1.6643224954605103, "step": 89 }, { "epoch": 1.03, "learning_rate": 3.812196798086799e-05, "loss": 0.2086, "step": 90 }, { "epoch": 1.03, "logps_train/chosen": -122.85346221923828, "logps_train/ref_chosen": -127.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -120.95262908935547, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": 0.44590410590171814, "rewards_train/margins": 2.2182177007198334, "rewards_train/rejected": -1.7723135948181152, "step": 90 }, { "epoch": 1.03, "logps_train/chosen": -135.0833282470703, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -148.67274475097656, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": 0.5127615928649902, "rewards_train/margins": 1.8486876487731934, "rewards_train/rejected": -1.3359260559082031, "step": 90 }, { "epoch": 1.03, "logps_train/chosen": -127.56909942626953, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -137.51315307617188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.8834019899368286, "rewards_train/margins": 2.3031264543533325, "rewards_train/rejected": -1.419724464416504, "step": 90 }, { "epoch": 1.03, "logps_train/chosen": -157.03421020507812, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -134.88729858398438, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.7317352294921875, "rewards_train/margins": 2.5376535654067993, "rewards_train/rejected": -1.8059183359146118, "step": 90 }, { "epoch": 1.04, "learning_rate": 3.785672880307817e-05, "loss": 0.2009, "step": 91 }, { "epoch": 1.04, "logps_train/chosen": -175.03244018554688, "logps_train/ref_chosen": -183.0, "logps_train/ref_rejected": -142.0, "logps_train/rejected": -156.64846801757812, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": 0.8252712488174438, "rewards_train/margins": 2.3022282123565674, "rewards_train/rejected": -1.4769569635391235, "step": 91 }, { "epoch": 1.04, "logps_train/chosen": -134.37451171875, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -126.92540740966797, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.2630377411842346, "rewards_train/margins": 1.791417419910431, "rewards_train/rejected": -1.5283796787261963, "step": 91 }, { "epoch": 1.04, "logps_train/chosen": -122.10877227783203, "logps_train/ref_chosen": -121.0, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -109.1966552734375, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -0.07845546305179596, "rewards_train/margins": 1.5968745797872543, "rewards_train/rejected": -1.6753300428390503, "step": 91 }, { "epoch": 1.04, "logps_train/chosen": -175.49851989746094, "logps_train/ref_chosen": -181.0, "logps_train/ref_rejected": -135.0, "logps_train/rejected": -151.86138916015625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.5926284790039062, "rewards_train/margins": 2.2545496225357056, "rewards_train/rejected": -1.6619211435317993, "step": 91 }, { "epoch": 1.05, "learning_rate": 3.7589507260359404e-05, "loss": 0.2275, "step": 92 }, { "epoch": 1.05, "logps_train/chosen": -149.17735290527344, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -150.0, "logps_train/rejected": -168.39157104492188, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": 0.5334359407424927, "rewards_train/margins": 2.4423190355300903, "rewards_train/rejected": -1.9088830947875977, "step": 92 }, { "epoch": 1.05, "logps_train/chosen": -128.9286346435547, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -155.26699829101562, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.3554763197898865, "rewards_train/margins": 2.472897946834564, "rewards_train/rejected": -2.1174216270446777, "step": 92 }, { "epoch": 1.05, "logps_train/chosen": -126.43780517578125, "logps_train/ref_chosen": -129.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -122.29857635498047, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.2506530284881592, "rewards_train/margins": 2.3148858547210693, "rewards_train/rejected": -2.06423282623291, "step": 92 }, { "epoch": 1.05, "logps_train/chosen": -155.54661560058594, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -121.3519515991211, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8316656351089478, "rewards_train/margins": 2.5136386156082153, "rewards_train/rejected": -1.6819729804992676, "step": 92 }, { "epoch": 1.06, "learning_rate": 3.732034455530863e-05, "loss": 0.1687, "step": 93 }, { "epoch": 1.06, "logps_train/chosen": -180.7027587890625, "logps_train/ref_chosen": -190.0, "logps_train/ref_rejected": -171.0, "logps_train/rejected": -185.62432861328125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.9221908450126648, "rewards_train/margins": 2.366423189640045, "rewards_train/rejected": -1.4442323446273804, "step": 93 }, { "epoch": 1.06, "logps_train/chosen": -174.7982940673828, "logps_train/ref_chosen": -183.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -162.9814453125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.7939980030059814, "rewards_train/margins": 2.73032546043396, "rewards_train/rejected": -1.9363274574279785, "step": 93 }, { "epoch": 1.06, "logps_train/chosen": -133.31781005859375, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -116.48727416992188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.2709043323993683, "rewards_train/margins": 2.5305700600147247, "rewards_train/rejected": -2.2596657276153564, "step": 93 }, { "epoch": 1.06, "logps_train/chosen": -156.94393920898438, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -120.5, "logps_train/rejected": -139.75140380859375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": 0.225722536444664, "rewards_train/margins": 2.1422575563192368, "rewards_train/rejected": -1.9165350198745728, "step": 93 }, { "epoch": 1.07, "learning_rate": 3.704928218982845e-05, "loss": 0.1829, "step": 94 }, { "epoch": 1.07, "logps_train/chosen": -139.14205932617188, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -113.5, "logps_train/rejected": -131.75674438476562, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": 0.3425329923629761, "rewards_train/margins": 2.1845154762268066, "rewards_train/rejected": -1.8419824838638306, "step": 94 }, { "epoch": 1.07, "logps_train/chosen": -145.99407958984375, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -156.46421813964844, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": 0.3601621985435486, "rewards_train/margins": 2.0587562918663025, "rewards_train/rejected": -1.698594093322754, "step": 94 }, { "epoch": 1.07, "logps_train/chosen": -125.1644287109375, "logps_train/ref_chosen": -124.0, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -120.51481628417969, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -0.10804449021816254, "rewards_train/margins": 2.082010880112648, "rewards_train/rejected": -2.1900553703308105, "step": 94 }, { "epoch": 1.07, "logps_train/chosen": -156.272216796875, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -144.20132446289062, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.561450183391571, "rewards_train/margins": 2.499405324459076, "rewards_train/rejected": -1.9379551410675049, "step": 94 }, { "epoch": 1.09, "learning_rate": 3.677636195872802e-05, "loss": 0.2071, "step": 95 }, { "epoch": 1.09, "logps_train/chosen": -165.90792846679688, "logps_train/ref_chosen": -173.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -166.0856170654297, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": 0.7180935144424438, "rewards_train/margins": 2.9360300302505493, "rewards_train/rejected": -2.2179365158081055, "step": 95 }, { "epoch": 1.09, "logps_train/chosen": -143.52719116210938, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -130.509521484375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.600570797920227, "rewards_train/margins": 2.178702235221863, "rewards_train/rejected": -2.77927303314209, "step": 95 }, { "epoch": 1.09, "logps_train/chosen": -168.9429931640625, "logps_train/ref_chosen": -175.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -131.9114532470703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6327519416809082, "rewards_train/margins": 3.1334948539733887, "rewards_train/rejected": -2.5007429122924805, "step": 95 }, { "epoch": 1.09, "logps_train/chosen": -161.4586944580078, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -135.01678466796875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.04224124550819397, "rewards_train/margins": 2.7115966379642487, "rewards_train/rejected": -2.6693553924560547, "step": 95 }, { "epoch": 1.1, "learning_rate": 3.6501625943278805e-05, "loss": 0.1683, "step": 96 }, { "epoch": 1.1, "logps_train/chosen": -166.95816040039062, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -142.0, "logps_train/rejected": -168.36917114257812, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.10113706439733505, "rewards_train/margins": 2.489418126642704, "rewards_train/rejected": -2.590555191040039, "step": 96 }, { "epoch": 1.1, "logps_train/chosen": -159.73220825195312, "logps_train/ref_chosen": -161.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -146.8890380859375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": 0.12797528505325317, "rewards_train/margins": 2.4644392132759094, "rewards_train/rejected": -2.3364639282226562, "step": 96 }, { "epoch": 1.1, "logps_train/chosen": -141.54803466796875, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -137.75836181640625, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.1825377345085144, "rewards_train/margins": 2.6527701020240784, "rewards_train/rejected": -2.8353078365325928, "step": 96 }, { "epoch": 1.1, "logps_train/chosen": -167.2300262451172, "logps_train/ref_chosen": -167.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -154.9856414794922, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": 0.023921027779579163, "rewards_train/margins": 2.4235100895166397, "rewards_train/rejected": -2.3995890617370605, "step": 96 }, { "epoch": 1.11, "learning_rate": 3.622511650472601e-05, "loss": 0.1993, "step": 97 }, { "epoch": 1.11, "logps_train/chosen": -145.75787353515625, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -143.81246948242188, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": 0.23278886079788208, "rewards_train/margins": 2.254021942615509, "rewards_train/rejected": -2.021233081817627, "step": 97 }, { "epoch": 1.11, "logps_train/chosen": -134.80609130859375, "logps_train/ref_chosen": -135.0, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -121.690185546875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.004826478660106659, "rewards_train/margins": 2.3503986075520515, "rewards_train/rejected": -2.355225086212158, "step": 97 }, { "epoch": 1.11, "logps_train/chosen": -122.42782592773438, "logps_train/ref_chosen": -118.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -128.15924072265625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.4224705398082733, "rewards_train/margins": 2.235249310731888, "rewards_train/rejected": -2.657719850540161, "step": 97 }, { "epoch": 1.11, "logps_train/chosen": -164.96200561523438, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -157.0484619140625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.057143136858940125, "rewards_train/margins": 2.3765157908201218, "rewards_train/rejected": -2.3193726539611816, "step": 97 }, { "epoch": 1.12, "learning_rate": 3.5946876277757066e-05, "loss": 0.2219, "step": 98 }, { "epoch": 1.12, "logps_train/chosen": -157.20069885253906, "logps_train/ref_chosen": -157.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -158.66604614257812, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.008192479610443115, "rewards_train/margins": 2.5320449471473694, "rewards_train/rejected": -2.5402374267578125, "step": 98 }, { "epoch": 1.12, "logps_train/chosen": -168.69036865234375, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -165.33319091796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.05536573380231857, "rewards_train/margins": 2.6265848353505135, "rewards_train/rejected": -2.681950569152832, "step": 98 }, { "epoch": 1.12, "logps_train/chosen": -150.9794464111328, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -150.70811462402344, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.5467734336853027, "rewards_train/margins": 2.44610857963562, "rewards_train/rejected": -2.992882013320923, "step": 98 }, { "epoch": 1.12, "logps_train/chosen": -137.99195861816406, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -140.39208984375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.13567104935646057, "rewards_train/margins": 2.4768291413784027, "rewards_train/rejected": -2.6125001907348633, "step": 98 }, { "epoch": 1.13, "learning_rate": 3.5666948163927716e-05, "loss": 0.1803, "step": 99 }, { "epoch": 1.13, "logps_train/chosen": -146.2452392578125, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -120.5, "logps_train/rejected": -150.16941833496094, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.3603140115737915, "rewards_train/margins": 2.619420647621155, "rewards_train/rejected": -2.9797346591949463, "step": 99 }, { "epoch": 1.13, "logps_train/chosen": -126.06419372558594, "logps_train/ref_chosen": -121.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -114.22348022460938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.49977877736091614, "rewards_train/margins": 2.319640129804611, "rewards_train/rejected": -2.8194189071655273, "step": 99 }, { "epoch": 1.13, "logps_train/chosen": -132.54383850097656, "logps_train/ref_chosen": -125.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -168.75698852539062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7930556535720825, "rewards_train/margins": 2.4375261068344116, "rewards_train/rejected": -3.230581760406494, "step": 99 }, { "epoch": 1.13, "logps_train/chosen": -130.33595275878906, "logps_train/ref_chosen": -124.0, "logps_train/ref_rejected": -93.5, "logps_train/rejected": -121.05641174316406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6355488896369934, "rewards_train/margins": 2.140502631664276, "rewards_train/rejected": -2.7760515213012695, "step": 99 }, { "epoch": 1.14, "learning_rate": 3.5385375325047166e-05, "loss": 0.2181, "step": 100 }, { "epoch": 1.14, "logps_train/chosen": -173.3467254638672, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -161.84597778320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3217822313308716, "rewards_train/margins": 2.759886145591736, "rewards_train/rejected": -3.0816683769226074, "step": 100 }, { "epoch": 1.14, "logps_train/chosen": -180.46633911132812, "logps_train/ref_chosen": -177.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -174.64852905273438, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.31440621614456177, "rewards_train/margins": 2.7816969752311707, "rewards_train/rejected": -3.0961031913757324, "step": 100 }, { "epoch": 1.14, "logps_train/chosen": -149.21096801757812, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -135.708251953125, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.562260091304779, "rewards_train/margins": 2.4379597306251526, "rewards_train/rejected": -3.0002198219299316, "step": 100 }, { "epoch": 1.14, "logps_train/chosen": -165.3883819580078, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -147.68844604492188, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.2641317844390869, "rewards_train/margins": 2.305885076522827, "rewards_train/rejected": -2.570016860961914, "step": 100 }, { "epoch": 1.15, "learning_rate": 3.510220117652297e-05, "loss": 0.1737, "step": 101 }, { "epoch": 1.15, "logps_train/chosen": -153.82940673828125, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -131.58132934570312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.41795065999031067, "rewards_train/margins": 3.12226340174675, "rewards_train/rejected": -3.5402140617370605, "step": 101 }, { "epoch": 1.15, "logps_train/chosen": -149.13304138183594, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -142.9083251953125, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.5003165006637573, "rewards_train/margins": 2.4736016988754272, "rewards_train/rejected": -2.9739181995391846, "step": 101 }, { "epoch": 1.15, "logps_train/chosen": -185.08547973632812, "logps_train/ref_chosen": -181.0, "logps_train/ref_rejected": -164.0, "logps_train/rejected": -201.46383666992188, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.43471941351890564, "rewards_train/margins": 3.3151794970035553, "rewards_train/rejected": -3.749898910522461, "step": 101 }, { "epoch": 1.15, "logps_train/chosen": -143.16180419921875, "logps_train/ref_chosen": -135.0, "logps_train/ref_rejected": -119.0, "logps_train/rejected": -152.1904754638672, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.8441107273101807, "rewards_train/margins": 2.4427101612091064, "rewards_train/rejected": -3.286820888519287, "step": 101 }, { "epoch": 1.17, "learning_rate": 3.481746938066684e-05, "loss": 0.1572, "step": 102 }, { "epoch": 1.17, "logps_train/chosen": -169.8740234375, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -151.0, "logps_train/rejected": -182.55886840820312, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": -0.9510747790336609, "rewards_train/margins": 2.172243893146515, "rewards_train/rejected": -3.123318672180176, "step": 102 }, { "epoch": 1.17, "logps_train/chosen": -140.3857421875, "logps_train/ref_chosen": -131.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -151.04632568359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9004732966423035, "rewards_train/margins": 2.5984941124916077, "rewards_train/rejected": -3.498967409133911, "step": 102 }, { "epoch": 1.17, "logps_train/chosen": -180.0685577392578, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -165.80479431152344, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.9504103660583496, "rewards_train/margins": 3.016885280609131, "rewards_train/rejected": -3.9672956466674805, "step": 102 }, { "epoch": 1.17, "logps_train/chosen": -159.4775390625, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -159.30438232421875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.7955068349838257, "rewards_train/margins": 2.654363751411438, "rewards_train/rejected": -3.4498705863952637, "step": 102 }, { "epoch": 1.18, "learning_rate": 3.4531223839962453e-05, "loss": 0.1811, "step": 103 }, { "epoch": 1.18, "logps_train/chosen": -126.63336181640625, "logps_train/ref_chosen": -117.5, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -131.54600524902344, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.9199284911155701, "rewards_train/margins": 3.021292269229889, "rewards_train/rejected": -3.941220760345459, "step": 103 }, { "epoch": 1.18, "logps_train/chosen": -138.7056121826172, "logps_train/ref_chosen": -131.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -143.95974731445312, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.7718613147735596, "rewards_train/margins": 3.3114187717437744, "rewards_train/rejected": -4.083280086517334, "step": 103 }, { "epoch": 1.18, "logps_train/chosen": -170.283447265625, "logps_train/ref_chosen": -167.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -173.4869384765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.28478938341140747, "rewards_train/margins": 3.222693145275116, "rewards_train/rejected": -3.5074825286865234, "step": 103 }, { "epoch": 1.18, "logps_train/chosen": -186.765380859375, "logps_train/ref_chosen": -179.0, "logps_train/ref_rejected": -148.0, "logps_train/rejected": -184.8590850830078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7643318176269531, "rewards_train/margins": 2.9727487564086914, "rewards_train/rejected": -3.7370805740356445, "step": 103 }, { "epoch": 1.19, "learning_rate": 3.4243508690296135e-05, "loss": 0.1504, "step": 104 }, { "epoch": 1.19, "logps_train/chosen": -132.2598876953125, "logps_train/ref_chosen": -122.5, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -142.23316955566406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9834104180335999, "rewards_train/margins": 2.9381489157676697, "rewards_train/rejected": -3.9215593338012695, "step": 104 }, { "epoch": 1.19, "logps_train/chosen": -133.6953887939453, "logps_train/ref_chosen": -126.5, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -161.70277404785156, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.7267659902572632, "rewards_train/margins": 2.867143750190735, "rewards_train/rejected": -3.593909740447998, "step": 104 }, { "epoch": 1.19, "logps_train/chosen": -148.66311645507812, "logps_train/ref_chosen": -141.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -142.30751037597656, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.7122358083724976, "rewards_train/margins": 3.461971879005432, "rewards_train/rejected": -4.17420768737793, "step": 104 }, { "epoch": 1.19, "logps_train/chosen": -169.8518829345703, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -122.5, "logps_train/rejected": -159.99441528320312, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.9660478830337524, "rewards_train/margins": 2.7869099378585815, "rewards_train/rejected": -3.752957820892334, "step": 104 }, { "epoch": 1.2, "learning_rate": 3.39543682941516e-05, "loss": 0.1721, "step": 105 }, { "epoch": 1.2, "logps_train/chosen": -163.95489501953125, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -164.74134826660156, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.6238088607788086, "rewards_train/margins": 2.583432197570801, "rewards_train/rejected": -4.207241058349609, "step": 105 }, { "epoch": 1.2, "logps_train/chosen": -170.38116455078125, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -171.26705932617188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.8710801005363464, "rewards_train/margins": 3.070517122745514, "rewards_train/rejected": -3.9415972232818604, "step": 105 }, { "epoch": 1.2, "logps_train/chosen": -171.02854919433594, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -151.22903442382812, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.7450417280197144, "rewards_train/margins": 2.1690722703933716, "rewards_train/rejected": -3.914113998413086, "step": 105 }, { "epoch": 1.2, "logps_train/chosen": -162.40505981445312, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -158.7891845703125, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.560684084892273, "rewards_train/margins": 2.450362801551819, "rewards_train/rejected": -4.011046886444092, "step": 105 }, { "epoch": 1.21, "learning_rate": 3.366384723376977e-05, "loss": 0.2371, "step": 106 }, { "epoch": 1.21, "logps_train/chosen": -119.2367935180664, "logps_train/ref_chosen": -99.5, "logps_train/ref_rejected": -99.5, "logps_train/rejected": -139.24002075195312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9458470344543457, "rewards_train/margins": 2.0191712379455566, "rewards_train/rejected": -3.9650182723999023, "step": 106 }, { "epoch": 1.21, "logps_train/chosen": -144.74407958984375, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -113.5, "logps_train/rejected": -153.19857788085938, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.4341243505477905, "rewards_train/margins": 2.543986439704895, "rewards_train/rejected": -3.9781107902526855, "step": 106 }, { "epoch": 1.21, "logps_train/chosen": -190.86489868164062, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -181.99560546875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.8407862186431885, "rewards_train/margins": 3.3185408115386963, "rewards_train/rejected": -4.159327030181885, "step": 106 }, { "epoch": 1.21, "logps_train/chosen": -173.3730926513672, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -179.78939819335938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.8697314262390137, "rewards_train/margins": 3.73850679397583, "rewards_train/rejected": -4.608238220214844, "step": 106 }, { "epoch": 1.22, "learning_rate": 3.3371990304274656e-05, "loss": 0.1757, "step": 107 }, { "epoch": 1.22, "logps_train/chosen": -186.10415649414062, "logps_train/ref_chosen": -177.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -159.8262939453125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.9052414894104004, "rewards_train/margins": 3.8516058921813965, "rewards_train/rejected": -4.756847381591797, "step": 107 }, { "epoch": 1.22, "logps_train/chosen": -146.3466339111328, "logps_train/ref_chosen": -134.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -157.24111938476562, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.206477165222168, "rewards_train/margins": 2.8152427673339844, "rewards_train/rejected": -4.021719932556152, "step": 107 }, { "epoch": 1.22, "logps_train/chosen": -172.4299774169922, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -143.0, "logps_train/rejected": -178.18833923339844, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.0049124956130981, "rewards_train/margins": 2.5578664541244507, "rewards_train/rejected": -3.562778949737549, "step": 107 }, { "epoch": 1.22, "logps_train/chosen": -148.16546630859375, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -114.5, "logps_train/rejected": -157.4647216796875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.6409605741500854, "rewards_train/margins": 2.648480534553528, "rewards_train/rejected": -4.289441108703613, "step": 107 }, { "epoch": 1.23, "learning_rate": 3.3078842506766484e-05, "loss": 0.1539, "step": 108 }, { "epoch": 1.23, "logps_train/chosen": -138.66619873046875, "logps_train/ref_chosen": -120.0, "logps_train/ref_rejected": -126.5, "logps_train/rejected": -176.2493896484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8791207075119019, "rewards_train/margins": 3.097185254096985, "rewards_train/rejected": -4.976305961608887, "step": 108 }, { "epoch": 1.23, "logps_train/chosen": -143.60552978515625, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -110.5, "logps_train/rejected": -149.5123291015625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.4019588232040405, "rewards_train/margins": 2.506013035774231, "rewards_train/rejected": -3.9079718589782715, "step": 108 }, { "epoch": 1.23, "logps_train/chosen": -157.93670654296875, "logps_train/ref_chosen": -141.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -159.3701171875, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.7084167003631592, "rewards_train/margins": 3.0381643772125244, "rewards_train/rejected": -4.746581077575684, "step": 108 }, { "epoch": 1.23, "logps_train/chosen": -199.24874877929688, "logps_train/ref_chosen": -188.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -186.7269287109375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.0649135112762451, "rewards_train/margins": 3.8796541690826416, "rewards_train/rejected": -4.944567680358887, "step": 108 }, { "epoch": 1.25, "learning_rate": 3.278444904138297e-05, "loss": 0.1546, "step": 109 }, { "epoch": 1.25, "logps_train/chosen": -143.3296661376953, "logps_train/ref_chosen": -119.5, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -155.20834350585938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.4022021293640137, "rewards_train/margins": 2.016094207763672, "rewards_train/rejected": -4.4182963371276855, "step": 109 }, { "epoch": 1.25, "logps_train/chosen": -152.14297485351562, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -163.31515502929688, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -2.001040458679199, "rewards_train/margins": 2.6719303131103516, "rewards_train/rejected": -4.672970771789551, "step": 109 }, { "epoch": 1.25, "logps_train/chosen": -147.17257690429688, "logps_train/ref_chosen": -126.5, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -133.1774444580078, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -2.0941858291625977, "rewards_train/margins": 2.6622066497802734, "rewards_train/rejected": -4.756392478942871, "step": 109 }, { "epoch": 1.25, "logps_train/chosen": -170.54586791992188, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -167.38327026367188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.1571266651153564, "rewards_train/margins": 3.347363233566284, "rewards_train/rejected": -4.504489898681641, "step": 109 }, { "epoch": 1.26, "learning_rate": 3.248885530033004e-05, "loss": 0.2248, "step": 110 }, { "epoch": 1.26, "logps_train/chosen": -170.85304260253906, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -174.83726501464844, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.8820815086364746, "rewards_train/margins": 3.082406520843506, "rewards_train/rejected": -4.9644880294799805, "step": 110 }, { "epoch": 1.26, "logps_train/chosen": -141.23141479492188, "logps_train/ref_chosen": -121.0, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -149.21527099609375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.9987776279449463, "rewards_train/margins": 2.5813276767730713, "rewards_train/rejected": -4.580105304718018, "step": 110 }, { "epoch": 1.26, "logps_train/chosen": -150.4207000732422, "logps_train/ref_chosen": -133.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -141.09347534179688, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.7420827150344849, "rewards_train/margins": 2.7829874753952026, "rewards_train/rejected": -4.5250701904296875, "step": 110 }, { "epoch": 1.26, "logps_train/chosen": -154.02862548828125, "logps_train/ref_chosen": -133.0, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -146.64706420898438, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -2.1366517543792725, "rewards_train/margins": 2.7122819423675537, "rewards_train/rejected": -4.848933696746826, "step": 110 }, { "epoch": 1.27, "learning_rate": 3.219210686088278e-05, "loss": 0.1783, "step": 111 }, { "epoch": 1.27, "logps_train/chosen": -196.1944580078125, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -127.0, "logps_train/rejected": -177.0728759765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4467411041259766, "rewards_train/margins": 3.594043731689453, "rewards_train/rejected": -5.04078483581543, "step": 111 }, { "epoch": 1.27, "logps_train/chosen": -154.63021850585938, "logps_train/ref_chosen": -131.0, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -155.19606018066406, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.4006195068359375, "rewards_train/margins": 2.3893957138061523, "rewards_train/rejected": -4.79001522064209, "step": 111 }, { "epoch": 1.27, "logps_train/chosen": -206.43435668945312, "logps_train/ref_chosen": -196.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -181.19366455078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0297620296478271, "rewards_train/margins": 3.8278849124908447, "rewards_train/rejected": -4.857646942138672, "step": 111 }, { "epoch": 1.27, "logps_train/chosen": -200.61825561523438, "logps_train/ref_chosen": -185.0, "logps_train/ref_rejected": -147.0, "logps_train/rejected": -197.92047119140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5702245235443115, "rewards_train/margins": 3.5200655460357666, "rewards_train/rejected": -5.090290069580078, "step": 111 }, { "epoch": 1.28, "learning_rate": 3.1894249478357965e-05, "loss": 0.1124, "step": 112 }, { "epoch": 1.28, "logps_train/chosen": -184.96241760253906, "logps_train/ref_chosen": -171.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -177.40328979492188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.3922240734100342, "rewards_train/margins": 3.4400475025177, "rewards_train/rejected": -4.832271575927734, "step": 112 }, { "epoch": 1.28, "logps_train/chosen": -190.53353881835938, "logps_train/ref_chosen": -175.0, "logps_train/ref_rejected": -113.5, "logps_train/rejected": -161.22642517089844, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.5588231086730957, "rewards_train/margins": 3.2116708755493164, "rewards_train/rejected": -4.770493984222412, "step": 112 }, { "epoch": 1.28, "logps_train/chosen": -162.96185302734375, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -184.81460571289062, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.5726509094238281, "rewards_train/margins": 3.1111040115356445, "rewards_train/rejected": -4.683754920959473, "step": 112 }, { "epoch": 1.28, "logps_train/chosen": -188.84292602539062, "logps_train/ref_chosen": -174.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -189.7190399169922, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.4589011669158936, "rewards_train/margins": 3.8735506534576416, "rewards_train/rejected": -5.332451820373535, "step": 112 }, { "epoch": 1.29, "learning_rate": 3.15953290790591e-05, "loss": 0.1447, "step": 113 }, { "epoch": 1.29, "logps_train/chosen": -155.69085693359375, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -144.44094848632812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.01090669631958, "rewards_train/margins": 3.0220065116882324, "rewards_train/rejected": -5.0329132080078125, "step": 113 }, { "epoch": 1.29, "logps_train/chosen": -176.43182373046875, "logps_train/ref_chosen": -157.0, "logps_train/ref_rejected": -145.0, "logps_train/rejected": -196.0265655517578, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.9890806674957275, "rewards_train/margins": 3.118849515914917, "rewards_train/rejected": -5.1079301834106445, "step": 113 }, { "epoch": 1.29, "logps_train/chosen": -147.64617919921875, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -158.9957733154297, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.8295596837997437, "rewards_train/margins": 2.9519516229629517, "rewards_train/rejected": -4.781511306762695, "step": 113 }, { "epoch": 1.29, "logps_train/chosen": -201.11227416992188, "logps_train/ref_chosen": -191.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -172.78042602539062, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.9713826775550842, "rewards_train/margins": 3.347773015499115, "rewards_train/rejected": -4.319155693054199, "step": 113 }, { "epoch": 1.3, "learning_rate": 3.1295391753195047e-05, "loss": 0.1702, "step": 114 }, { "epoch": 1.3, "logps_train/chosen": -183.66317749023438, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -176.61851501464844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3601659536361694, "rewards_train/margins": 3.3909436464309692, "rewards_train/rejected": -4.751109600067139, "step": 114 }, { "epoch": 1.3, "logps_train/chosen": -190.912353515625, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -192.1277618408203, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -2.485377311706543, "rewards_train/margins": 2.946054458618164, "rewards_train/rejected": -5.431431770324707, "step": 114 }, { "epoch": 1.3, "logps_train/chosen": -142.88148498535156, "logps_train/ref_chosen": -129.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -151.7729949951172, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.4135019779205322, "rewards_train/margins": 3.1451447010040283, "rewards_train/rejected": -4.5586466789245605, "step": 114 }, { "epoch": 1.3, "logps_train/chosen": -195.8353271484375, "logps_train/ref_chosen": -181.0, "logps_train/ref_rejected": -157.0, "logps_train/rejected": -207.72579956054688, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.4638062715530396, "rewards_train/margins": 3.5900248289108276, "rewards_train/rejected": -5.053831100463867, "step": 114 }, { "epoch": 1.31, "learning_rate": 3.099448374777351e-05, "loss": 0.1436, "step": 115 }, { "epoch": 1.31, "logps_train/chosen": -202.21957397460938, "logps_train/ref_chosen": -187.0, "logps_train/ref_rejected": -158.0, "logps_train/rejected": -206.32733154296875, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": -1.5319174528121948, "rewards_train/margins": 3.348861813545227, "rewards_train/rejected": -4.880779266357422, "step": 115 }, { "epoch": 1.31, "logps_train/chosen": -164.8944091796875, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -145.25302124023438, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.778893232345581, "rewards_train/margins": 3.073557138442993, "rewards_train/rejected": -4.852450370788574, "step": 115 }, { "epoch": 1.31, "logps_train/chosen": -160.88668823242188, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -163.2848358154297, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": -2.420285224914551, "rewards_train/margins": 2.5804386138916016, "rewards_train/rejected": -5.000723838806152, "step": 115 }, { "epoch": 1.31, "logps_train/chosen": -188.2561492919922, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -169.0, "logps_train/rejected": -224.9777069091797, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.849443793296814, "rewards_train/margins": 3.7523800134658813, "rewards_train/rejected": -5.601823806762695, "step": 115 }, { "epoch": 1.33, "learning_rate": 3.069265145947016e-05, "loss": 0.2013, "step": 116 }, { "epoch": 1.33, "logps_train/chosen": -159.26284790039062, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -178.18568420410156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1178855895996094, "rewards_train/margins": 3.3582873344421387, "rewards_train/rejected": -5.476172924041748, "step": 116 }, { "epoch": 1.33, "logps_train/chosen": -158.87692260742188, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -163.0965118408203, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.6447348594665527, "rewards_train/margins": 3.0411858558654785, "rewards_train/rejected": -4.685920715332031, "step": 116 }, { "epoch": 1.33, "logps_train/chosen": -182.80258178710938, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -177.33001708984375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.4982264041900635, "rewards_train/margins": 3.7515709400177, "rewards_train/rejected": -5.249797344207764, "step": 116 }, { "epoch": 1.33, "logps_train/chosen": -167.61244201660156, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -161.88333129882812, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -2.1869282722473145, "rewards_train/margins": 3.1240487098693848, "rewards_train/rejected": -5.310976982116699, "step": 116 }, { "epoch": 1.34, "learning_rate": 3.0389941427474873e-05, "loss": 0.1413, "step": 117 }, { "epoch": 1.34, "logps_train/chosen": -187.91346740722656, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -195.27867126464844, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.616736650466919, "rewards_train/margins": 3.5172088146209717, "rewards_train/rejected": -5.133945465087891, "step": 117 }, { "epoch": 1.34, "logps_train/chosen": -151.50758361816406, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -147.73008728027344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9189229011535645, "rewards_train/margins": 2.8163905143737793, "rewards_train/rejected": -4.735313415527344, "step": 117 }, { "epoch": 1.34, "logps_train/chosen": -169.54360961914062, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -167.17601013183594, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -2.2594144344329834, "rewards_train/margins": 3.2765467166900635, "rewards_train/rejected": -5.535961151123047, "step": 117 }, { "epoch": 1.34, "logps_train/chosen": -198.6015625, "logps_train/ref_chosen": -181.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -195.97879028320312, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.7592782974243164, "rewards_train/margins": 3.4153575897216797, "rewards_train/rejected": -5.174635887145996, "step": 117 }, { "epoch": 1.35, "learning_rate": 3.008640032631585e-05, "loss": 0.1433, "step": 118 }, { "epoch": 1.35, "logps_train/chosen": -169.9024200439453, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -191.21194458007812, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.6621170043945312, "rewards_train/margins": 3.6406192779541016, "rewards_train/rejected": -5.302736282348633, "step": 118 }, { "epoch": 1.35, "logps_train/chosen": -144.08741760253906, "logps_train/ref_chosen": -127.0, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -157.87893676757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7102066278457642, "rewards_train/margins": 3.2556179761886597, "rewards_train/rejected": -4.965824604034424, "step": 118 }, { "epoch": 1.35, "logps_train/chosen": -137.56689453125, "logps_train/ref_chosen": -117.5, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -156.909423828125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.0094242095947266, "rewards_train/margins": 3.6044678688049316, "rewards_train/rejected": -5.613892078399658, "step": 118 }, { "epoch": 1.35, "logps_train/chosen": -177.030029296875, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -124.5, "logps_train/rejected": -172.6234130859375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.687475323677063, "rewards_train/margins": 3.1036747694015503, "rewards_train/rejected": -4.791150093078613, "step": 118 }, { "epoch": 1.36, "learning_rate": 2.978207495866292e-05, "loss": 0.146, "step": 119 }, { "epoch": 1.36, "logps_train/chosen": -167.2484130859375, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -151.47787475585938, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.6930049657821655, "rewards_train/margins": 3.221190094947815, "rewards_train/rejected": -4.9141950607299805, "step": 119 }, { "epoch": 1.36, "logps_train/chosen": -178.5880584716797, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -176.48397827148438, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.979020595550537, "rewards_train/margins": 3.617034435272217, "rewards_train/rejected": -5.596055030822754, "step": 119 }, { "epoch": 1.36, "logps_train/chosen": -158.26107788085938, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -180.61029052734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.294564127922058, "rewards_train/margins": 3.3548184633255005, "rewards_train/rejected": -4.649382591247559, "step": 119 }, { "epoch": 1.36, "logps_train/chosen": -164.2959442138672, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -165.3199005126953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3953654766082764, "rewards_train/margins": 2.9375030994415283, "rewards_train/rejected": -4.332868576049805, "step": 119 }, { "epoch": 1.37, "learning_rate": 2.947701224811113e-05, "loss": 0.1248, "step": 120 }, { "epoch": 1.37, "logps_train/chosen": -174.03245544433594, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -121.5, "logps_train/rejected": -171.17623901367188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.3544423580169678, "rewards_train/margins": 3.606821298599243, "rewards_train/rejected": -4.961263656616211, "step": 120 }, { "epoch": 1.37, "logps_train/chosen": -147.8043670654297, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -166.4681396484375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.2177417278289795, "rewards_train/margins": 2.9022276401519775, "rewards_train/rejected": -4.119969367980957, "step": 120 }, { "epoch": 1.37, "logps_train/chosen": -178.89630126953125, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -127.0, "logps_train/rejected": -170.27052307128906, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.1310200691223145, "rewards_train/margins": 3.1832242012023926, "rewards_train/rejected": -4.314244270324707, "step": 120 }, { "epoch": 1.37, "logps_train/chosen": -165.63587951660156, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -154.56637573242188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.1438608169555664, "rewards_train/margins": 2.665705680847168, "rewards_train/rejected": -4.809566497802734, "step": 120 }, { "epoch": 1.38, "learning_rate": 2.9171259231945598e-05, "loss": 0.1559, "step": 121 }, { "epoch": 1.38, "logps_train/chosen": -151.94607543945312, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -136.6217041015625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.5857216119766235, "rewards_train/margins": 3.2760576009750366, "rewards_train/rejected": -4.86177921295166, "step": 121 }, { "epoch": 1.38, "logps_train/chosen": -155.14697265625, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -162.53421020507812, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.3468749523162842, "rewards_train/margins": 3.6037633419036865, "rewards_train/rejected": -4.950638294219971, "step": 121 }, { "epoch": 1.38, "logps_train/chosen": -161.08724975585938, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -160.4564208984375, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.577230453491211, "rewards_train/margins": 2.754911422729492, "rewards_train/rejected": -4.332141876220703, "step": 121 }, { "epoch": 1.38, "logps_train/chosen": -160.4774932861328, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -114.5, "logps_train/rejected": -158.9612274169922, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.4526317119598389, "rewards_train/margins": 2.9784510135650635, "rewards_train/rejected": -4.431082725524902, "step": 121 }, { "epoch": 1.39, "learning_rate": 2.8864863053888925e-05, "loss": 0.1559, "step": 122 }, { "epoch": 1.39, "logps_train/chosen": -166.92727661132812, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -182.114990234375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.6018102169036865, "rewards_train/margins": 3.3425023555755615, "rewards_train/rejected": -4.944312572479248, "step": 122 }, { "epoch": 1.39, "logps_train/chosen": -161.64584350585938, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -146.01136779785156, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.470199465751648, "rewards_train/margins": 2.7532025575637817, "rewards_train/rejected": -4.22340202331543, "step": 122 }, { "epoch": 1.39, "logps_train/chosen": -179.35714721679688, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -127.37224578857422, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.3069069385528564, "rewards_train/margins": 2.7117631435394287, "rewards_train/rejected": -4.018670082092285, "step": 122 }, { "epoch": 1.39, "logps_train/chosen": -142.1234588623047, "logps_train/ref_chosen": -123.5, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -134.3780059814453, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.8813402652740479, "rewards_train/margins": 2.604262590408325, "rewards_train/rejected": -4.485602855682373, "step": 122 }, { "epoch": 1.41, "learning_rate": 2.8557870956832132e-05, "loss": 0.1891, "step": 123 }, { "epoch": 1.41, "logps_train/chosen": -215.79141235351562, "logps_train/ref_chosen": -207.0, "logps_train/ref_rejected": -158.0, "logps_train/rejected": -206.87472534179688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9100003242492676, "rewards_train/margins": 4.041144847869873, "rewards_train/rejected": -4.951145172119141, "step": 123 }, { "epoch": 1.41, "logps_train/chosen": -171.5395965576172, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -178.28213500976562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3091840744018555, "rewards_train/margins": 3.5175161361694336, "rewards_train/rejected": -4.826700210571289, "step": 123 }, { "epoch": 1.41, "logps_train/chosen": -197.67599487304688, "logps_train/ref_chosen": -191.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -182.3161163330078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6696504354476929, "rewards_train/margins": 3.831687092781067, "rewards_train/rejected": -4.50133752822876, "step": 123 }, { "epoch": 1.41, "logps_train/chosen": -173.24679565429688, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -164.2289276123047, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.745944619178772, "rewards_train/margins": 3.166388154029846, "rewards_train/rejected": -3.912332773208618, "step": 123 }, { "epoch": 1.42, "learning_rate": 2.8250330275550336e-05, "loss": 0.0987, "step": 124 }, { "epoch": 1.42, "logps_train/chosen": -172.51718139648438, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -127.5, "logps_train/rejected": -177.36227416992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7524986267089844, "rewards_train/margins": 4.2184929847717285, "rewards_train/rejected": -4.970991611480713, "step": 124 }, { "epoch": 1.42, "logps_train/chosen": -141.9478302001953, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -145.900146484375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.2405827045440674, "rewards_train/margins": 3.0773122310638428, "rewards_train/rejected": -4.31789493560791, "step": 124 }, { "epoch": 1.42, "logps_train/chosen": -180.90890502929688, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -169.93887329101562, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.6273165941238403, "rewards_train/margins": 3.1660832166671753, "rewards_train/rejected": -4.793399810791016, "step": 124 }, { "epoch": 1.42, "logps_train/chosen": -171.01747131347656, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -157.03610229492188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.1220600605010986, "rewards_train/margins": 3.2815511226654053, "rewards_train/rejected": -4.403611183166504, "step": 124 }, { "epoch": 1.43, "learning_rate": 2.7942288429404256e-05, "loss": 0.1097, "step": 125 }, { "epoch": 1.43, "logps_train/chosen": -175.7913818359375, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -169.33328247070312, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.0130254030227661, "rewards_train/margins": 2.717079997062683, "rewards_train/rejected": -3.730105400085449, "step": 125 }, { "epoch": 1.43, "logps_train/chosen": -193.09005737304688, "logps_train/ref_chosen": -186.0, "logps_train/ref_rejected": -149.0, "logps_train/rejected": -190.29751586914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7705289125442505, "rewards_train/margins": 3.368988871574402, "rewards_train/rejected": -4.139517784118652, "step": 125 }, { "epoch": 1.43, "logps_train/chosen": -175.15960693359375, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -162.30618286132812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0979918241500854, "rewards_train/margins": 3.0310651063919067, "rewards_train/rejected": -4.129056930541992, "step": 125 }, { "epoch": 1.43, "logps_train/chosen": -192.91085815429688, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -156.0, "logps_train/rejected": -200.0994415283203, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.1479214429855347, "rewards_train/margins": 3.2763782739639282, "rewards_train/rejected": -4.424299716949463, "step": 125 }, { "epoch": 1.44, "learning_rate": 2.7633792915028677e-05, "loss": 0.1349, "step": 126 }, { "epoch": 1.44, "logps_train/chosen": -185.21578979492188, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -176.3563232421875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.304196298122406, "rewards_train/margins": 3.450674593448639, "rewards_train/rejected": -3.754870891571045, "step": 126 }, { "epoch": 1.44, "logps_train/chosen": -171.53411865234375, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -186.81044006347656, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.3897385597229004, "rewards_train/margins": 3.4538064002990723, "rewards_train/rejected": -4.843544960021973, "step": 126 }, { "epoch": 1.44, "logps_train/chosen": -157.05662536621094, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -139.83226013183594, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.347580909729004, "rewards_train/margins": 2.9695076942443848, "rewards_train/rejected": -4.317088603973389, "step": 126 }, { "epoch": 1.44, "logps_train/chosen": -155.17971801757812, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -143.3584747314453, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2133831977844238, "rewards_train/margins": 2.3373942375183105, "rewards_train/rejected": -3.5507774353027344, "step": 126 }, { "epoch": 1.45, "learning_rate": 2.7324891299008985e-05, "loss": 0.2015, "step": 127 }, { "epoch": 1.45, "logps_train/chosen": -153.86216735839844, "logps_train/ref_chosen": -139.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -162.63079833984375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.480747938156128, "rewards_train/margins": 2.8033287525177, "rewards_train/rejected": -4.284076690673828, "step": 127 }, { "epoch": 1.45, "logps_train/chosen": -186.6682586669922, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -148.0, "logps_train/rejected": -192.19993591308594, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.0969038009643555, "rewards_train/margins": 3.314544677734375, "rewards_train/rejected": -4.4114484786987305, "step": 127 }, { "epoch": 1.45, "logps_train/chosen": -153.8680419921875, "logps_train/ref_chosen": -134.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -171.44602966308594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9669804573059082, "rewards_train/margins": 2.993198871612549, "rewards_train/rejected": -4.960179328918457, "step": 127 }, { "epoch": 1.45, "logps_train/chosen": -176.20404052734375, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -169.44656372070312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6339795589447021, "rewards_train/margins": 3.147592306137085, "rewards_train/rejected": -3.781571865081787, "step": 127 }, { "epoch": 1.46, "learning_rate": 2.701563121054695e-05, "loss": 0.1361, "step": 128 }, { "epoch": 1.46, "logps_train/chosen": -166.00640869140625, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -178.55386352539062, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.8213452100753784, "rewards_train/margins": 3.634822964668274, "rewards_train/rejected": -4.456168174743652, "step": 128 }, { "epoch": 1.46, "logps_train/chosen": -162.34912109375, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -152.22750854492188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.0764155387878418, "rewards_train/margins": 3.124901294708252, "rewards_train/rejected": -4.201316833496094, "step": 128 }, { "epoch": 1.46, "logps_train/chosen": -156.54486083984375, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -149.5716094970703, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.1559518575668335, "rewards_train/margins": 3.318982720375061, "rewards_train/rejected": -4.4749345779418945, "step": 128 }, { "epoch": 1.46, "logps_train/chosen": -190.66033935546875, "logps_train/ref_chosen": -184.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -178.93310546875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.6988459825515747, "rewards_train/margins": 3.3385089635849, "rewards_train/rejected": -4.037354946136475, "step": 128 }, { "epoch": 1.47, "learning_rate": 2.6706060334116777e-05, "loss": 0.1665, "step": 129 }, { "epoch": 1.47, "logps_train/chosen": -153.0819854736328, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -140.51028442382812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.736420750617981, "rewards_train/margins": 3.3056238889694214, "rewards_train/rejected": -4.042044639587402, "step": 129 }, { "epoch": 1.47, "logps_train/chosen": -179.62692260742188, "logps_train/ref_chosen": -167.0, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -159.4015655517578, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.3000822067260742, "rewards_train/margins": 3.679259777069092, "rewards_train/rejected": -4.979341983795166, "step": 129 }, { "epoch": 1.47, "logps_train/chosen": -162.6141815185547, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -156.1014862060547, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.0475513935089111, "rewards_train/margins": 3.2958009243011475, "rewards_train/rejected": -4.343352317810059, "step": 129 }, { "epoch": 1.47, "logps_train/chosen": -174.86026000976562, "logps_train/ref_chosen": -161.0, "logps_train/ref_rejected": -135.0, "logps_train/rejected": -182.4564971923828, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.3674092292785645, "rewards_train/margins": 3.3553266525268555, "rewards_train/rejected": -4.72273588180542, "step": 129 }, { "epoch": 1.49, "learning_rate": 2.639622640211277e-05, "loss": 0.1287, "step": 130 }, { "epoch": 1.49, "logps_train/chosen": -142.3889923095703, "logps_train/ref_chosen": -129.0, "logps_train/ref_rejected": -110.5, "logps_train/rejected": -153.8872528076172, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.3541336059570312, "rewards_train/margins": 3.004904270172119, "rewards_train/rejected": -4.35903787612915, "step": 130 }, { "epoch": 1.49, "logps_train/chosen": -168.5977020263672, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -160.97059631347656, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.1273491382598877, "rewards_train/margins": 3.112093210220337, "rewards_train/rejected": -4.239442348480225, "step": 130 }, { "epoch": 1.49, "logps_train/chosen": -186.99581909179688, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -178.27426147460938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.5144252777099609, "rewards_train/margins": 3.7290172576904297, "rewards_train/rejected": -4.243442535400391, "step": 130 }, { "epoch": 1.49, "logps_train/chosen": -163.23785400390625, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -131.27413940429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0287654399871826, "rewards_train/margins": 3.533609628677368, "rewards_train/rejected": -4.562375068664551, "step": 130 }, { "epoch": 1.5, "learning_rate": 2.6086177187489453e-05, "loss": 0.1228, "step": 131 }, { "epoch": 1.5, "logps_train/chosen": -147.3597412109375, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -145.3800048828125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.1206421852111816, "rewards_train/margins": 3.2051515579223633, "rewards_train/rejected": -4.325793743133545, "step": 131 }, { "epoch": 1.5, "logps_train/chosen": -165.80751037597656, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -178.4746551513672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1619529724121094, "rewards_train/margins": 3.133852005004883, "rewards_train/rejected": -4.295804977416992, "step": 131 }, { "epoch": 1.5, "logps_train/chosen": -169.84341430664062, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -118.0, "logps_train/rejected": -160.15591430664062, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.9361000061035156, "rewards_train/margins": 3.2722649574279785, "rewards_train/rejected": -4.208364963531494, "step": 131 }, { "epoch": 1.5, "logps_train/chosen": -163.28036499023438, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -147.83535766601562, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.7479585409164429, "rewards_train/margins": 3.2120429277420044, "rewards_train/rejected": -3.9600014686584473, "step": 131 }, { "epoch": 1.51, "learning_rate": 2.5775960496395564e-05, "loss": 0.1649, "step": 132 }, { "epoch": 1.51, "logps_train/chosen": -149.8038330078125, "logps_train/ref_chosen": -135.0, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -147.8526611328125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.4508914947509766, "rewards_train/margins": 3.12939453125, "rewards_train/rejected": -4.580286026000977, "step": 132 }, { "epoch": 1.51, "logps_train/chosen": -158.60960388183594, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -155.93435668945312, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.1415255069732666, "rewards_train/margins": 2.831206798553467, "rewards_train/rejected": -3.9727323055267334, "step": 132 }, { "epoch": 1.51, "logps_train/chosen": -177.83819580078125, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -161.01573181152344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7680490612983704, "rewards_train/margins": 3.2229778170585632, "rewards_train/rejected": -3.9910268783569336, "step": 132 }, { "epoch": 1.51, "logps_train/chosen": -150.2593231201172, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -115.0, "logps_train/rejected": -158.00173950195312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7834035754203796, "rewards_train/margins": 3.515306293964386, "rewards_train/rejected": -4.298709869384766, "step": 132 }, { "epoch": 1.52, "learning_rate": 2.5465624160802847e-05, "loss": 0.1705, "step": 133 }, { "epoch": 1.52, "logps_train/chosen": -163.82518005371094, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -124.0, "logps_train/rejected": -160.40072631835938, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.2626445293426514, "rewards_train/margins": 2.3889267444610596, "rewards_train/rejected": -3.651571273803711, "step": 133 }, { "epoch": 1.52, "logps_train/chosen": -138.35488891601562, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -150.45436096191406, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -0.8314854502677917, "rewards_train/margins": 3.542002499103546, "rewards_train/rejected": -4.373487949371338, "step": 133 }, { "epoch": 1.52, "logps_train/chosen": -164.03900146484375, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -152.27687072753906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3316338062286377, "rewards_train/margins": 3.427401304244995, "rewards_train/rejected": -4.759035110473633, "step": 133 }, { "epoch": 1.52, "logps_train/chosen": -193.63211059570312, "logps_train/ref_chosen": -184.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -177.69418334960938, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.9139914512634277, "rewards_train/margins": 3.4447827339172363, "rewards_train/rejected": -4.358774185180664, "step": 133 }, { "epoch": 1.53, "learning_rate": 2.515521603113088e-05, "loss": 0.187, "step": 134 }, { "epoch": 1.53, "logps_train/chosen": -175.33932495117188, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -165.0, "logps_train/rejected": -213.67547607421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5542943477630615, "rewards_train/margins": 4.358224153518677, "rewards_train/rejected": -4.912518501281738, "step": 134 }, { "epoch": 1.53, "logps_train/chosen": -158.3114013671875, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -173.7991943359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7678590416908264, "rewards_train/margins": 4.60151332616806, "rewards_train/rejected": -5.369372367858887, "step": 134 }, { "epoch": 1.53, "logps_train/chosen": -131.25491333007812, "logps_train/ref_chosen": -118.0, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -131.59129333496094, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.3272488117218018, "rewards_train/margins": 2.7177212238311768, "rewards_train/rejected": -4.0449700355529785, "step": 134 }, { "epoch": 1.53, "logps_train/chosen": -139.65713500976562, "logps_train/ref_chosen": -128.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -143.0663299560547, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.1557526588439941, "rewards_train/margins": 3.5598645210266113, "rewards_train/rejected": -4.7156171798706055, "step": 134 }, { "epoch": 1.54, "learning_rate": 2.4844783968869126e-05, "loss": 0.1299, "step": 135 }, { "epoch": 1.54, "logps_train/chosen": -155.28524780273438, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -126.5, "logps_train/rejected": -163.39198303222656, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.8066498041152954, "rewards_train/margins": 2.9134079217910767, "rewards_train/rejected": -3.720057725906372, "step": 135 }, { "epoch": 1.54, "logps_train/chosen": -178.38946533203125, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -149.0, "logps_train/rejected": -192.02011108398438, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.0768359899520874, "rewards_train/margins": 3.238333821296692, "rewards_train/rejected": -4.315169811248779, "step": 135 }, { "epoch": 1.54, "logps_train/chosen": -160.60389709472656, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -121.5, "logps_train/rejected": -163.22579956054688, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.4700582027435303, "rewards_train/margins": 2.681037187576294, "rewards_train/rejected": -4.151095390319824, "step": 135 }, { "epoch": 1.54, "logps_train/chosen": -191.0967559814453, "logps_train/ref_chosen": -185.0, "logps_train/ref_rejected": -146.0, "logps_train/rejected": -190.99061584472656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6215896606445312, "rewards_train/margins": 3.8812813758850098, "rewards_train/rejected": -4.502871036529541, "step": 135 }, { "epoch": 1.55, "learning_rate": 2.4534375839197166e-05, "loss": 0.1323, "step": 136 }, { "epoch": 1.55, "logps_train/chosen": -216.13201904296875, "logps_train/ref_chosen": -210.0, "logps_train/ref_rejected": -119.5, "logps_train/rejected": -161.54336547851562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6491394639015198, "rewards_train/margins": 3.5523642897605896, "rewards_train/rejected": -4.201503753662109, "step": 136 }, { "epoch": 1.55, "logps_train/chosen": -186.5906219482422, "logps_train/ref_chosen": -179.0, "logps_train/ref_rejected": -122.5, "logps_train/rejected": -165.56106567382812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7216589450836182, "rewards_train/margins": 3.585913896560669, "rewards_train/rejected": -4.307572841644287, "step": 136 }, { "epoch": 1.55, "logps_train/chosen": -157.28329467773438, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -144.79754638671875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.2453217506408691, "rewards_train/margins": 3.0565037727355957, "rewards_train/rejected": -4.301825523376465, "step": 136 }, { "epoch": 1.55, "logps_train/chosen": -146.12713623046875, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -144.28741455078125, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.4028985500335693, "rewards_train/margins": 2.723278284072876, "rewards_train/rejected": -4.126176834106445, "step": 136 }, { "epoch": 1.57, "learning_rate": 2.4224039503604435e-05, "loss": 0.1395, "step": 137 }, { "epoch": 1.57, "logps_train/chosen": -193.05108642578125, "logps_train/ref_chosen": -186.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -180.34556579589844, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.7160476446151733, "rewards_train/margins": 3.606399178504944, "rewards_train/rejected": -4.322446823120117, "step": 137 }, { "epoch": 1.57, "logps_train/chosen": -177.28684997558594, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -169.56716918945312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8280016779899597, "rewards_train/margins": 3.3246129155158997, "rewards_train/rejected": -4.152614593505859, "step": 137 }, { "epoch": 1.57, "logps_train/chosen": -181.3519744873047, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -164.0, "logps_train/rejected": -209.20787048339844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.585979163646698, "rewards_train/margins": 3.8938897252082825, "rewards_train/rejected": -4.4798688888549805, "step": 137 }, { "epoch": 1.57, "logps_train/chosen": -159.36561584472656, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -122.5, "logps_train/rejected": -163.44479370117188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.33968687057495117, "rewards_train/margins": 3.739558219909668, "rewards_train/rejected": -4.079245090484619, "step": 137 }, { "epoch": 1.58, "learning_rate": 2.391382281251055e-05, "loss": 0.1132, "step": 138 }, { "epoch": 1.58, "logps_train/chosen": -160.30413818359375, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -160.53933715820312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.747503936290741, "rewards_train/margins": 3.6289380192756653, "rewards_train/rejected": -4.376441955566406, "step": 138 }, { "epoch": 1.58, "logps_train/chosen": -193.16165161132812, "logps_train/ref_chosen": -189.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -167.0458984375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.4382353723049164, "rewards_train/margins": 3.1267053186893463, "rewards_train/rejected": -3.5649406909942627, "step": 138 }, { "epoch": 1.58, "logps_train/chosen": -177.08596801757812, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -176.0380401611328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2597697973251343, "rewards_train/margins": 3.0808502435684204, "rewards_train/rejected": -4.340620040893555, "step": 138 }, { "epoch": 1.58, "logps_train/chosen": -159.55792236328125, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -145.29629516601562, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.9486621618270874, "rewards_train/margins": 3.1882187128067017, "rewards_train/rejected": -4.136880874633789, "step": 138 }, { "epoch": 1.59, "learning_rate": 2.3603773597887237e-05, "loss": 0.1249, "step": 139 }, { "epoch": 1.59, "logps_train/chosen": -165.89608764648438, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -172.01133728027344, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.7296485900878906, "rewards_train/margins": 3.3035411834716797, "rewards_train/rejected": -4.03318977355957, "step": 139 }, { "epoch": 1.59, "logps_train/chosen": -144.5615234375, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -117.5, "logps_train/rejected": -163.75762939453125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.721398651599884, "rewards_train/margins": 3.903387725353241, "rewards_train/rejected": -4.624786376953125, "step": 139 }, { "epoch": 1.59, "logps_train/chosen": -169.52536010742188, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -170.62039184570312, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.9957976341247559, "rewards_train/margins": 2.806670904159546, "rewards_train/rejected": -3.8024685382843018, "step": 139 }, { "epoch": 1.59, "logps_train/chosen": -159.63851928710938, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -154.8424072265625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.4302584230899811, "rewards_train/margins": 3.5069116055965424, "rewards_train/rejected": -3.9371700286865234, "step": 139 }, { "epoch": 1.6, "learning_rate": 2.329393966588323e-05, "loss": 0.1378, "step": 140 }, { "epoch": 1.6, "logps_train/chosen": -174.330078125, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -172.95440673828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9353509545326233, "rewards_train/margins": 3.4886531233787537, "rewards_train/rejected": -4.424004077911377, "step": 140 }, { "epoch": 1.6, "logps_train/chosen": -166.16903686523438, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -187.2434844970703, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.7706148624420166, "rewards_train/margins": 3.949387788772583, "rewards_train/rejected": -4.7200026512146, "step": 140 }, { "epoch": 1.6, "logps_train/chosen": -163.92617797851562, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -148.0, "logps_train/rejected": -188.00546264648438, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.8798237442970276, "rewards_train/margins": 3.081660211086273, "rewards_train/rejected": -3.961483955383301, "step": 140 }, { "epoch": 1.6, "logps_train/chosen": -154.75360107421875, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -161.3019561767578, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.1850275993347168, "rewards_train/margins": 3.391695022583008, "rewards_train/rejected": -4.576722621917725, "step": 140 }, { "epoch": 1.61, "learning_rate": 2.298436878945306e-05, "loss": 0.1181, "step": 141 }, { "epoch": 1.61, "logps_train/chosen": -142.72462463378906, "logps_train/ref_chosen": -133.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -179.27532958984375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.9942886829376221, "rewards_train/margins": 3.0164482593536377, "rewards_train/rejected": -4.01073694229126, "step": 141 }, { "epoch": 1.61, "logps_train/chosen": -190.180419921875, "logps_train/ref_chosen": -181.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -180.17337036132812, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.9520259499549866, "rewards_train/margins": 3.4328898787498474, "rewards_train/rejected": -4.384915828704834, "step": 141 }, { "epoch": 1.61, "logps_train/chosen": -134.74435424804688, "logps_train/ref_chosen": -118.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -111.79988098144531, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.6727750301361084, "rewards_train/margins": 2.240952968597412, "rewards_train/rejected": -3.9137279987335205, "step": 141 }, { "epoch": 1.61, "logps_train/chosen": -176.04776000976562, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -164.4101104736328, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.1286051273345947, "rewards_train/margins": 3.44355845451355, "rewards_train/rejected": -4.5721635818481445, "step": 141 }, { "epoch": 1.62, "learning_rate": 2.267510870099101e-05, "loss": 0.1654, "step": 142 }, { "epoch": 1.62, "logps_train/chosen": -188.28570556640625, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -180.6005859375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.1609909534454346, "rewards_train/margins": 3.7696726322174072, "rewards_train/rejected": -4.930663585662842, "step": 142 }, { "epoch": 1.62, "logps_train/chosen": -157.38246154785156, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -157.08433532714844, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.2556779384613037, "rewards_train/margins": 2.78234601020813, "rewards_train/rejected": -4.038023948669434, "step": 142 }, { "epoch": 1.62, "logps_train/chosen": -139.43142700195312, "logps_train/ref_chosen": -127.5, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -141.7026824951172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.159501314163208, "rewards_train/margins": 3.0188729763031006, "rewards_train/rejected": -4.178374290466309, "step": 142 }, { "epoch": 1.62, "logps_train/chosen": -125.58201599121094, "logps_train/ref_chosen": -104.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -139.86517333984375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.145481824874878, "rewards_train/margins": 3.053291082382202, "rewards_train/rejected": -5.19877290725708, "step": 142 }, { "epoch": 1.63, "learning_rate": 2.2366207084971325e-05, "loss": 0.1394, "step": 143 }, { "epoch": 1.63, "logps_train/chosen": -119.32640838623047, "logps_train/ref_chosen": -104.0, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -134.8964385986328, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5528064966201782, "rewards_train/margins": 2.5242398977279663, "rewards_train/rejected": -4.0770463943481445, "step": 143 }, { "epoch": 1.63, "logps_train/chosen": -166.6683349609375, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -158.14041137695312, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.3560905456542969, "rewards_train/margins": 2.8011627197265625, "rewards_train/rejected": -4.157253265380859, "step": 143 }, { "epoch": 1.63, "logps_train/chosen": -153.33035278320312, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -163.28121948242188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.7734657526016235, "rewards_train/margins": 3.996648669242859, "rewards_train/rejected": -4.770114421844482, "step": 143 }, { "epoch": 1.63, "logps_train/chosen": -171.47213745117188, "logps_train/ref_chosen": -161.0, "logps_train/ref_rejected": -135.0, "logps_train/rejected": -174.22564697265625, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.0140111446380615, "rewards_train/margins": 2.929255723953247, "rewards_train/rejected": -3.9432668685913086, "step": 143 }, { "epoch": 1.65, "learning_rate": 2.2057711570595746e-05, "loss": 0.2175, "step": 144 }, { "epoch": 1.65, "logps_train/chosen": -173.53387451171875, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -161.1766357421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8866889476776123, "rewards_train/margins": 3.222722291946411, "rewards_train/rejected": -4.109411239624023, "step": 144 }, { "epoch": 1.65, "logps_train/chosen": -141.76498413085938, "logps_train/ref_chosen": -128.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -159.65899658203125, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.348568320274353, "rewards_train/margins": 3.194480061531067, "rewards_train/rejected": -4.54304838180542, "step": 144 }, { "epoch": 1.65, "logps_train/chosen": -197.79476928710938, "logps_train/ref_chosen": -192.0, "logps_train/ref_rejected": -166.0, "logps_train/rejected": -209.7325439453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5577000379562378, "rewards_train/margins": 3.7895785570144653, "rewards_train/rejected": -4.347278594970703, "step": 144 }, { "epoch": 1.65, "logps_train/chosen": -130.68243408203125, "logps_train/ref_chosen": -122.0, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -138.5338897705078, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.8820131421089172, "rewards_train/margins": 3.5245015025138855, "rewards_train/rejected": -4.406514644622803, "step": 144 }, { "epoch": 1.66, "learning_rate": 2.174966972444967e-05, "loss": 0.1446, "step": 145 }, { "epoch": 1.66, "logps_train/chosen": -163.034912109375, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -165.3939666748047, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.1952872276306152, "rewards_train/margins": 3.1185240745544434, "rewards_train/rejected": -4.313811302185059, "step": 145 }, { "epoch": 1.66, "logps_train/chosen": -182.8961181640625, "logps_train/ref_chosen": -177.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -172.29110717773438, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.5673462152481079, "rewards_train/margins": 3.453562617301941, "rewards_train/rejected": -4.020908832550049, "step": 145 }, { "epoch": 1.66, "logps_train/chosen": -150.4124298095703, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -159.89599609375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.4033517837524414, "rewards_train/margins": 3.661468505859375, "rewards_train/rejected": -5.064820289611816, "step": 145 }, { "epoch": 1.66, "logps_train/chosen": -184.98175048828125, "logps_train/ref_chosen": -179.0, "logps_train/ref_rejected": -119.5, "logps_train/rejected": -164.98641967773438, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.6009587049484253, "rewards_train/margins": 3.9541760683059692, "rewards_train/rejected": -4.5551347732543945, "step": 145 }, { "epoch": 1.67, "learning_rate": 2.1442129043167874e-05, "loss": 0.1344, "step": 146 }, { "epoch": 1.67, "logps_train/chosen": -164.82098388671875, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -179.12875366210938, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.3958685398101807, "rewards_train/margins": 3.42560076713562, "rewards_train/rejected": -4.821469306945801, "step": 146 }, { "epoch": 1.67, "logps_train/chosen": -143.52203369140625, "logps_train/ref_chosen": -128.0, "logps_train/ref_rejected": -122.5, "logps_train/rejected": -173.89883422851562, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.5310614109039307, "rewards_train/margins": 3.589242696762085, "rewards_train/rejected": -5.120304107666016, "step": 146 }, { "epoch": 1.67, "logps_train/chosen": -171.19552612304688, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -150.95005798339844, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.160958170890808, "rewards_train/margins": 3.271157383918762, "rewards_train/rejected": -4.43211555480957, "step": 146 }, { "epoch": 1.67, "logps_train/chosen": -185.29385375976562, "logps_train/ref_chosen": -174.0, "logps_train/ref_rejected": -127.5, "logps_train/rejected": -177.15206909179688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1695337295532227, "rewards_train/margins": 3.7931838035583496, "rewards_train/rejected": -4.962717533111572, "step": 146 }, { "epoch": 1.68, "learning_rate": 2.1135136946111078e-05, "loss": 0.1606, "step": 147 }, { "epoch": 1.68, "logps_train/chosen": -178.3898162841797, "logps_train/ref_chosen": -167.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -172.91342163085938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.1439135074615479, "rewards_train/margins": 3.5383970737457275, "rewards_train/rejected": -4.682310581207275, "step": 147 }, { "epoch": 1.68, "logps_train/chosen": -175.71603393554688, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -164.0, "logps_train/rejected": -209.33184814453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5641824007034302, "rewards_train/margins": 4.025251746177673, "rewards_train/rejected": -4.5894341468811035, "step": 147 }, { "epoch": 1.68, "logps_train/chosen": -186.2715301513672, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -172.0, "logps_train/rejected": -216.0167999267578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4318392276763916, "rewards_train/margins": 4.031365156173706, "rewards_train/rejected": -4.463204383850098, "step": 147 }, { "epoch": 1.68, "logps_train/chosen": -159.61163330078125, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -172.23619079589844, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.1693428754806519, "rewards_train/margins": 3.491873860359192, "rewards_train/rejected": -4.661216735839844, "step": 147 }, { "epoch": 1.69, "learning_rate": 2.0828740768054405e-05, "loss": 0.094, "step": 148 }, { "epoch": 1.69, "logps_train/chosen": -146.35887145996094, "logps_train/ref_chosen": -131.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -136.61520385742188, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": -1.5266588926315308, "rewards_train/margins": 2.8000484704971313, "rewards_train/rejected": -4.326707363128662, "step": 148 }, { "epoch": 1.69, "logps_train/chosen": -187.1864013671875, "logps_train/ref_chosen": -174.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -154.72918701171875, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.3573615550994873, "rewards_train/margins": 2.9866015911102295, "rewards_train/rejected": -4.343963146209717, "step": 148 }, { "epoch": 1.69, "logps_train/chosen": -170.3841552734375, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -142.0, "logps_train/rejected": -188.68914794921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0149770975112915, "rewards_train/margins": 3.598516821861267, "rewards_train/rejected": -4.613493919372559, "step": 148 }, { "epoch": 1.69, "logps_train/chosen": -184.0547332763672, "logps_train/ref_chosen": -180.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -186.50656127929688, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.43496501445770264, "rewards_train/margins": 4.409247994422913, "rewards_train/rejected": -4.844213008880615, "step": 148 }, { "epoch": 1.7, "learning_rate": 2.0522987751888878e-05, "loss": 0.1432, "step": 149 }, { "epoch": 1.7, "logps_train/chosen": -172.67449951171875, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -175.04855346679688, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.4930354356765747, "rewards_train/margins": 2.8857582807540894, "rewards_train/rejected": -4.378793716430664, "step": 149 }, { "epoch": 1.7, "logps_train/chosen": -178.00057983398438, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -119.0, "logps_train/rejected": -162.48727416992188, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.4646331071853638, "rewards_train/margins": 2.847375750541687, "rewards_train/rejected": -4.312008857727051, "step": 149 }, { "epoch": 1.7, "logps_train/chosen": -177.27328491210938, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -127.5, "logps_train/rejected": -173.37014770507812, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.8803309798240662, "rewards_train/margins": 3.7075143456459045, "rewards_train/rejected": -4.587845325469971, "step": 149 }, { "epoch": 1.7, "logps_train/chosen": -178.50416564941406, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -192.19558715820312, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.926978588104248, "rewards_train/margins": 3.855861186981201, "rewards_train/rejected": -4.782839775085449, "step": 149 }, { "epoch": 1.71, "learning_rate": 2.0217925041337088e-05, "loss": 0.1603, "step": 150 }, { "epoch": 1.71, "logps_train/chosen": -182.9842071533203, "logps_train/ref_chosen": -171.0, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -156.3119354248047, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.1948070526123047, "rewards_train/margins": 3.445127010345459, "rewards_train/rejected": -4.639934062957764, "step": 150 }, { "epoch": 1.71, "logps_train/chosen": -196.98382568359375, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -182.46234130859375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.4740171432495117, "rewards_train/margins": 3.0879406929016113, "rewards_train/rejected": -4.561957836151123, "step": 150 }, { "epoch": 1.71, "logps_train/chosen": -188.139892578125, "logps_train/ref_chosen": -175.0, "logps_train/ref_rejected": -142.0, "logps_train/rejected": -190.43099975585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2782472372055054, "rewards_train/margins": 3.6082111597061157, "rewards_train/rejected": -4.886458396911621, "step": 150 }, { "epoch": 1.71, "logps_train/chosen": -205.94732666015625, "logps_train/ref_chosen": -193.0, "logps_train/ref_rejected": -152.0, "logps_train/rejected": -199.26638793945312, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.2850637435913086, "rewards_train/margins": 3.3791723251342773, "rewards_train/rejected": -4.664236068725586, "step": 150 }, { "epoch": 1.73, "learning_rate": 1.991359967368416e-05, "loss": 0.1191, "step": 151 }, { "epoch": 1.73, "logps_train/chosen": -164.7177276611328, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -163.2447509765625, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.7888622283935547, "rewards_train/margins": 2.895867347717285, "rewards_train/rejected": -4.68472957611084, "step": 151 }, { "epoch": 1.73, "logps_train/chosen": -192.6652374267578, "logps_train/ref_chosen": -184.0, "logps_train/ref_rejected": -164.0, "logps_train/rejected": -208.13421630859375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.8331253528594971, "rewards_train/margins": 3.594896078109741, "rewards_train/rejected": -4.428021430969238, "step": 151 }, { "epoch": 1.73, "logps_train/chosen": -179.51992797851562, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -121.5, "logps_train/rejected": -167.351318359375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.3398842811584473, "rewards_train/margins": 3.2446608543395996, "rewards_train/rejected": -4.584545135498047, "step": 151 }, { "epoch": 1.73, "logps_train/chosen": -142.92648315429688, "logps_train/ref_chosen": -126.5, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -151.46743774414062, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.6470386981964111, "rewards_train/margins": 3.038767099380493, "rewards_train/rejected": -4.685805797576904, "step": 151 }, { "epoch": 1.74, "learning_rate": 1.9610058572525126e-05, "loss": 0.156, "step": 152 }, { "epoch": 1.74, "logps_train/chosen": -125.03524017333984, "logps_train/ref_chosen": -104.5, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -143.35089111328125, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -2.051748275756836, "rewards_train/margins": 2.4157614707946777, "rewards_train/rejected": -4.467509746551514, "step": 152 }, { "epoch": 1.74, "logps_train/chosen": -163.41842651367188, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -166.00680541992188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.5096149444580078, "rewards_train/margins": 3.4148945808410645, "rewards_train/rejected": -4.924509525299072, "step": 152 }, { "epoch": 1.74, "logps_train/chosen": -184.01190185546875, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -182.28060913085938, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.1560717821121216, "rewards_train/margins": 3.208804965019226, "rewards_train/rejected": -4.364876747131348, "step": 152 }, { "epoch": 1.74, "logps_train/chosen": -134.97909545898438, "logps_train/ref_chosen": -118.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -154.97637939453125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.7018145322799683, "rewards_train/margins": 2.6661349534988403, "rewards_train/rejected": -4.367949485778809, "step": 152 }, { "epoch": 1.75, "learning_rate": 1.9307348540529842e-05, "loss": 0.199, "step": 153 }, { "epoch": 1.75, "logps_train/chosen": -184.7167205810547, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -135.0, "logps_train/rejected": -184.70632934570312, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.5790934562683105, "rewards_train/margins": 3.391930103302002, "rewards_train/rejected": -4.9710235595703125, "step": 153 }, { "epoch": 1.75, "logps_train/chosen": -170.3045654296875, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -183.49859619140625, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.2700073719024658, "rewards_train/margins": 3.2787787914276123, "rewards_train/rejected": -4.548786163330078, "step": 153 }, { "epoch": 1.75, "logps_train/chosen": -192.09291076660156, "logps_train/ref_chosen": -183.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -182.16331481933594, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -0.9595848321914673, "rewards_train/margins": 3.4182342290878296, "rewards_train/rejected": -4.377819061279297, "step": 153 }, { "epoch": 1.75, "logps_train/chosen": -185.59799194335938, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -156.0, "logps_train/rejected": -207.5470733642578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3219197988510132, "rewards_train/margins": 3.890599846839905, "rewards_train/rejected": -5.212519645690918, "step": 153 }, { "epoch": 1.76, "learning_rate": 1.90055162522265e-05, "loss": 0.1899, "step": 154 }, { "epoch": 1.76, "logps_train/chosen": -160.90257263183594, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -174.55712890625, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.113547921180725, "rewards_train/margins": 3.4354270696640015, "rewards_train/rejected": -4.548974990844727, "step": 154 }, { "epoch": 1.76, "logps_train/chosen": -168.6278076171875, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -185.01776123046875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.2065556049346924, "rewards_train/margins": 3.4405343532562256, "rewards_train/rejected": -4.647089958190918, "step": 154 }, { "epoch": 1.76, "logps_train/chosen": -179.93496704101562, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -182.0, "logps_train/rejected": -236.11505126953125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.095841407775879, "rewards_train/margins": 4.302578449249268, "rewards_train/rejected": -5.3984198570251465, "step": 154 }, { "epoch": 1.76, "logps_train/chosen": -141.23049926757812, "logps_train/ref_chosen": -124.0, "logps_train/ref_rejected": -119.0, "logps_train/rejected": -173.46426391601562, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.7132835388183594, "rewards_train/margins": 3.7317748069763184, "rewards_train/rejected": -5.445058345794678, "step": 154 }, { "epoch": 1.77, "learning_rate": 1.8704608246804956e-05, "loss": 0.1316, "step": 155 }, { "epoch": 1.77, "logps_train/chosen": -169.93484497070312, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -159.2445526123047, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.6998562812805176, "rewards_train/margins": 3.3152976036071777, "rewards_train/rejected": -5.015153884887695, "step": 155 }, { "epoch": 1.77, "logps_train/chosen": -170.24639892578125, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -179.92730712890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5101873874664307, "rewards_train/margins": 3.3885018825531006, "rewards_train/rejected": -4.898689270019531, "step": 155 }, { "epoch": 1.77, "logps_train/chosen": -172.0523681640625, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -177.50497436523438, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.215980052947998, "rewards_train/margins": 3.4622879028320312, "rewards_train/rejected": -4.678267955780029, "step": 155 }, { "epoch": 1.77, "logps_train/chosen": -167.3325958251953, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -181.28607177734375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4187091588974, "rewards_train/margins": 3.3278671503067017, "rewards_train/rejected": -4.746576309204102, "step": 155 }, { "epoch": 1.78, "learning_rate": 1.840467092094091e-05, "loss": 0.1536, "step": 156 }, { "epoch": 1.78, "logps_train/chosen": -170.8963623046875, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -178.3448028564453, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.1062872409820557, "rewards_train/margins": 3.4995310306549072, "rewards_train/rejected": -4.605818271636963, "step": 156 }, { "epoch": 1.78, "logps_train/chosen": -152.4623260498047, "logps_train/ref_chosen": -133.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -152.89100646972656, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.9597091674804688, "rewards_train/margins": 2.9297332763671875, "rewards_train/rejected": -4.889442443847656, "step": 156 }, { "epoch": 1.78, "logps_train/chosen": -140.10516357421875, "logps_train/ref_chosen": -129.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -136.63473510742188, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.0900087356567383, "rewards_train/margins": 3.295339584350586, "rewards_train/rejected": -4.385348320007324, "step": 156 }, { "epoch": 1.78, "logps_train/chosen": -178.86492919921875, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -143.0, "logps_train/rejected": -188.87057495117188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.0580618381500244, "rewards_train/margins": 3.5533130168914795, "rewards_train/rejected": -4.611374855041504, "step": 156 }, { "epoch": 1.79, "learning_rate": 1.8105750521642034e-05, "loss": 0.1377, "step": 157 }, { "epoch": 1.79, "logps_train/chosen": -176.15272521972656, "logps_train/ref_chosen": -161.0, "logps_train/ref_rejected": -145.0, "logps_train/rejected": -193.9800567626953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5117566585540771, "rewards_train/margins": 3.3692567348480225, "rewards_train/rejected": -4.8810133934021, "step": 157 }, { "epoch": 1.79, "logps_train/chosen": -165.08853149414062, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -168.65086364746094, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.9819000959396362, "rewards_train/margins": 4.159559845924377, "rewards_train/rejected": -5.141459941864014, "step": 157 }, { "epoch": 1.79, "logps_train/chosen": -171.33531188964844, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -165.46160888671875, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.4948601722717285, "rewards_train/margins": 3.4002251625061035, "rewards_train/rejected": -4.895085334777832, "step": 157 }, { "epoch": 1.79, "logps_train/chosen": -158.28863525390625, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -168.30862426757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3315372467041016, "rewards_train/margins": 3.280808448791504, "rewards_train/rejected": -4.6123456954956055, "step": 157 }, { "epoch": 1.81, "learning_rate": 1.780789313911722e-05, "loss": 0.1212, "step": 158 }, { "epoch": 1.81, "logps_train/chosen": -164.3837890625, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -145.0, "logps_train/rejected": -195.4906005859375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.437110424041748, "rewards_train/margins": 3.6194682121276855, "rewards_train/rejected": -5.056578636169434, "step": 158 }, { "epoch": 1.81, "logps_train/chosen": -167.58389282226562, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -165.21438598632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1453032493591309, "rewards_train/margins": 3.3836545944213867, "rewards_train/rejected": -4.528957843780518, "step": 158 }, { "epoch": 1.81, "logps_train/chosen": -167.3607177734375, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -95.0, "logps_train/rejected": -142.85699462890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.172424077987671, "rewards_train/margins": 3.6320254802703857, "rewards_train/rejected": -4.804449558258057, "step": 158 }, { "epoch": 1.81, "logps_train/chosen": -138.3846893310547, "logps_train/ref_chosen": -118.0, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -128.16195678710938, "rewards_train/accuracies": 0.84375, "rewards_train/chosen": -2.0708906650543213, "rewards_train/margins": 1.8548266887664795, "rewards_train/rejected": -3.925717353820801, "step": 158 }, { "epoch": 1.82, "learning_rate": 1.7511144699669966e-05, "loss": 0.1585, "step": 159 }, { "epoch": 1.82, "logps_train/chosen": -184.75267028808594, "logps_train/ref_chosen": -177.0, "logps_train/ref_rejected": -135.0, "logps_train/rejected": -182.9331512451172, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -0.784446656703949, "rewards_train/margins": 4.016046583652496, "rewards_train/rejected": -4.800493240356445, "step": 159 }, { "epoch": 1.82, "logps_train/chosen": -138.78643798828125, "logps_train/ref_chosen": -118.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -133.27508544921875, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -2.0814690589904785, "rewards_train/margins": 2.621307849884033, "rewards_train/rejected": -4.702776908874512, "step": 159 }, { "epoch": 1.82, "logps_train/chosen": -174.599609375, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -172.98977661132812, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.085846424102783, "rewards_train/margins": 3.226027488708496, "rewards_train/rejected": -5.311873912811279, "step": 159 }, { "epoch": 1.82, "logps_train/chosen": -154.55592346191406, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -148.77499389648438, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5021743774414062, "rewards_train/margins": 2.780306339263916, "rewards_train/rejected": -4.282480716705322, "step": 159 }, { "epoch": 1.83, "learning_rate": 1.7215550958617034e-05, "loss": 0.1462, "step": 160 }, { "epoch": 1.83, "logps_train/chosen": -162.04150390625, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -162.67465209960938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.419677495956421, "rewards_train/margins": 3.43099045753479, "rewards_train/rejected": -4.850667953491211, "step": 160 }, { "epoch": 1.83, "logps_train/chosen": -147.52159118652344, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -146.4416961669922, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.1266707181930542, "rewards_train/margins": 3.2333184480667114, "rewards_train/rejected": -4.359989166259766, "step": 160 }, { "epoch": 1.83, "logps_train/chosen": -168.31619262695312, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -152.9374542236328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2656030654907227, "rewards_train/margins": 3.5036306381225586, "rewards_train/rejected": -4.769233703613281, "step": 160 }, { "epoch": 1.83, "logps_train/chosen": -205.88299560546875, "logps_train/ref_chosen": -196.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -174.3811492919922, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -0.9983091354370117, "rewards_train/margins": 3.4017200469970703, "rewards_train/rejected": -4.400029182434082, "step": 160 }, { "epoch": 1.84, "learning_rate": 1.6921157493233532e-05, "loss": 0.1424, "step": 161 }, { "epoch": 1.84, "logps_train/chosen": -167.32449340820312, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -112.5, "logps_train/rejected": -164.69781494140625, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.4970972537994385, "rewards_train/margins": 3.72248911857605, "rewards_train/rejected": -5.219586372375488, "step": 161 }, { "epoch": 1.84, "logps_train/chosen": -172.23770141601562, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -154.22283935546875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.0136151313781738, "rewards_train/margins": 3.5768332481384277, "rewards_train/rejected": -4.590448379516602, "step": 161 }, { "epoch": 1.84, "logps_train/chosen": -151.36978149414062, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -122.5, "logps_train/rejected": -171.3380126953125, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.3356115818023682, "rewards_train/margins": 3.544039487838745, "rewards_train/rejected": -4.879651069641113, "step": 161 }, { "epoch": 1.84, "logps_train/chosen": -199.06410217285156, "logps_train/ref_chosen": -188.0, "logps_train/ref_rejected": -147.0, "logps_train/rejected": -194.74569702148438, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.1116838455200195, "rewards_train/margins": 3.712299346923828, "rewards_train/rejected": -4.823983192443848, "step": 161 }, { "epoch": 1.85, "learning_rate": 1.6628009695725346e-05, "loss": 0.129, "step": 162 }, { "epoch": 1.85, "logps_train/chosen": -131.08058166503906, "logps_train/ref_chosen": -121.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -166.07493591308594, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.0106275081634521, "rewards_train/margins": 3.0171782970428467, "rewards_train/rejected": -4.027805805206299, "step": 162 }, { "epoch": 1.85, "logps_train/chosen": -137.0918426513672, "logps_train/ref_chosen": -115.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -155.005859375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.2230069637298584, "rewards_train/margins": 2.3856842517852783, "rewards_train/rejected": -4.608691215515137, "step": 162 }, { "epoch": 1.85, "logps_train/chosen": -135.45196533203125, "logps_train/ref_chosen": -121.0, "logps_train/ref_rejected": -115.0, "logps_train/rejected": -156.66986083984375, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.433404564857483, "rewards_train/margins": 2.7299917936325073, "rewards_train/rejected": -4.16339635848999, "step": 162 }, { "epoch": 1.85, "logps_train/chosen": -134.3225860595703, "logps_train/ref_chosen": -114.0, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -150.91635131835938, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -2.0572102069854736, "rewards_train/margins": 2.305445909500122, "rewards_train/rejected": -4.362656116485596, "step": 162 }, { "epoch": 1.86, "learning_rate": 1.6336152766230232e-05, "loss": 0.1969, "step": 163 }, { "epoch": 1.86, "logps_train/chosen": -142.69918823242188, "logps_train/ref_chosen": -131.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -166.99905395507812, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.13105309009552, "rewards_train/margins": 3.8543986082077026, "rewards_train/rejected": -4.985451698303223, "step": 163 }, { "epoch": 1.86, "logps_train/chosen": -167.85174560546875, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -168.38937377929688, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.7078323364257812, "rewards_train/margins": 3.499269485473633, "rewards_train/rejected": -5.207101821899414, "step": 163 }, { "epoch": 1.86, "logps_train/chosen": -165.04327392578125, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -174.62939453125, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.4283990859985352, "rewards_train/margins": 3.1969175338745117, "rewards_train/rejected": -4.625316619873047, "step": 163 }, { "epoch": 1.86, "logps_train/chosen": -189.43804931640625, "logps_train/ref_chosen": -179.0, "logps_train/ref_rejected": -135.0, "logps_train/rejected": -183.73580932617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0531786680221558, "rewards_train/margins": 3.821672558784485, "rewards_train/rejected": -4.874851226806641, "step": 163 }, { "epoch": 1.87, "learning_rate": 1.6045631705848404e-05, "loss": 0.1373, "step": 164 }, { "epoch": 1.87, "logps_train/chosen": -160.4321746826172, "logps_train/ref_chosen": -141.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -136.58428955078125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.9580607414245605, "rewards_train/margins": 2.7091808319091797, "rewards_train/rejected": -4.66724157333374, "step": 164 }, { "epoch": 1.87, "logps_train/chosen": -157.96083068847656, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -179.47161865234375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.4216691255569458, "rewards_train/margins": 3.2209337949752808, "rewards_train/rejected": -4.642602920532227, "step": 164 }, { "epoch": 1.87, "logps_train/chosen": -156.0556640625, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -152.37881469726562, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.3762691020965576, "rewards_train/margins": 3.1942293643951416, "rewards_train/rejected": -4.570498466491699, "step": 164 }, { "epoch": 1.87, "logps_train/chosen": -154.2939453125, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -157.37115478515625, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.6712883710861206, "rewards_train/margins": 2.8872863054275513, "rewards_train/rejected": -4.558574676513672, "step": 164 }, { "epoch": 1.89, "learning_rate": 1.5756491309703875e-05, "loss": 0.1581, "step": 165 }, { "epoch": 1.89, "logps_train/chosen": -135.0899658203125, "logps_train/ref_chosen": -121.5, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -153.34164428710938, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.354859709739685, "rewards_train/margins": 3.076668381690979, "rewards_train/rejected": -4.431528091430664, "step": 165 }, { "epoch": 1.89, "logps_train/chosen": -184.4627227783203, "logps_train/ref_chosen": -179.0, "logps_train/ref_rejected": -152.0, "logps_train/rejected": -203.6276397705078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5243486166000366, "rewards_train/margins": 4.656775116920471, "rewards_train/rejected": -5.181123733520508, "step": 165 }, { "epoch": 1.89, "logps_train/chosen": -161.5297393798828, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -185.64584350585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4633244276046753, "rewards_train/margins": 3.1752835512161255, "rewards_train/rejected": -4.638607978820801, "step": 165 }, { "epoch": 1.89, "logps_train/chosen": -185.7200164794922, "logps_train/ref_chosen": -175.0, "logps_train/ref_rejected": -151.0, "logps_train/rejected": -196.57159423828125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.0503220558166504, "rewards_train/margins": 3.486915111541748, "rewards_train/rejected": -4.537237167358398, "step": 165 }, { "epoch": 1.9, "learning_rate": 1.5468776160037556e-05, "loss": 0.1205, "step": 166 }, { "epoch": 1.9, "logps_train/chosen": -168.28759765625, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -169.3665313720703, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.3095204830169678, "rewards_train/margins": 2.7026207447052, "rewards_train/rejected": -4.012141227722168, "step": 166 }, { "epoch": 1.9, "logps_train/chosen": -141.98912048339844, "logps_train/ref_chosen": -126.5, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -143.87429809570312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.543443202972412, "rewards_train/margins": 3.1720151901245117, "rewards_train/rejected": -4.715458393096924, "step": 166 }, { "epoch": 1.9, "logps_train/chosen": -146.0329132080078, "logps_train/ref_chosen": -134.0, "logps_train/ref_rejected": -118.0, "logps_train/rejected": -158.03030395507812, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.224214792251587, "rewards_train/margins": 2.7690494060516357, "rewards_train/rejected": -3.9932641983032227, "step": 166 }, { "epoch": 1.9, "logps_train/chosen": -185.71340942382812, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -182.1860809326172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5620156526565552, "rewards_train/margins": 3.35896098613739, "rewards_train/rejected": -4.920976638793945, "step": 166 }, { "epoch": 1.91, "learning_rate": 1.5182530619333169e-05, "loss": 0.1542, "step": 167 }, { "epoch": 1.91, "logps_train/chosen": -149.9785919189453, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -152.57215881347656, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.3188555240631104, "rewards_train/margins": 3.0212714672088623, "rewards_train/rejected": -4.340126991271973, "step": 167 }, { "epoch": 1.91, "logps_train/chosen": -168.6658935546875, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -182.68592834472656, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.816979169845581, "rewards_train/margins": 3.0047385692596436, "rewards_train/rejected": -4.821717739105225, "step": 167 }, { "epoch": 1.91, "logps_train/chosen": -164.0330810546875, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -181.12750244140625, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.766735315322876, "rewards_train/margins": 3.4669129848480225, "rewards_train/rejected": -5.233648300170898, "step": 167 }, { "epoch": 1.91, "logps_train/chosen": -201.2147674560547, "logps_train/ref_chosen": -189.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -180.60372924804688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.19296133518219, "rewards_train/margins": 4.000518202781677, "rewards_train/rejected": -5.193479537963867, "step": 167 }, { "epoch": 1.92, "learning_rate": 1.4897798823477043e-05, "loss": 0.1167, "step": 168 }, { "epoch": 1.92, "logps_train/chosen": -154.00045776367188, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -161.27391052246094, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.832126498222351, "rewards_train/margins": 3.5510259866714478, "rewards_train/rejected": -5.383152484893799, "step": 168 }, { "epoch": 1.92, "logps_train/chosen": -171.59112548828125, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -170.0, "logps_train/rejected": -216.93157958984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3428027629852295, "rewards_train/margins": 3.343519449234009, "rewards_train/rejected": -4.686322212219238, "step": 168 }, { "epoch": 1.92, "logps_train/chosen": -174.04019165039062, "logps_train/ref_chosen": -161.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -172.20095825195312, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.2645671367645264, "rewards_train/margins": 3.8841536045074463, "rewards_train/rejected": -5.148720741271973, "step": 168 }, { "epoch": 1.92, "logps_train/chosen": -169.83084106445312, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -158.2201690673828, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.683249592781067, "rewards_train/margins": 3.1385172605514526, "rewards_train/rejected": -4.8217668533325195, "step": 168 }, { "epoch": 1.93, "learning_rate": 1.4614624674952842e-05, "loss": 0.1329, "step": 169 }, { "epoch": 1.93, "logps_train/chosen": -177.6451416015625, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -176.8157958984375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.3176395893096924, "rewards_train/margins": 3.5905025005340576, "rewards_train/rejected": -4.90814208984375, "step": 169 }, { "epoch": 1.93, "logps_train/chosen": -178.3532257080078, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -161.0, "logps_train/rejected": -208.62347412109375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.913349449634552, "rewards_train/margins": 3.8398191332817078, "rewards_train/rejected": -4.75316858291626, "step": 169 }, { "epoch": 1.93, "logps_train/chosen": -160.98016357421875, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -149.50071716308594, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.2862005233764648, "rewards_train/margins": 3.435941696166992, "rewards_train/rejected": -4.722142219543457, "step": 169 }, { "epoch": 1.93, "logps_train/chosen": -214.80264282226562, "logps_train/ref_chosen": -205.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -185.12115478515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0412020683288574, "rewards_train/margins": 4.142787456512451, "rewards_train/rejected": -5.183989524841309, "step": 169 }, { "epoch": 1.94, "learning_rate": 1.4333051836072298e-05, "loss": 0.1112, "step": 170 }, { "epoch": 1.94, "logps_train/chosen": -177.8150634765625, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -182.40084838867188, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -1.512269139289856, "rewards_train/margins": 3.168343424797058, "rewards_train/rejected": -4.680612564086914, "step": 170 }, { "epoch": 1.94, "logps_train/chosen": -157.65682983398438, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -151.7862091064453, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.602914571762085, "rewards_train/margins": 3.1795151233673096, "rewards_train/rejected": -4.7824296951293945, "step": 170 }, { "epoch": 1.94, "logps_train/chosen": -176.32493591308594, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -95.0, "logps_train/rejected": -140.84335327148438, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.5847883224487305, "rewards_train/margins": 2.9966893196105957, "rewards_train/rejected": -4.581477642059326, "step": 170 }, { "epoch": 1.94, "logps_train/chosen": -140.81861877441406, "logps_train/ref_chosen": -122.5, "logps_train/ref_rejected": -115.0, "logps_train/rejected": -162.77423095703125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8218281269073486, "rewards_train/margins": 2.9552524089813232, "rewards_train/rejected": -4.777080535888672, "step": 170 }, { "epoch": 1.95, "learning_rate": 1.405312372224294e-05, "loss": 0.1776, "step": 171 }, { "epoch": 1.95, "logps_train/chosen": -138.87081909179688, "logps_train/ref_chosen": -124.0, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -153.38121032714844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.489914894104004, "rewards_train/margins": 3.1869759559631348, "rewards_train/rejected": -4.676890850067139, "step": 171 }, { "epoch": 1.95, "logps_train/chosen": -162.38807678222656, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -174.22418212890625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.095973014831543, "rewards_train/margins": 3.369903087615967, "rewards_train/rejected": -4.46587610244751, "step": 171 }, { "epoch": 1.95, "logps_train/chosen": -162.04440307617188, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -121.5, "logps_train/rejected": -163.14654541015625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.0364702939987183, "rewards_train/margins": 3.1233197450637817, "rewards_train/rejected": -4.1597900390625, "step": 171 }, { "epoch": 1.95, "logps_train/chosen": -145.47537231445312, "logps_train/ref_chosen": -131.0, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -128.49514770507812, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.4886016845703125, "rewards_train/margins": 2.50949764251709, "rewards_train/rejected": -3.9980993270874023, "step": 171 }, { "epoch": 1.97, "learning_rate": 1.3774883495273985e-05, "loss": 0.155, "step": 172 }, { "epoch": 1.97, "logps_train/chosen": -164.489013671875, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -144.42698669433594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8249762058258057, "rewards_train/margins": 2.9649884700775146, "rewards_train/rejected": -4.78996467590332, "step": 172 }, { "epoch": 1.97, "logps_train/chosen": -149.56802368164062, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -144.60494995117188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.797525405883789, "rewards_train/margins": 2.945197105407715, "rewards_train/rejected": -4.742722511291504, "step": 172 }, { "epoch": 1.97, "logps_train/chosen": -142.54705810546875, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -157.16668701171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2089290618896484, "rewards_train/margins": 3.610962390899658, "rewards_train/rejected": -4.819891452789307, "step": 172 }, { "epoch": 1.97, "logps_train/chosen": -162.0865936279297, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -120.5, "logps_train/rejected": -163.69024658203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1669116020202637, "rewards_train/margins": 3.1393208503723145, "rewards_train/rejected": -4.306232452392578, "step": 172 }, { "epoch": 1.98, "learning_rate": 1.3498374056721197e-05, "loss": 0.1313, "step": 173 }, { "epoch": 1.98, "logps_train/chosen": -166.25057983398438, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -172.05795288085938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.346738338470459, "rewards_train/margins": 3.3271727561950684, "rewards_train/rejected": -4.673911094665527, "step": 173 }, { "epoch": 1.98, "logps_train/chosen": -170.06703186035156, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -181.25033569335938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.454749345779419, "rewards_train/margins": 3.6730196475982666, "rewards_train/rejected": -5.1277689933776855, "step": 173 }, { "epoch": 1.98, "logps_train/chosen": -140.8558349609375, "logps_train/ref_chosen": -124.5, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -145.61558532714844, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.617444396018982, "rewards_train/margins": 3.549827218055725, "rewards_train/rejected": -5.167271614074707, "step": 173 }, { "epoch": 1.98, "logps_train/chosen": -215.5902099609375, "logps_train/ref_chosen": -202.0, "logps_train/ref_rejected": -167.0, "logps_train/rejected": -217.3975067138672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4094116687774658, "rewards_train/margins": 3.6104161739349365, "rewards_train/rejected": -5.019827842712402, "step": 173 }, { "epoch": 1.99, "learning_rate": 1.3223638041271979e-05, "loss": 0.1236, "step": 174 }, { "epoch": 1.99, "logps_train/chosen": -196.11367797851562, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -175.70323181152344, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.3418858051300049, "rewards_train/margins": 3.1817080974578857, "rewards_train/rejected": -4.523593902587891, "step": 174 }, { "epoch": 1.99, "logps_train/chosen": -142.29310607910156, "logps_train/ref_chosen": -129.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -174.541015625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.3043105602264404, "rewards_train/margins": 3.299423933029175, "rewards_train/rejected": -4.603734493255615, "step": 174 }, { "epoch": 1.99, "logps_train/chosen": -172.05809020996094, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -163.4266357421875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.619040846824646, "rewards_train/margins": 3.089199662208557, "rewards_train/rejected": -4.708240509033203, "step": 174 }, { "epoch": 1.99, "logps_train/chosen": -171.18453979492188, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -167.75234985351562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.761239767074585, "rewards_train/margins": 3.911907911300659, "rewards_train/rejected": -4.673147678375244, "step": 174 }, { "epoch": 2.0, "learning_rate": 1.2950717810171558e-05, "loss": 0.124, "step": 175 }, { "epoch": 2.0, "logps_train/chosen": -178.75906372070312, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -158.2779998779297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.51145339012146, "rewards_train/margins": 3.475623846054077, "rewards_train/rejected": -4.987077236175537, "step": 175 }, { "epoch": 2.0, "logps_train/chosen": -144.22889709472656, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -159.568603515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2451064586639404, "rewards_train/margins": 3.9777700901031494, "rewards_train/rejected": -5.22287654876709, "step": 175 }, { "epoch": 2.0, "logps_train/chosen": -159.3805694580078, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -156.07077026367188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3049516677856445, "rewards_train/margins": 3.2286882400512695, "rewards_train/rejected": -4.533639907836914, "step": 175 }, { "epoch": 2.0, "logps_train/chosen": -160.80697631835938, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -127.5, "logps_train/rejected": -182.2629852294922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6911461353302002, "rewards_train/margins": 3.7947232723236084, "rewards_train/rejected": -5.485869407653809, "step": 175 }, { "epoch": 2.01, "learning_rate": 1.2679655444691369e-05, "loss": 0.0755, "step": 176 }, { "epoch": 2.01, "logps_train/chosen": -161.154052734375, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -167.23745727539062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2068119049072266, "rewards_train/margins": 3.918104648590088, "rewards_train/rejected": -5.1249165534973145, "step": 176 }, { "epoch": 2.01, "logps_train/chosen": -145.90554809570312, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -144.0738525390625, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.574514627456665, "rewards_train/margins": 3.397310972213745, "rewards_train/rejected": -4.97182559967041, "step": 176 }, { "epoch": 2.01, "logps_train/chosen": -192.4039306640625, "logps_train/ref_chosen": -184.0, "logps_train/ref_rejected": -121.5, "logps_train/rejected": -172.21121215820312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8327764272689819, "rewards_train/margins": 4.236098408699036, "rewards_train/rejected": -5.068874835968018, "step": 176 }, { "epoch": 2.01, "logps_train/chosen": -153.53134155273438, "logps_train/ref_chosen": -139.0, "logps_train/ref_rejected": -117.5, "logps_train/rejected": -175.02682495117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4470796585083008, "rewards_train/margins": 4.318201065063477, "rewards_train/rejected": -5.765280723571777, "step": 176 }, { "epoch": 2.02, "learning_rate": 1.2410492739640592e-05, "loss": 0.085, "step": 177 }, { "epoch": 2.02, "logps_train/chosen": -191.07339477539062, "logps_train/ref_chosen": -184.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -188.9764404296875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.7616373896598816, "rewards_train/margins": 4.577509701251984, "rewards_train/rejected": -5.339147090911865, "step": 177 }, { "epoch": 2.02, "logps_train/chosen": -185.62136840820312, "logps_train/ref_chosen": -179.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -184.6957550048828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6105749011039734, "rewards_train/margins": 4.272965252399445, "rewards_train/rejected": -4.883540153503418, "step": 177 }, { "epoch": 2.02, "logps_train/chosen": -191.00115966796875, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -179.1207733154297, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.8412289619445801, "rewards_train/margins": 3.8856916427612305, "rewards_train/rejected": -4.7269206047058105, "step": 177 }, { "epoch": 2.02, "logps_train/chosen": -163.87918090820312, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -135.0, "logps_train/rejected": -188.97149658203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.558229923248291, "rewards_train/margins": 3.8317909240722656, "rewards_train/rejected": -5.390020847320557, "step": 177 }, { "epoch": 2.03, "learning_rate": 1.2143271196921831e-05, "loss": 0.0734, "step": 178 }, { "epoch": 2.03, "logps_train/chosen": -149.78662109375, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -154.28677368164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2491698265075684, "rewards_train/margins": 3.2221827507019043, "rewards_train/rejected": -4.471352577209473, "step": 178 }, { "epoch": 2.03, "logps_train/chosen": -180.86485290527344, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -168.6180419921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9179306030273438, "rewards_train/margins": 3.6434578895568848, "rewards_train/rejected": -4.5613884925842285, "step": 178 }, { "epoch": 2.03, "logps_train/chosen": -191.31576538085938, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -154.0, "logps_train/rejected": -205.51625061035156, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.9044272899627686, "rewards_train/margins": 4.209453344345093, "rewards_train/rejected": -5.113880634307861, "step": 178 }, { "epoch": 2.03, "logps_train/chosen": -157.76119995117188, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -172.5016632080078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2712382078170776, "rewards_train/margins": 3.9011937379837036, "rewards_train/rejected": -5.172431945800781, "step": 178 }, { "epoch": 2.05, "learning_rate": 1.1878032019132016e-05, "loss": 0.0712, "step": 179 }, { "epoch": 2.05, "logps_train/chosen": -179.49224853515625, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -168.34971618652344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9605532884597778, "rewards_train/margins": 4.291605830192566, "rewards_train/rejected": -5.252159118652344, "step": 179 }, { "epoch": 2.05, "logps_train/chosen": -182.27786254882812, "logps_train/ref_chosen": -183.0, "logps_train/ref_rejected": -147.0, "logps_train/rejected": -195.55474853515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.051900215446949005, "rewards_train/margins": 4.886940307915211, "rewards_train/rejected": -4.835040092468262, "step": 179 }, { "epoch": 2.05, "logps_train/chosen": -144.2906494140625, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -156.61428833007812, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.2723876237869263, "rewards_train/margins": 3.064677119255066, "rewards_train/rejected": -4.337064743041992, "step": 179 }, { "epoch": 2.05, "logps_train/chosen": -184.02662658691406, "logps_train/ref_chosen": -177.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -187.46713256835938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.6745372414588928, "rewards_train/margins": 4.093368470668793, "rewards_train/rejected": -4.7679057121276855, "step": 179 }, { "epoch": 2.06, "learning_rate": 1.1614816103209363e-05, "loss": 0.0749, "step": 180 }, { "epoch": 2.06, "logps_train/chosen": -152.71478271484375, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -169.14288330078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3001878261566162, "rewards_train/margins": 3.4307992458343506, "rewards_train/rejected": -4.730987071990967, "step": 180 }, { "epoch": 2.06, "logps_train/chosen": -175.60665893554688, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -172.89309692382812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.662753701210022, "rewards_train/margins": 3.5386651754379272, "rewards_train/rejected": -5.201418876647949, "step": 180 }, { "epoch": 2.06, "logps_train/chosen": -145.56417846679688, "logps_train/ref_chosen": -134.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -156.00181579589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1640660762786865, "rewards_train/margins": 3.8189284801483154, "rewards_train/rejected": -4.982994556427002, "step": 180 }, { "epoch": 2.06, "logps_train/chosen": -172.25607299804688, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -152.76123046875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -0.6297088265419006, "rewards_train/margins": 4.331765592098236, "rewards_train/rejected": -4.961474418640137, "step": 180 }, { "epoch": 2.07, "learning_rate": 1.1353664034127583e-05, "loss": 0.0793, "step": 181 }, { "epoch": 2.07, "logps_train/chosen": -149.04946899414062, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -154.0411376953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2665691375732422, "rewards_train/margins": 3.5102977752685547, "rewards_train/rejected": -4.776866912841797, "step": 181 }, { "epoch": 2.07, "logps_train/chosen": -158.56723022460938, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -159.50074768066406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.218638300895691, "rewards_train/margins": 3.6543132066726685, "rewards_train/rejected": -4.872951507568359, "step": 181 }, { "epoch": 2.07, "logps_train/chosen": -147.66925048828125, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -168.28912353515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.185003638267517, "rewards_train/margins": 4.234533905982971, "rewards_train/rejected": -5.419537544250488, "step": 181 }, { "epoch": 2.07, "logps_train/chosen": -179.8079833984375, "logps_train/ref_chosen": -167.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -197.74644470214844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2968143224716187, "rewards_train/margins": 4.06601345539093, "rewards_train/rejected": -5.362827777862549, "step": 181 }, { "epoch": 2.08, "learning_rate": 1.1094616078638123e-05, "loss": 0.0717, "step": 182 }, { "epoch": 2.08, "logps_train/chosen": -190.28167724609375, "logps_train/ref_chosen": -184.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -181.79185485839844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6578567028045654, "rewards_train/margins": 4.172305345535278, "rewards_train/rejected": -4.830162048339844, "step": 182 }, { "epoch": 2.08, "logps_train/chosen": -170.9873046875, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -172.44073486328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5471186637878418, "rewards_train/margins": 4.172564506530762, "rewards_train/rejected": -4.7196831703186035, "step": 182 }, { "epoch": 2.08, "logps_train/chosen": -170.7721405029297, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -159.09173583984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.270280122756958, "rewards_train/margins": 3.9390900135040283, "rewards_train/rejected": -5.209370136260986, "step": 182 }, { "epoch": 2.08, "logps_train/chosen": -175.2460174560547, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -148.0, "logps_train/rejected": -195.99169921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9697037935256958, "rewards_train/margins": 3.803244471549988, "rewards_train/rejected": -4.772948265075684, "step": 182 }, { "epoch": 2.09, "learning_rate": 1.083771217906143e-05, "loss": 0.0599, "step": 183 }, { "epoch": 2.09, "logps_train/chosen": -190.64321899414062, "logps_train/ref_chosen": -187.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -189.67926025390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.39599961042404175, "rewards_train/margins": 4.678371250629425, "rewards_train/rejected": -5.074370861053467, "step": 183 }, { "epoch": 2.09, "logps_train/chosen": -138.62399291992188, "logps_train/ref_chosen": -128.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -144.83941650390625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.0373992919921875, "rewards_train/margins": 3.527449131011963, "rewards_train/rejected": -4.56484842300415, "step": 183 }, { "epoch": 2.09, "logps_train/chosen": -136.85711669921875, "logps_train/ref_chosen": -118.5, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -131.87420654296875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.8283874988555908, "rewards_train/margins": 2.936718702316284, "rewards_train/rejected": -4.765106201171875, "step": 183 }, { "epoch": 2.09, "logps_train/chosen": -148.16285705566406, "logps_train/ref_chosen": -141.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -186.79367065429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6797617077827454, "rewards_train/margins": 4.7876922488212585, "rewards_train/rejected": -5.467453956604004, "step": 183 }, { "epoch": 2.1, "learning_rate": 1.0582991947128324e-05, "loss": 0.0863, "step": 184 }, { "epoch": 2.1, "logps_train/chosen": -168.52122497558594, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -180.79949951171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8917717337608337, "rewards_train/margins": 3.991675913333893, "rewards_train/rejected": -4.883447647094727, "step": 184 }, { "epoch": 2.1, "logps_train/chosen": -143.18820190429688, "logps_train/ref_chosen": -128.0, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -146.11032104492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5001935958862305, "rewards_train/margins": 3.6904263496398926, "rewards_train/rejected": -5.190619945526123, "step": 184 }, { "epoch": 2.1, "logps_train/chosen": -147.32476806640625, "logps_train/ref_chosen": -128.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -184.9254150390625, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.9073777198791504, "rewards_train/margins": 3.7832117080688477, "rewards_train/rejected": -5.690589427947998, "step": 184 }, { "epoch": 2.1, "logps_train/chosen": -147.30819702148438, "logps_train/ref_chosen": -133.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -155.67486572265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4561140537261963, "rewards_train/margins": 3.524458646774292, "rewards_train/rejected": -4.980572700500488, "step": 184 }, { "epoch": 2.11, "learning_rate": 1.0330494657872312e-05, "loss": 0.0853, "step": 185 }, { "epoch": 2.11, "logps_train/chosen": -162.71340942382812, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -127.0, "logps_train/rejected": -179.08816528320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3596220016479492, "rewards_train/margins": 3.8323984146118164, "rewards_train/rejected": -5.192020416259766, "step": 185 }, { "epoch": 2.11, "logps_train/chosen": -162.77186584472656, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -192.93026733398438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.177870273590088, "rewards_train/margins": 4.452266216278076, "rewards_train/rejected": -5.630136489868164, "step": 185 }, { "epoch": 2.11, "logps_train/chosen": -152.37396240234375, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -195.00845336914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4057559967041016, "rewards_train/margins": 4.478293418884277, "rewards_train/rejected": -5.884049415588379, "step": 185 }, { "epoch": 2.11, "logps_train/chosen": -151.218505859375, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -119.5, "logps_train/rejected": -170.10499572753906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1394281387329102, "rewards_train/margins": 3.907008647918701, "rewards_train/rejected": -5.046436786651611, "step": 185 }, { "epoch": 2.13, "learning_rate": 1.0080259243573789e-05, "loss": 0.0513, "step": 186 }, { "epoch": 2.13, "logps_train/chosen": -166.97433471679688, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -182.7489471435547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.403684139251709, "rewards_train/margins": 4.301093101501465, "rewards_train/rejected": -5.704777240753174, "step": 186 }, { "epoch": 2.13, "logps_train/chosen": -131.2030792236328, "logps_train/ref_chosen": -119.0, "logps_train/ref_rejected": -119.5, "logps_train/rejected": -171.1971435546875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.2050743103027344, "rewards_train/margins": 3.998037815093994, "rewards_train/rejected": -5.2031121253967285, "step": 186 }, { "epoch": 2.13, "logps_train/chosen": -153.02066040039062, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -172.2113494873047, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.3235623836517334, "rewards_train/margins": 3.6091701984405518, "rewards_train/rejected": -4.932732582092285, "step": 186 }, { "epoch": 2.13, "logps_train/chosen": -162.50509643554688, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -127.5, "logps_train/rejected": -177.67140197753906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7423069477081299, "rewards_train/margins": 4.284990072250366, "rewards_train/rejected": -5.027297019958496, "step": 186 }, { "epoch": 2.14, "learning_rate": 9.832324287757158e-06, "loss": 0.0607, "step": 187 }, { "epoch": 2.14, "logps_train/chosen": -145.08290100097656, "logps_train/ref_chosen": -133.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -177.64239501953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1745011806488037, "rewards_train/margins": 4.237786531448364, "rewards_train/rejected": -5.412287712097168, "step": 187 }, { "epoch": 2.14, "logps_train/chosen": -172.32980346679688, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -179.23846435546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4877278804779053, "rewards_train/margins": 3.9437358379364014, "rewards_train/rejected": -5.431463718414307, "step": 187 }, { "epoch": 2.14, "logps_train/chosen": -153.09422302246094, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -191.51918029785156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5256340503692627, "rewards_train/margins": 4.172573804855347, "rewards_train/rejected": -5.698207855224609, "step": 187 }, { "epoch": 2.14, "logps_train/chosen": -176.45843505859375, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -167.15196228027344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1481870412826538, "rewards_train/margins": 4.72482168674469, "rewards_train/rejected": -5.873008728027344, "step": 187 }, { "epoch": 2.15, "learning_rate": 9.586728019241623e-06, "loss": 0.0399, "step": 188 }, { "epoch": 2.15, "logps_train/chosen": -154.5926513671875, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -142.7076873779297, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.872741937637329, "rewards_train/margins": 3.5749804973602295, "rewards_train/rejected": -5.447722434997559, "step": 188 }, { "epoch": 2.15, "logps_train/chosen": -207.6587371826172, "logps_train/ref_chosen": -192.0, "logps_train/ref_rejected": -169.0, "logps_train/rejected": -234.085693359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6053264141082764, "rewards_train/margins": 4.921602487564087, "rewards_train/rejected": -6.526928901672363, "step": 188 }, { "epoch": 2.15, "logps_train/chosen": -149.80093383789062, "logps_train/ref_chosen": -133.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -179.54971313476562, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.6450353860855103, "rewards_train/margins": 3.782885432243347, "rewards_train/rejected": -5.427920818328857, "step": 188 }, { "epoch": 2.15, "logps_train/chosen": -163.64334106445312, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -112.5, "logps_train/rejected": -165.87039184570312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0383586883544922, "rewards_train/margins": 4.294675827026367, "rewards_train/rejected": -5.333034515380859, "step": 188 }, { "epoch": 2.16, "learning_rate": 9.343508306246771e-06, "loss": 0.0743, "step": 189 }, { "epoch": 2.16, "logps_train/chosen": -162.09934997558594, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -95.0, "logps_train/rejected": -154.43948364257812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7151105403900146, "rewards_train/margins": 4.205791234970093, "rewards_train/rejected": -5.920901775360107, "step": 189 }, { "epoch": 2.16, "logps_train/chosen": -187.39578247070312, "logps_train/ref_chosen": -174.0, "logps_train/ref_rejected": -142.0, "logps_train/rejected": -198.8822479248047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.358717918395996, "rewards_train/margins": 4.348062515258789, "rewards_train/rejected": -5.706780433654785, "step": 189 }, { "epoch": 2.16, "logps_train/chosen": -142.24322509765625, "logps_train/ref_chosen": -126.0, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -162.78729248046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6396063566207886, "rewards_train/margins": 3.697179913520813, "rewards_train/rejected": -5.336786270141602, "step": 189 }, { "epoch": 2.16, "logps_train/chosen": -169.55081176757812, "logps_train/ref_chosen": -157.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -203.53314208984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2864294052124023, "rewards_train/margins": 5.046573162078857, "rewards_train/rejected": -6.33300256729126, "step": 189 }, { "epoch": 2.17, "learning_rate": 9.102702650553671e-06, "loss": 0.0536, "step": 190 }, { "epoch": 2.17, "logps_train/chosen": -187.0035400390625, "logps_train/ref_chosen": -178.0, "logps_train/ref_rejected": -153.0, "logps_train/rejected": -205.14060974121094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8618786334991455, "rewards_train/margins": 4.305112600326538, "rewards_train/rejected": -5.166991233825684, "step": 190 }, { "epoch": 2.17, "logps_train/chosen": -148.8421630859375, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -156.6807861328125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.293787956237793, "rewards_train/margins": 3.910130023956299, "rewards_train/rejected": -5.203917980194092, "step": 190 }, { "epoch": 2.17, "logps_train/chosen": -162.02560424804688, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -110.5, "logps_train/rejected": -167.29222106933594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2023648023605347, "rewards_train/margins": 4.4929715394973755, "rewards_train/rejected": -5.69533634185791, "step": 190 }, { "epoch": 2.17, "logps_train/chosen": -149.6796417236328, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -193.026611328125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.1812937259674072, "rewards_train/margins": 4.417267084121704, "rewards_train/rejected": -5.598560810089111, "step": 190 }, { "epoch": 2.18, "learning_rate": 8.864348181722559e-06, "loss": 0.0645, "step": 191 }, { "epoch": 2.18, "logps_train/chosen": -165.83389282226562, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -161.61724853515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1698150634765625, "rewards_train/margins": 3.5971102714538574, "rewards_train/rejected": -5.76692533493042, "step": 191 }, { "epoch": 2.18, "logps_train/chosen": -139.57894897460938, "logps_train/ref_chosen": -124.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -169.34829711914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.577218770980835, "rewards_train/margins": 4.069085359573364, "rewards_train/rejected": -5.646304130554199, "step": 191 }, { "epoch": 2.18, "logps_train/chosen": -188.30484008789062, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -145.0, "logps_train/rejected": -201.06988525390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2504067420959473, "rewards_train/margins": 4.388809680938721, "rewards_train/rejected": -5.639216423034668, "step": 191 }, { "epoch": 2.18, "logps_train/chosen": -193.16360473632812, "logps_train/ref_chosen": -180.0, "logps_train/ref_rejected": -143.0, "logps_train/rejected": -201.50115966796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3569847345352173, "rewards_train/margins": 4.465201735496521, "rewards_train/rejected": -5.822186470031738, "step": 191 }, { "epoch": 2.19, "learning_rate": 8.628481651367876e-06, "loss": 0.0616, "step": 192 }, { "epoch": 2.19, "logps_train/chosen": -127.2575912475586, "logps_train/ref_chosen": -104.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -145.93881225585938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.2746853828430176, "rewards_train/margins": 3.400190830230713, "rewards_train/rejected": -5.6748762130737305, "step": 192 }, { "epoch": 2.19, "logps_train/chosen": -160.23959350585938, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -163.14401245117188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.8308442831039429, "rewards_train/margins": 4.124375939369202, "rewards_train/rejected": -5.9552202224731445, "step": 192 }, { "epoch": 2.19, "logps_train/chosen": -203.55633544921875, "logps_train/ref_chosen": -195.0, "logps_train/ref_rejected": -159.0, "logps_train/rejected": -215.8226776123047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8862971067428589, "rewards_train/margins": 4.777416348457336, "rewards_train/rejected": -5.663713455200195, "step": 192 }, { "epoch": 2.19, "logps_train/chosen": -154.07431030273438, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -111.5, "logps_train/rejected": -165.9130859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1087489128112793, "rewards_train/margins": 4.343350887298584, "rewards_train/rejected": -5.452099800109863, "step": 192 }, { "epoch": 2.21, "learning_rate": 8.395139427491517e-06, "loss": 0.0737, "step": 193 }, { "epoch": 2.21, "logps_train/chosen": -161.2333526611328, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -172.9558868408203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.696040391921997, "rewards_train/margins": 3.8845107555389404, "rewards_train/rejected": -5.5805511474609375, "step": 193 }, { "epoch": 2.21, "logps_train/chosen": -179.02362060546875, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -148.0, "logps_train/rejected": -207.66558837890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.911834478378296, "rewards_train/margins": 4.097449064254761, "rewards_train/rejected": -6.009283542633057, "step": 193 }, { "epoch": 2.21, "logps_train/chosen": -180.44769287109375, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -135.0, "logps_train/rejected": -193.8340301513672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.302825689315796, "rewards_train/margins": 4.537803888320923, "rewards_train/rejected": -5.840629577636719, "step": 193 }, { "epoch": 2.21, "logps_train/chosen": -169.3468017578125, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -113.5, "logps_train/rejected": -172.2258758544922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8215930461883545, "rewards_train/margins": 4.061346769332886, "rewards_train/rejected": -5.88293981552124, "step": 193 }, { "epoch": 2.22, "learning_rate": 8.164357488875348e-06, "loss": 0.0566, "step": 194 }, { "epoch": 2.22, "logps_train/chosen": -200.07351684570312, "logps_train/ref_chosen": -184.0, "logps_train/ref_rejected": -151.0, "logps_train/rejected": -209.89173889160156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5717073678970337, "rewards_train/margins": 4.342369437217712, "rewards_train/rejected": -5.914076805114746, "step": 194 }, { "epoch": 2.22, "logps_train/chosen": -171.31915283203125, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -182.50357055664062, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.122978925704956, "rewards_train/margins": 4.234654664993286, "rewards_train/rejected": -5.357633590698242, "step": 194 }, { "epoch": 2.22, "logps_train/chosen": -115.46754455566406, "logps_train/ref_chosen": -97.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -135.1890106201172, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.8070234060287476, "rewards_train/margins": 3.7197171449661255, "rewards_train/rejected": -5.526740550994873, "step": 194 }, { "epoch": 2.22, "logps_train/chosen": -158.11651611328125, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -120.5, "logps_train/rejected": -179.56265258789062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8428046703338623, "rewards_train/margins": 4.047834157943726, "rewards_train/rejected": -5.890638828277588, "step": 194 }, { "epoch": 2.23, "learning_rate": 7.936171419533653e-06, "loss": 0.0772, "step": 195 }, { "epoch": 2.23, "logps_train/chosen": -175.13031005859375, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -145.96676635742188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.54506254196167, "rewards_train/margins": 4.095754146575928, "rewards_train/rejected": -5.640816688537598, "step": 195 }, { "epoch": 2.23, "logps_train/chosen": -173.42425537109375, "logps_train/ref_chosen": -157.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -184.82421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6176198720932007, "rewards_train/margins": 4.269880890846252, "rewards_train/rejected": -5.887500762939453, "step": 195 }, { "epoch": 2.23, "logps_train/chosen": -196.7584686279297, "logps_train/ref_chosen": -183.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -205.75381469726562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.33219575881958, "rewards_train/margins": 4.877925395965576, "rewards_train/rejected": -6.210121154785156, "step": 195 }, { "epoch": 2.23, "logps_train/chosen": -184.88011169433594, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -196.70294189453125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.7086167335510254, "rewards_train/margins": 4.16851282119751, "rewards_train/rejected": -5.877129554748535, "step": 195 }, { "epoch": 2.24, "learning_rate": 7.710616403226459e-06, "loss": 0.0644, "step": 196 }, { "epoch": 2.24, "logps_train/chosen": -154.9520263671875, "logps_train/ref_chosen": -135.0, "logps_train/ref_rejected": -120.5, "logps_train/rejected": -183.3641815185547, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.9811406135559082, "rewards_train/margins": 4.288285732269287, "rewards_train/rejected": -6.269426345825195, "step": 196 }, { "epoch": 2.24, "logps_train/chosen": -180.44895935058594, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -169.0, "logps_train/rejected": -230.28929138183594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.85615074634552, "rewards_train/margins": 5.313965678215027, "rewards_train/rejected": -6.170116424560547, "step": 196 }, { "epoch": 2.24, "logps_train/chosen": -158.23910522460938, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -124.5, "logps_train/rejected": -179.76210021972656, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.5342793464660645, "rewards_train/margins": 3.97666597366333, "rewards_train/rejected": -5.5109453201293945, "step": 196 }, { "epoch": 2.24, "logps_train/chosen": -148.6269073486328, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -161.82305908203125, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.707613229751587, "rewards_train/margins": 4.167392015457153, "rewards_train/rejected": -5.87500524520874, "step": 196 }, { "epoch": 2.25, "learning_rate": 7.487727218034646e-06, "loss": 0.0777, "step": 197 }, { "epoch": 2.25, "logps_train/chosen": -184.1099853515625, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -201.27886962890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.521935224533081, "rewards_train/margins": 4.687398195266724, "rewards_train/rejected": -6.209333419799805, "step": 197 }, { "epoch": 2.25, "logps_train/chosen": -162.78753662109375, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -185.3925323486328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8591243028640747, "rewards_train/margins": 4.596633791923523, "rewards_train/rejected": -6.455758094787598, "step": 197 }, { "epoch": 2.25, "logps_train/chosen": -155.07752990722656, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -191.83187866210938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9492814540863037, "rewards_train/margins": 4.413836717605591, "rewards_train/rejected": -6.3631181716918945, "step": 197 }, { "epoch": 2.25, "logps_train/chosen": -158.76815795898438, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -183.657958984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2400364875793457, "rewards_train/margins": 3.898550510406494, "rewards_train/rejected": -6.13858699798584, "step": 197 }, { "epoch": 2.26, "learning_rate": 7.267538230997487e-06, "loss": 0.065, "step": 198 }, { "epoch": 2.26, "logps_train/chosen": -154.61886596679688, "logps_train/ref_chosen": -139.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -165.445556640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5989946126937866, "rewards_train/margins": 3.7381139993667603, "rewards_train/rejected": -5.337108612060547, "step": 198 }, { "epoch": 2.26, "logps_train/chosen": -174.43276977539062, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -194.56985473632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.813735842704773, "rewards_train/margins": 3.821837544441223, "rewards_train/rejected": -5.635573387145996, "step": 198 }, { "epoch": 2.26, "logps_train/chosen": -171.7181396484375, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -204.68441772460938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.9548206329345703, "rewards_train/margins": 4.630368232727051, "rewards_train/rejected": -6.585188865661621, "step": 198 }, { "epoch": 2.26, "logps_train/chosen": -156.4467010498047, "logps_train/ref_chosen": -141.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -171.12014770507812, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.553849220275879, "rewards_train/margins": 3.99556827545166, "rewards_train/rejected": -5.549417495727539, "step": 198 }, { "epoch": 2.27, "learning_rate": 7.05008339281365e-06, "loss": 0.079, "step": 199 }, { "epoch": 2.27, "logps_train/chosen": -201.37559509277344, "logps_train/ref_chosen": -189.0, "logps_train/ref_rejected": -159.0, "logps_train/rejected": -222.2662811279297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2338494062423706, "rewards_train/margins": 5.101566910743713, "rewards_train/rejected": -6.335416316986084, "step": 199 }, { "epoch": 2.27, "logps_train/chosen": -155.3743438720703, "logps_train/ref_chosen": -135.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -185.0620880126953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0335280895233154, "rewards_train/margins": 4.135473966598511, "rewards_train/rejected": -6.169002056121826, "step": 199 }, { "epoch": 2.27, "logps_train/chosen": -214.00473022460938, "logps_train/ref_chosen": -203.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -188.80557250976562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0920145511627197, "rewards_train/margins": 4.752117395401001, "rewards_train/rejected": -5.844131946563721, "step": 199 }, { "epoch": 2.27, "logps_train/chosen": -157.0347900390625, "logps_train/ref_chosen": -139.0, "logps_train/ref_rejected": -122.5, "logps_train/rejected": -181.71475219726562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7769157886505127, "rewards_train/margins": 4.140556573867798, "rewards_train/rejected": -5.9174723625183105, "step": 199 }, { "epoch": 2.29, "learning_rate": 6.835396232606414e-06, "loss": 0.0586, "step": 200 }, { "epoch": 2.29, "logps_train/chosen": -161.72032165527344, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -177.61468505859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.022033214569092, "rewards_train/margins": 4.182990550994873, "rewards_train/rejected": -6.205023765563965, "step": 200 }, { "epoch": 2.29, "logps_train/chosen": -134.15977478027344, "logps_train/ref_chosen": -110.5, "logps_train/ref_rejected": -97.5, "logps_train/rejected": -164.42269897460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.3491806983947754, "rewards_train/margins": 4.333714962005615, "rewards_train/rejected": -6.682895660400391, "step": 200 }, { "epoch": 2.29, "logps_train/chosen": -177.84542846679688, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -127.0, "logps_train/rejected": -192.08859252929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0337610244750977, "rewards_train/margins": 4.461132049560547, "rewards_train/rejected": -6.4948930740356445, "step": 200 }, { "epoch": 2.29, "logps_train/chosen": -196.11642456054688, "logps_train/ref_chosen": -180.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -202.897705078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6030492782592773, "rewards_train/margins": 4.695511817932129, "rewards_train/rejected": -6.298561096191406, "step": 200 }, { "epoch": 2.3, "learning_rate": 6.623509852753798e-06, "loss": 0.0547, "step": 201 }, { "epoch": 2.3, "logps_train/chosen": -207.26229858398438, "logps_train/ref_chosen": -188.0, "logps_train/ref_rejected": -145.0, "logps_train/rejected": -211.3671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9156838655471802, "rewards_train/margins": 4.669862151145935, "rewards_train/rejected": -6.585546016693115, "step": 201 }, { "epoch": 2.3, "logps_train/chosen": -154.91036987304688, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -166.4049072265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6626181602478027, "rewards_train/margins": 4.285686016082764, "rewards_train/rejected": -5.948304176330566, "step": 201 }, { "epoch": 2.3, "logps_train/chosen": -140.320556640625, "logps_train/ref_chosen": -120.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -179.07057189941406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0460212230682373, "rewards_train/margins": 4.754591226577759, "rewards_train/rejected": -6.800612449645996, "step": 201 }, { "epoch": 2.3, "logps_train/chosen": -167.5859832763672, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -170.61170959472656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4132864475250244, "rewards_train/margins": 4.2639000415802, "rewards_train/rejected": -5.677186489105225, "step": 201 }, { "epoch": 2.31, "learning_rate": 6.414456923784593e-06, "loss": 0.0495, "step": 202 }, { "epoch": 2.31, "logps_train/chosen": -174.45191955566406, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -183.5615234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.61784827709198, "rewards_train/margins": 4.5230690240859985, "rewards_train/rejected": -6.1409173011779785, "step": 202 }, { "epoch": 2.31, "logps_train/chosen": -164.99447631835938, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -153.0, "logps_train/rejected": -214.52035522460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3355815410614014, "rewards_train/margins": 4.774393320083618, "rewards_train/rejected": -6.1099748611450195, "step": 202 }, { "epoch": 2.31, "logps_train/chosen": -186.1416778564453, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -173.00531005859375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.7241289615631104, "rewards_train/margins": 3.9765965938568115, "rewards_train/rejected": -5.700725555419922, "step": 202 }, { "epoch": 2.31, "logps_train/chosen": -183.17779541015625, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -193.62240600585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.05098295211792, "rewards_train/margins": 4.535184383392334, "rewards_train/rejected": -6.586167335510254, "step": 202 }, { "epoch": 2.32, "learning_rate": 6.208269679340886e-06, "loss": 0.0549, "step": 203 }, { "epoch": 2.32, "logps_train/chosen": -164.665283203125, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -113.5, "logps_train/rejected": -175.95152282714844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9498772621154785, "rewards_train/margins": 4.297813415527344, "rewards_train/rejected": -6.247690677642822, "step": 203 }, { "epoch": 2.32, "logps_train/chosen": -179.72842407226562, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -199.34719848632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5472551584243774, "rewards_train/margins": 4.814515471458435, "rewards_train/rejected": -6.3617706298828125, "step": 203 }, { "epoch": 2.32, "logps_train/chosen": -139.86717224121094, "logps_train/ref_chosen": -115.5, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -162.47723388671875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.4136698246002197, "rewards_train/margins": 3.998114824295044, "rewards_train/rejected": -6.411784648895264, "step": 203 }, { "epoch": 2.32, "logps_train/chosen": -152.8355712890625, "logps_train/ref_chosen": -134.0, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -174.1539306640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8427374362945557, "rewards_train/margins": 4.584960222244263, "rewards_train/rejected": -6.427697658538818, "step": 203 }, { "epoch": 2.33, "learning_rate": 6.004979911208006e-06, "loss": 0.0592, "step": 204 }, { "epoch": 2.33, "logps_train/chosen": -150.49539184570312, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -201.37484741210938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8733186721801758, "rewards_train/margins": 4.557817459106445, "rewards_train/rejected": -6.431136131286621, "step": 204 }, { "epoch": 2.33, "logps_train/chosen": -145.48204040527344, "logps_train/ref_chosen": -119.5, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -165.45367431640625, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -2.6023051738739014, "rewards_train/margins": 3.6914026737213135, "rewards_train/rejected": -6.293707847595215, "step": 204 }, { "epoch": 2.33, "logps_train/chosen": -163.88088989257812, "logps_train/ref_chosen": -139.0, "logps_train/ref_rejected": -124.0, "logps_train/rejected": -191.28944396972656, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.48164439201355, "rewards_train/margins": 4.2371437549591064, "rewards_train/rejected": -6.718788146972656, "step": 204 }, { "epoch": 2.33, "logps_train/chosen": -130.41600036621094, "logps_train/ref_chosen": -109.0, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -160.140869140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.111424207687378, "rewards_train/margins": 4.071851015090942, "rewards_train/rejected": -6.18327522277832, "step": 204 }, { "epoch": 2.34, "learning_rate": 5.804618964412586e-06, "loss": 0.0761, "step": 205 }, { "epoch": 2.34, "logps_train/chosen": -164.7631072998047, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -161.48898315429688, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.1265063285827637, "rewards_train/margins": 3.8559865951538086, "rewards_train/rejected": -5.982492923736572, "step": 205 }, { "epoch": 2.34, "logps_train/chosen": -169.73573303222656, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -160.00714111328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.167128086090088, "rewards_train/margins": 3.958268642425537, "rewards_train/rejected": -6.125396728515625, "step": 205 }, { "epoch": 2.34, "logps_train/chosen": -210.82345581054688, "logps_train/ref_chosen": -191.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -185.9019012451172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9597394466400146, "rewards_train/margins": 4.313043832778931, "rewards_train/rejected": -6.272783279418945, "step": 205 }, { "epoch": 2.34, "logps_train/chosen": -204.55963134765625, "logps_train/ref_chosen": -180.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -194.1874237060547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.501666784286499, "rewards_train/margins": 4.02088475227356, "rewards_train/rejected": -6.522551536560059, "step": 205 }, { "epoch": 2.35, "learning_rate": 5.607217732389503e-06, "loss": 0.0711, "step": 206 }, { "epoch": 2.35, "logps_train/chosen": -168.4705352783203, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -167.60079956054688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6452956199645996, "rewards_train/margins": 4.4018449783325195, "rewards_train/rejected": -6.047140598297119, "step": 206 }, { "epoch": 2.35, "logps_train/chosen": -189.11264038085938, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -184.2760772705078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1286468505859375, "rewards_train/margins": 4.595640182495117, "rewards_train/rejected": -6.724287033081055, "step": 206 }, { "epoch": 2.35, "logps_train/chosen": -167.8518829345703, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -119.0, "logps_train/rejected": -179.5463104248047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.855109453201294, "rewards_train/margins": 4.18018651008606, "rewards_train/rejected": -6.0352959632873535, "step": 206 }, { "epoch": 2.35, "logps_train/chosen": -165.7491912841797, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -152.43096923828125, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.695818305015564, "rewards_train/margins": 4.109387040138245, "rewards_train/rejected": -5.805205345153809, "step": 206 }, { "epoch": 2.37, "learning_rate": 5.412806652218469e-06, "loss": 0.0656, "step": 207 }, { "epoch": 2.37, "logps_train/chosen": -169.99899291992188, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -170.46067810058594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8358477354049683, "rewards_train/margins": 4.719790577888489, "rewards_train/rejected": -6.555638313293457, "step": 207 }, { "epoch": 2.37, "logps_train/chosen": -193.06130981445312, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -196.26321411132812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2590603828430176, "rewards_train/margins": 4.279368877410889, "rewards_train/rejected": -6.538429260253906, "step": 207 }, { "epoch": 2.37, "logps_train/chosen": -193.26812744140625, "logps_train/ref_chosen": -180.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -202.84500122070312, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.311529517173767, "rewards_train/margins": 5.111006379127502, "rewards_train/rejected": -6.4225358963012695, "step": 207 }, { "epoch": 2.37, "logps_train/chosen": -148.03170776367188, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -121.5, "logps_train/rejected": -186.12869262695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.667038917541504, "rewards_train/margins": 4.782499313354492, "rewards_train/rejected": -6.449538230895996, "step": 207 }, { "epoch": 2.38, "learning_rate": 5.221415699930951e-06, "loss": 0.0451, "step": 208 }, { "epoch": 2.38, "logps_train/chosen": -194.32412719726562, "logps_train/ref_chosen": -174.0, "logps_train/ref_rejected": -124.5, "logps_train/rejected": -189.37872314453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.066347599029541, "rewards_train/margins": 4.420987129211426, "rewards_train/rejected": -6.487334728240967, "step": 208 }, { "epoch": 2.38, "logps_train/chosen": -185.17953491210938, "logps_train/ref_chosen": -161.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -201.2667236328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.35320782661438, "rewards_train/margins": 4.4406516551971436, "rewards_train/rejected": -6.793859481811523, "step": 208 }, { "epoch": 2.38, "logps_train/chosen": -205.48251342773438, "logps_train/ref_chosen": -195.0, "logps_train/ref_rejected": -160.0, "logps_train/rejected": -219.9037628173828, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.028524398803711, "rewards_train/margins": 5.0177106857299805, "rewards_train/rejected": -6.046235084533691, "step": 208 }, { "epoch": 2.38, "logps_train/chosen": -159.63363647460938, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -179.73924255371094, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.698287844657898, "rewards_train/margins": 4.654713749885559, "rewards_train/rejected": -6.353001594543457, "step": 208 }, { "epoch": 2.39, "learning_rate": 5.033074385888189e-06, "loss": 0.0496, "step": 209 }, { "epoch": 2.39, "logps_train/chosen": -171.902099609375, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -179.21429443359375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -2.372437000274658, "rewards_train/margins": 4.280241966247559, "rewards_train/rejected": -6.652678966522217, "step": 209 }, { "epoch": 2.39, "logps_train/chosen": -148.18333435058594, "logps_train/ref_chosen": -132.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -195.27471923828125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.6556390523910522, "rewards_train/margins": 4.759748101234436, "rewards_train/rejected": -6.415387153625488, "step": 209 }, { "epoch": 2.39, "logps_train/chosen": -197.36880493164062, "logps_train/ref_chosen": -179.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -206.9953155517578, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.864400863647461, "rewards_train/margins": 5.523899555206299, "rewards_train/rejected": -7.38830041885376, "step": 209 }, { "epoch": 2.39, "logps_train/chosen": -198.30517578125, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -149.0, "logps_train/rejected": -218.72198486328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.7881360054016113, "rewards_train/margins": 4.146368980407715, "rewards_train/rejected": -6.934504985809326, "step": 209 }, { "epoch": 2.4, "learning_rate": 4.847811750231057e-06, "loss": 0.0851, "step": 210 }, { "epoch": 2.4, "logps_train/chosen": -196.61697387695312, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -168.0, "logps_train/rejected": -235.59994506835938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0059356689453125, "rewards_train/margins": 4.734723091125488, "rewards_train/rejected": -6.740658760070801, "step": 210 }, { "epoch": 2.4, "logps_train/chosen": -177.05670166015625, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -194.96536254882812, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.9225655794143677, "rewards_train/margins": 4.452094912528992, "rewards_train/rejected": -6.374660491943359, "step": 210 }, { "epoch": 2.4, "logps_train/chosen": -209.01028442382812, "logps_train/ref_chosen": -187.0, "logps_train/ref_rejected": -152.0, "logps_train/rejected": -218.34353637695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1807167530059814, "rewards_train/margins": 4.460277795791626, "rewards_train/rejected": -6.640994548797607, "step": 210 }, { "epoch": 2.4, "logps_train/chosen": -218.32777404785156, "logps_train/ref_chosen": -199.0, "logps_train/ref_rejected": -148.0, "logps_train/rejected": -211.5435333251953, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.9003549814224243, "rewards_train/margins": 4.402435660362244, "rewards_train/rejected": -6.302790641784668, "step": 210 }, { "epoch": 2.41, "learning_rate": 4.665656358402395e-06, "loss": 0.0516, "step": 211 }, { "epoch": 2.41, "logps_train/chosen": -173.509521484375, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -179.2303009033203, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.8173097372055054, "rewards_train/margins": 4.784577965736389, "rewards_train/rejected": -6.6018877029418945, "step": 211 }, { "epoch": 2.41, "logps_train/chosen": -172.63043212890625, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -187.04351806640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2126522064208984, "rewards_train/margins": 4.117969512939453, "rewards_train/rejected": -6.330621719360352, "step": 211 }, { "epoch": 2.41, "logps_train/chosen": -221.95367431640625, "logps_train/ref_chosen": -201.0, "logps_train/ref_rejected": -170.0, "logps_train/rejected": -236.30203247070312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0931222438812256, "rewards_train/margins": 4.478243589401245, "rewards_train/rejected": -6.571365833282471, "step": 211 }, { "epoch": 2.41, "logps_train/chosen": -185.05625915527344, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -126.5, "logps_train/rejected": -186.24945068359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3360211849212646, "rewards_train/margins": 4.64351487159729, "rewards_train/rejected": -5.979536056518555, "step": 211 }, { "epoch": 2.42, "learning_rate": 4.486636296742506e-06, "loss": 0.0509, "step": 212 }, { "epoch": 2.42, "logps_train/chosen": -193.8851318359375, "logps_train/ref_chosen": -171.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -187.48406982421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2984743118286133, "rewards_train/margins": 4.142901420593262, "rewards_train/rejected": -6.441375732421875, "step": 212 }, { "epoch": 2.42, "logps_train/chosen": -180.93228149414062, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -183.302978515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5795819759368896, "rewards_train/margins": 4.18977952003479, "rewards_train/rejected": -5.76936149597168, "step": 212 }, { "epoch": 2.42, "logps_train/chosen": -163.54148864746094, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -170.50917053222656, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.1980700492858887, "rewards_train/margins": 3.664858818054199, "rewards_train/rejected": -5.862928867340088, "step": 212 }, { "epoch": 2.42, "logps_train/chosen": -172.4043731689453, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -119.0, "logps_train/rejected": -184.43423461914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.632063388824463, "rewards_train/margins": 4.868025302886963, "rewards_train/rejected": -6.500088691711426, "step": 212 }, { "epoch": 2.43, "learning_rate": 4.3107791681585655e-06, "loss": 0.0701, "step": 213 }, { "epoch": 2.43, "logps_train/chosen": -182.57791137695312, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -188.65792846679688, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.9670684337615967, "rewards_train/margins": 4.3333799839019775, "rewards_train/rejected": -6.300448417663574, "step": 213 }, { "epoch": 2.43, "logps_train/chosen": -178.8064422607422, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -121.5, "logps_train/rejected": -188.4781494140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.4776663780212402, "rewards_train/margins": 4.212725639343262, "rewards_train/rejected": -6.690392017364502, "step": 213 }, { "epoch": 2.43, "logps_train/chosen": -180.3985595703125, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -127.0, "logps_train/rejected": -190.71658325195312, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.1126112937927246, "rewards_train/margins": 4.2702765464782715, "rewards_train/rejected": -6.382887840270996, "step": 213 }, { "epoch": 2.43, "logps_train/chosen": -214.46804809570312, "logps_train/ref_chosen": -188.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -214.39895629882812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.7165300846099854, "rewards_train/margins": 4.324148416519165, "rewards_train/rejected": -7.04067850112915, "step": 213 }, { "epoch": 2.45, "learning_rate": 4.138112087868576e-06, "loss": 0.0602, "step": 214 }, { "epoch": 2.45, "logps_train/chosen": -184.88540649414062, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -208.5177001953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1092917919158936, "rewards_train/margins": 5.026073694229126, "rewards_train/rejected": -7.1353654861450195, "step": 214 }, { "epoch": 2.45, "logps_train/chosen": -168.20083618164062, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -153.44461059570312, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.237173557281494, "rewards_train/margins": 3.7916626930236816, "rewards_train/rejected": -6.028836250305176, "step": 214 }, { "epoch": 2.45, "logps_train/chosen": -139.26658630371094, "logps_train/ref_chosen": -117.5, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -174.18048095703125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.160618305206299, "rewards_train/margins": 4.620589733123779, "rewards_train/rejected": -6.781208038330078, "step": 214 }, { "epoch": 2.45, "logps_train/chosen": -167.5859375, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -176.97976684570312, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.373340606689453, "rewards_train/margins": 4.333230018615723, "rewards_train/rejected": -6.706570625305176, "step": 214 }, { "epoch": 2.46, "learning_rate": 3.968661679220468e-06, "loss": 0.0675, "step": 215 }, { "epoch": 2.46, "logps_train/chosen": -178.07106018066406, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -178.66525268554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.273902416229248, "rewards_train/margins": 4.306295394897461, "rewards_train/rejected": -6.580197811126709, "step": 215 }, { "epoch": 2.46, "logps_train/chosen": -186.64599609375, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -190.8783721923828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4751794338226318, "rewards_train/margins": 4.996055841445923, "rewards_train/rejected": -6.471235275268555, "step": 215 }, { "epoch": 2.46, "logps_train/chosen": -155.9014129638672, "logps_train/ref_chosen": -141.0, "logps_train/ref_rejected": -111.5, "logps_train/rejected": -170.11361694335938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.4900078773498535, "rewards_train/margins": 4.389421463012695, "rewards_train/rejected": -5.879429340362549, "step": 215 }, { "epoch": 2.46, "logps_train/chosen": -206.13455200195312, "logps_train/ref_chosen": -191.0, "logps_train/ref_rejected": -148.0, "logps_train/rejected": -218.8090057373047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4799585342407227, "rewards_train/margins": 5.572231292724609, "rewards_train/rejected": -7.052189826965332, "step": 215 }, { "epoch": 2.47, "learning_rate": 3.8024540695871274e-06, "loss": 0.0435, "step": 216 }, { "epoch": 2.47, "logps_train/chosen": -211.99249267578125, "logps_train/ref_chosen": -196.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -193.9930419921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6373107433319092, "rewards_train/margins": 4.699286699295044, "rewards_train/rejected": -6.336597442626953, "step": 216 }, { "epoch": 2.47, "logps_train/chosen": -162.2291259765625, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -182.76242065429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9232053756713867, "rewards_train/margins": 4.727743148803711, "rewards_train/rejected": -6.650948524475098, "step": 216 }, { "epoch": 2.47, "logps_train/chosen": -186.01992797851562, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -187.78819274902344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7814736366271973, "rewards_train/margins": 4.773713111877441, "rewards_train/rejected": -6.555186748504639, "step": 216 }, { "epoch": 2.47, "logps_train/chosen": -179.65704345703125, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -193.5622100830078, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.4718570709228516, "rewards_train/margins": 3.95780086517334, "rewards_train/rejected": -6.429657936096191, "step": 216 }, { "epoch": 2.48, "learning_rate": 3.6395148863377858e-06, "loss": 0.0425, "step": 217 }, { "epoch": 2.48, "logps_train/chosen": -185.7981719970703, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -194.738037109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9814281463623047, "rewards_train/margins": 4.51200532913208, "rewards_train/rejected": -6.493433475494385, "step": 217 }, { "epoch": 2.48, "logps_train/chosen": -174.4874267578125, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -197.3990020751953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0167109966278076, "rewards_train/margins": 4.54799485206604, "rewards_train/rejected": -6.564705848693848, "step": 217 }, { "epoch": 2.48, "logps_train/chosen": -186.6697998046875, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -150.0, "logps_train/rejected": -213.67283630371094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.491588830947876, "rewards_train/margins": 4.921689748764038, "rewards_train/rejected": -6.413278579711914, "step": 217 }, { "epoch": 2.48, "logps_train/chosen": -185.66867065429688, "logps_train/ref_chosen": -157.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -215.50732421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.834641695022583, "rewards_train/margins": 4.791409254074097, "rewards_train/rejected": -7.62605094909668, "step": 217 }, { "epoch": 2.49, "learning_rate": 3.4798692528866057e-06, "loss": 0.0431, "step": 218 }, { "epoch": 2.49, "logps_train/chosen": -158.78350830078125, "logps_train/ref_chosen": -134.0, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -162.4704132080078, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.4692697525024414, "rewards_train/margins": 4.1036505699157715, "rewards_train/rejected": -6.572920322418213, "step": 218 }, { "epoch": 2.49, "logps_train/chosen": -181.54193115234375, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -148.0, "logps_train/rejected": -218.23255920410156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7103451490402222, "rewards_train/margins": 5.347406983375549, "rewards_train/rejected": -7.0577521324157715, "step": 218 }, { "epoch": 2.49, "logps_train/chosen": -175.21749877929688, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -192.23492431640625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.2478251457214355, "rewards_train/margins": 4.507697582244873, "rewards_train/rejected": -6.755522727966309, "step": 218 }, { "epoch": 2.49, "logps_train/chosen": -182.02206420898438, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -197.85394287109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8292454481124878, "rewards_train/margins": 4.512984395027161, "rewards_train/rejected": -6.342229843139648, "step": 218 }, { "epoch": 2.5, "learning_rate": 3.3235417848188983e-06, "loss": 0.0539, "step": 219 }, { "epoch": 2.5, "logps_train/chosen": -184.61679077148438, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -177.779296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.90328049659729, "rewards_train/margins": 4.3316810131073, "rewards_train/rejected": -6.23496150970459, "step": 219 }, { "epoch": 2.5, "logps_train/chosen": -141.23770141601562, "logps_train/ref_chosen": -116.0, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -154.41409301757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.4947173595428467, "rewards_train/margins": 4.093469858169556, "rewards_train/rejected": -6.588187217712402, "step": 219 }, { "epoch": 2.5, "logps_train/chosen": -149.37660217285156, "logps_train/ref_chosen": -124.5, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -173.98623657226562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.497622013092041, "rewards_train/margins": 4.532507419586182, "rewards_train/rejected": -7.030129432678223, "step": 219 }, { "epoch": 2.5, "logps_train/chosen": -192.66302490234375, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -148.0, "logps_train/rejected": -212.57070922851562, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.0915956497192383, "rewards_train/margins": 4.3946027755737305, "rewards_train/rejected": -6.486198425292969, "step": 219 }, { "epoch": 2.51, "learning_rate": 3.170556586095699e-06, "loss": 0.0649, "step": 220 }, { "epoch": 2.51, "logps_train/chosen": -189.02066040039062, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -177.7373046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.7520666122436523, "rewards_train/margins": 4.081625461578369, "rewards_train/rejected": -6.8336920738220215, "step": 220 }, { "epoch": 2.51, "logps_train/chosen": -176.2943878173828, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -187.67111206054688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.6185994148254395, "rewards_train/margins": 4.177418231964111, "rewards_train/rejected": -6.796017646789551, "step": 220 }, { "epoch": 2.51, "logps_train/chosen": -151.6334686279297, "logps_train/ref_chosen": -131.0, "logps_train/ref_rejected": -95.0, "logps_train/rejected": -156.61749267578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.107999324798584, "rewards_train/margins": 4.088809013366699, "rewards_train/rejected": -6.196808338165283, "step": 220 }, { "epoch": 2.51, "logps_train/chosen": -145.2259521484375, "logps_train/ref_chosen": -127.0, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -161.01107788085938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.803601622581482, "rewards_train/margins": 4.087031006813049, "rewards_train/rejected": -5.890632629394531, "step": 220 }, { "epoch": 2.53, "learning_rate": 3.0209372453372077e-06, "loss": 0.0763, "step": 221 }, { "epoch": 2.53, "logps_train/chosen": -177.82476806640625, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -166.97608947753906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1691970825195312, "rewards_train/margins": 4.20849084854126, "rewards_train/rejected": -6.377687931060791, "step": 221 }, { "epoch": 2.53, "logps_train/chosen": -163.2164306640625, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -205.69760131835938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.090000867843628, "rewards_train/margins": 4.568700075149536, "rewards_train/rejected": -6.658700942993164, "step": 221 }, { "epoch": 2.53, "logps_train/chosen": -165.90538024902344, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -115.0, "logps_train/rejected": -179.6652069091797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1149516105651855, "rewards_train/margins": 4.325690269470215, "rewards_train/rejected": -6.4406418800354, "step": 221 }, { "epoch": 2.53, "logps_train/chosen": -191.64077758789062, "logps_train/ref_chosen": -173.0, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -155.56088256835938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.883876919746399, "rewards_train/margins": 4.743695616722107, "rewards_train/rejected": -6.627572536468506, "step": 221 }, { "epoch": 2.54, "learning_rate": 2.8747068321856556e-06, "loss": 0.0509, "step": 222 }, { "epoch": 2.54, "logps_train/chosen": -175.29031372070312, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -152.0, "logps_train/rejected": -223.1751708984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.350759983062744, "rewards_train/margins": 4.803475379943848, "rewards_train/rejected": -7.154235363006592, "step": 222 }, { "epoch": 2.54, "logps_train/chosen": -191.7686767578125, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -201.64723205566406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.020545721054077, "rewards_train/margins": 4.550428628921509, "rewards_train/rejected": -6.570974349975586, "step": 222 }, { "epoch": 2.54, "logps_train/chosen": -204.0155029296875, "logps_train/ref_chosen": -178.0, "logps_train/ref_rejected": -144.0, "logps_train/rejected": -215.03477478027344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.5512094497680664, "rewards_train/margins": 4.560812950134277, "rewards_train/rejected": -7.112022399902344, "step": 222 }, { "epoch": 2.54, "logps_train/chosen": -166.68942260742188, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -186.78805541992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.4038538932800293, "rewards_train/margins": 4.059913635253906, "rewards_train/rejected": -6.4637675285339355, "step": 222 }, { "epoch": 2.55, "learning_rate": 2.731887893748242e-06, "loss": 0.0351, "step": 223 }, { "epoch": 2.55, "logps_train/chosen": -181.3331298828125, "logps_train/ref_chosen": -157.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -190.67776489257812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.4753060340881348, "rewards_train/margins": 3.9983296394348145, "rewards_train/rejected": -6.473635673522949, "step": 223 }, { "epoch": 2.55, "logps_train/chosen": -164.18081665039062, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -196.8584747314453, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.398452043533325, "rewards_train/margins": 4.512004613876343, "rewards_train/rejected": -6.910456657409668, "step": 223 }, { "epoch": 2.55, "logps_train/chosen": -180.45425415039062, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -187.70980834960938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.8418128490448, "rewards_train/margins": 4.628679037094116, "rewards_train/rejected": -7.470491886138916, "step": 223 }, { "epoch": 2.55, "logps_train/chosen": -182.337890625, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -192.41957092285156, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.7076172828674316, "rewards_train/margins": 4.944886684417725, "rewards_train/rejected": -6.652503967285156, "step": 223 }, { "epoch": 2.56, "learning_rate": 2.5925024511206207e-06, "loss": 0.0606, "step": 224 }, { "epoch": 2.56, "logps_train/chosen": -193.09463500976562, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -207.1976776123047, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.4825100898742676, "rewards_train/margins": 4.484133243560791, "rewards_train/rejected": -6.966643333435059, "step": 224 }, { "epoch": 2.56, "logps_train/chosen": -160.484619140625, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -165.8876190185547, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.7858647108078003, "rewards_train/margins": 4.05990469455719, "rewards_train/rejected": -5.84576940536499, "step": 224 }, { "epoch": 2.56, "logps_train/chosen": -203.03643798828125, "logps_train/ref_chosen": -188.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -178.2891082763672, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.5149726867675781, "rewards_train/margins": 4.285617828369141, "rewards_train/rejected": -5.800590515136719, "step": 224 }, { "epoch": 2.56, "logps_train/chosen": -177.3469696044922, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -200.88308715820312, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.568193197250366, "rewards_train/margins": 4.182811975479126, "rewards_train/rejected": -6.751005172729492, "step": 224 }, { "epoch": 2.57, "learning_rate": 2.45657199599148e-06, "loss": 0.0759, "step": 225 }, { "epoch": 2.57, "logps_train/chosen": -173.48208618164062, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -146.0, "logps_train/rejected": -207.01780700683594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8036763668060303, "rewards_train/margins": 4.3025963306427, "rewards_train/rejected": -6.1062726974487305, "step": 225 }, { "epoch": 2.57, "logps_train/chosen": -188.08656311035156, "logps_train/ref_chosen": -175.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -201.78933715820312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3506485223770142, "rewards_train/margins": 4.938831686973572, "rewards_train/rejected": -6.289480209350586, "step": 225 }, { "epoch": 2.57, "logps_train/chosen": -116.22421264648438, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -164.86378479003906, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.71127986907959, "rewards_train/margins": 3.7973580360412598, "rewards_train/rejected": -6.50863790512085, "step": 225 }, { "epoch": 2.57, "logps_train/chosen": -160.8203125, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -185.1500244140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3180701732635498, "rewards_train/margins": 4.57881760597229, "rewards_train/rejected": -5.89688777923584, "step": 225 }, { "epoch": 2.58, "learning_rate": 2.324117487328789e-06, "loss": 0.0696, "step": 226 }, { "epoch": 2.58, "logps_train/chosen": -172.0726776123047, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -118.0, "logps_train/rejected": -184.76156616210938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2580490112304688, "rewards_train/margins": 4.4107346534729, "rewards_train/rejected": -6.668783664703369, "step": 226 }, { "epoch": 2.58, "logps_train/chosen": -170.86233520507812, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -213.73233032226562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.606351375579834, "rewards_train/margins": 5.066394329071045, "rewards_train/rejected": -7.672745704650879, "step": 226 }, { "epoch": 2.58, "logps_train/chosen": -155.60812377929688, "logps_train/ref_chosen": -135.0, "logps_train/ref_rejected": -110.5, "logps_train/rejected": -177.47715759277344, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.057589054107666, "rewards_train/margins": 4.667080402374268, "rewards_train/rejected": -6.724669456481934, "step": 226 }, { "epoch": 2.58, "logps_train/chosen": -188.93655395507812, "logps_train/ref_chosen": -173.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -175.76377868652344, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.6039090156555176, "rewards_train/margins": 3.646785259246826, "rewards_train/rejected": -5.250694274902344, "step": 226 }, { "epoch": 2.59, "learning_rate": 2.1951593481481237e-06, "loss": 0.0553, "step": 227 }, { "epoch": 2.59, "logps_train/chosen": -165.5153350830078, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -174.52703857421875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.7658891677856445, "rewards_train/margins": 5.043651580810547, "rewards_train/rejected": -6.809540748596191, "step": 227 }, { "epoch": 2.59, "logps_train/chosen": -160.86737060546875, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -181.62149047851562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.4635443687438965, "rewards_train/margins": 4.161396026611328, "rewards_train/rejected": -6.624940395355225, "step": 227 }, { "epoch": 2.59, "logps_train/chosen": -167.77220153808594, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -166.71109008789062, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.4367904663085938, "rewards_train/margins": 4.332364082336426, "rewards_train/rejected": -6.7691545486450195, "step": 227 }, { "epoch": 2.59, "logps_train/chosen": -157.61546325683594, "logps_train/ref_chosen": -141.0, "logps_train/ref_rejected": -115.0, "logps_train/rejected": -174.14193725585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6726795434951782, "rewards_train/margins": 4.217974066734314, "rewards_train/rejected": -5.890653610229492, "step": 227 }, { "epoch": 2.61, "learning_rate": 2.0697174623636794e-06, "loss": 0.0618, "step": 228 }, { "epoch": 2.61, "logps_train/chosen": -177.1771697998047, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -124.0, "logps_train/rejected": -185.34759521484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.171818733215332, "rewards_train/margins": 3.963038444519043, "rewards_train/rejected": -6.134857177734375, "step": 228 }, { "epoch": 2.61, "logps_train/chosen": -184.7438507080078, "logps_train/ref_chosen": -165.0, "logps_train/ref_rejected": -120.5, "logps_train/rejected": -185.59344482421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.953291416168213, "rewards_train/margins": 4.560252666473389, "rewards_train/rejected": -6.513544082641602, "step": 228 }, { "epoch": 2.61, "logps_train/chosen": -173.85379028320312, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -112.5, "logps_train/rejected": -178.25070190429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.407742500305176, "rewards_train/margins": 4.145502090454102, "rewards_train/rejected": -6.553244590759277, "step": 228 }, { "epoch": 2.61, "logps_train/chosen": -158.80514526367188, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -188.6862335205078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4136208295822144, "rewards_train/margins": 4.584104657173157, "rewards_train/rejected": -5.997725486755371, "step": 228 }, { "epoch": 2.62, "learning_rate": 1.947811171722397e-06, "loss": 0.0588, "step": 229 }, { "epoch": 2.62, "logps_train/chosen": -171.05941772460938, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -162.3818817138672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7870943546295166, "rewards_train/margins": 4.338154077529907, "rewards_train/rejected": -6.125248432159424, "step": 229 }, { "epoch": 2.62, "logps_train/chosen": -142.09471130371094, "logps_train/ref_chosen": -114.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -192.31854248046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.8323230743408203, "rewards_train/margins": 4.283173084259033, "rewards_train/rejected": -7.1154961585998535, "step": 229 }, { "epoch": 2.62, "logps_train/chosen": -179.9519500732422, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -205.62069702148438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.694608449935913, "rewards_train/margins": 4.932012319564819, "rewards_train/rejected": -6.626620769500732, "step": 229 }, { "epoch": 2.62, "logps_train/chosen": -160.87034606933594, "logps_train/ref_chosen": -139.0, "logps_train/ref_rejected": -112.5, "logps_train/rejected": -182.16702270507812, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.20851993560791, "rewards_train/margins": 4.76931619644165, "rewards_train/rejected": -6.9778361320495605, "step": 229 }, { "epoch": 2.63, "learning_rate": 1.8294592728216765e-06, "loss": 0.0485, "step": 230 }, { "epoch": 2.63, "logps_train/chosen": -157.534423828125, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -174.2003173828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9986941814422607, "rewards_train/margins": 4.557275056838989, "rewards_train/rejected": -6.55596923828125, "step": 230 }, { "epoch": 2.63, "logps_train/chosen": -168.04296875, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -171.24734497070312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8662118911743164, "rewards_train/margins": 4.531960487365723, "rewards_train/rejected": -6.398172378540039, "step": 230 }, { "epoch": 2.63, "logps_train/chosen": -185.285400390625, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -170.35165405273438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4988040924072266, "rewards_train/margins": 4.609604835510254, "rewards_train/rejected": -6.1084089279174805, "step": 230 }, { "epoch": 2.63, "logps_train/chosen": -173.4306640625, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -119.5, "logps_train/rejected": -188.17437744140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2563962936401367, "rewards_train/margins": 4.628278732299805, "rewards_train/rejected": -6.884675025939941, "step": 230 }, { "epoch": 2.64, "learning_rate": 1.7146800142111535e-06, "loss": 0.0496, "step": 231 }, { "epoch": 2.64, "logps_train/chosen": -171.4075469970703, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -121.5, "logps_train/rejected": -183.84478759765625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.4142889976501465, "rewards_train/margins": 3.8182373046875, "rewards_train/rejected": -6.2325263023376465, "step": 231 }, { "epoch": 2.64, "logps_train/chosen": -164.04263305664062, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -173.68544006347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1521155834198, "rewards_train/margins": 4.001340627670288, "rewards_train/rejected": -6.153456211090088, "step": 231 }, { "epoch": 2.64, "logps_train/chosen": -179.4583282470703, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -120.5, "logps_train/rejected": -180.95101928710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6806962490081787, "rewards_train/margins": 4.379248857498169, "rewards_train/rejected": -6.059945106506348, "step": 231 }, { "epoch": 2.64, "logps_train/chosen": -181.32591247558594, "logps_train/ref_chosen": -160.0, "logps_train/ref_rejected": -135.0, "logps_train/rejected": -201.79678344726562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1640372276306152, "rewards_train/margins": 4.5218915939331055, "rewards_train/rejected": -6.685928821563721, "step": 231 }, { "epoch": 2.65, "learning_rate": 1.6034910935789627e-06, "loss": 0.0576, "step": 232 }, { "epoch": 2.65, "logps_train/chosen": -177.09873962402344, "logps_train/ref_chosen": -157.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -197.26693725585938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.002061605453491, "rewards_train/margins": 4.639084577560425, "rewards_train/rejected": -6.641146183013916, "step": 232 }, { "epoch": 2.65, "logps_train/chosen": -167.2610321044922, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -206.99166870117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.846024990081787, "rewards_train/margins": 5.2769694328308105, "rewards_train/rejected": -7.122994422912598, "step": 232 }, { "epoch": 2.65, "logps_train/chosen": -135.89576721191406, "logps_train/ref_chosen": -107.5, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -161.03927612304688, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -2.8300557136535645, "rewards_train/margins": 3.8118605613708496, "rewards_train/rejected": -6.641916275024414, "step": 232 }, { "epoch": 2.65, "logps_train/chosen": -162.01553344726562, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -202.14938354492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1653234958648682, "rewards_train/margins": 5.1688525676727295, "rewards_train/rejected": -6.334176063537598, "step": 232 }, { "epoch": 2.66, "learning_rate": 1.4959096550229645e-06, "loss": 0.0669, "step": 233 }, { "epoch": 2.66, "logps_train/chosen": -180.497314453125, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -137.0, "logps_train/rejected": -197.39334106445312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7840571403503418, "rewards_train/margins": 4.3093791007995605, "rewards_train/rejected": -6.093436241149902, "step": 233 }, { "epoch": 2.66, "logps_train/chosen": -181.78704833984375, "logps_train/ref_chosen": -161.0, "logps_train/ref_rejected": -119.5, "logps_train/rejected": -179.69802856445312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.05507230758667, "rewards_train/margins": 3.9480795860290527, "rewards_train/rejected": -6.003151893615723, "step": 233 }, { "epoch": 2.66, "logps_train/chosen": -177.9119110107422, "logps_train/ref_chosen": -152.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -173.1656951904297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.5379676818847656, "rewards_train/margins": 4.04774284362793, "rewards_train/rejected": -6.585710525512695, "step": 233 }, { "epoch": 2.66, "logps_train/chosen": -191.3437042236328, "logps_train/ref_chosen": -174.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -195.78623962402344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7578561305999756, "rewards_train/margins": 4.763178110122681, "rewards_train/rejected": -6.521034240722656, "step": 233 }, { "epoch": 2.67, "learning_rate": 1.391952286407311e-06, "loss": 0.0564, "step": 234 }, { "epoch": 2.67, "logps_train/chosen": -164.625244140625, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -191.79592895507812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1351804733276367, "rewards_train/margins": 4.474490165710449, "rewards_train/rejected": -6.609670639038086, "step": 234 }, { "epoch": 2.67, "logps_train/chosen": -192.8685302734375, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -180.1429443359375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.744861364364624, "rewards_train/margins": 4.417285203933716, "rewards_train/rejected": -6.16214656829834, "step": 234 }, { "epoch": 2.67, "logps_train/chosen": -172.70458984375, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -175.58883666992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7502193450927734, "rewards_train/margins": 4.853781700134277, "rewards_train/rejected": -6.604001045227051, "step": 234 }, { "epoch": 2.67, "logps_train/chosen": -179.72857666015625, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -183.2381134033203, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.149763584136963, "rewards_train/margins": 4.196936130523682, "rewards_train/rejected": -6.3466997146606445, "step": 234 }, { "epoch": 2.69, "learning_rate": 1.2916350168047681e-06, "loss": 0.0664, "step": 235 }, { "epoch": 2.69, "logps_train/chosen": -172.16232299804688, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -124.0, "logps_train/rejected": -190.31521606445312, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.4478726387023926, "rewards_train/margins": 4.224077224731445, "rewards_train/rejected": -6.671949863433838, "step": 235 }, { "epoch": 2.69, "logps_train/chosen": -183.98605346679688, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -184.5833740234375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.0531957149505615, "rewards_train/margins": 4.704164743423462, "rewards_train/rejected": -6.757360458374023, "step": 235 }, { "epoch": 2.69, "logps_train/chosen": -150.15863037109375, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -99.5, "logps_train/rejected": -157.22695922851562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0300607681274414, "rewards_train/margins": 3.7460880279541016, "rewards_train/rejected": -5.776148796081543, "step": 235 }, { "epoch": 2.69, "logps_train/chosen": -198.83621215820312, "logps_train/ref_chosen": -183.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -208.21908569335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5560832023620605, "rewards_train/margins": 5.394535064697266, "rewards_train/rejected": -6.950618267059326, "step": 235 }, { "epoch": 2.7, "learning_rate": 1.1949733140252466e-06, "loss": 0.0635, "step": 236 }, { "epoch": 2.7, "logps_train/chosen": -161.04502868652344, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -163.6453857421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8402457237243652, "rewards_train/margins": 4.072827339172363, "rewards_train/rejected": -5.9130730628967285, "step": 236 }, { "epoch": 2.7, "logps_train/chosen": -184.62203979492188, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -143.0, "logps_train/rejected": -216.21878051757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.894137382507324, "rewards_train/margins": 4.415827751159668, "rewards_train/rejected": -7.309965133666992, "step": 236 }, { "epoch": 2.7, "logps_train/chosen": -194.06765747070312, "logps_train/ref_chosen": -182.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -192.44342041015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2655562162399292, "rewards_train/margins": 4.643270134925842, "rewards_train/rejected": -5.9088263511657715, "step": 236 }, { "epoch": 2.7, "logps_train/chosen": -214.92184448242188, "logps_train/ref_chosen": -190.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -202.62722778320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.510640859603882, "rewards_train/margins": 3.877863645553589, "rewards_train/rejected": -6.388504505157471, "step": 236 }, { "epoch": 2.71, "learning_rate": 1.1019820822307985e-06, "loss": 0.0649, "step": 237 }, { "epoch": 2.71, "logps_train/chosen": -126.3829574584961, "logps_train/ref_chosen": -105.5, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -177.629638671875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.102309465408325, "rewards_train/margins": 4.272714376449585, "rewards_train/rejected": -6.37502384185791, "step": 237 }, { "epoch": 2.71, "logps_train/chosen": -138.21041870117188, "logps_train/ref_chosen": -118.0, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -169.5449981689453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0431365966796875, "rewards_train/margins": 3.923570156097412, "rewards_train/rejected": -5.9667067527771, "step": 237 }, { "epoch": 2.71, "logps_train/chosen": -161.8151092529297, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -169.7493438720703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1523118019104004, "rewards_train/margins": 4.343618869781494, "rewards_train/rejected": -6.4959306716918945, "step": 237 }, { "epoch": 2.71, "logps_train/chosen": -216.87832641601562, "logps_train/ref_chosen": -204.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -197.9646453857422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3093161582946777, "rewards_train/margins": 5.389882564544678, "rewards_train/rejected": -6.6991987228393555, "step": 237 }, { "epoch": 2.72, "learning_rate": 1.0126756596375686e-06, "loss": 0.0598, "step": 238 }, { "epoch": 2.72, "logps_train/chosen": -166.60894775390625, "logps_train/ref_chosen": -147.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -183.59124755859375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.9279100894927979, "rewards_train/margins": 4.252699136734009, "rewards_train/rejected": -6.180609226226807, "step": 238 }, { "epoch": 2.72, "logps_train/chosen": -139.39071655273438, "logps_train/ref_chosen": -116.5, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -172.66722106933594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.307724952697754, "rewards_train/margins": 4.875207901000977, "rewards_train/rejected": -7.1829328536987305, "step": 238 }, { "epoch": 2.72, "logps_train/chosen": -171.77474975585938, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -163.83929443359375, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -2.127732276916504, "rewards_train/margins": 3.6769256591796875, "rewards_train/rejected": -5.804657936096191, "step": 238 }, { "epoch": 2.72, "logps_train/chosen": -172.7614288330078, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -157.0, "logps_train/rejected": -227.40777587890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8960649967193604, "rewards_train/margins": 5.144322156906128, "rewards_train/rejected": -7.040387153625488, "step": 238 }, { "epoch": 2.73, "learning_rate": 9.270678163050217e-07, "loss": 0.0751, "step": 239 }, { "epoch": 2.73, "logps_train/chosen": -163.55027770996094, "logps_train/ref_chosen": -137.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -145.41427612304688, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.624851703643799, "rewards_train/margins": 3.6589112281799316, "rewards_train/rejected": -6.2837629318237305, "step": 239 }, { "epoch": 2.73, "logps_train/chosen": -203.0478515625, "logps_train/ref_chosen": -189.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -199.60626220703125, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.3887698650360107, "rewards_train/margins": 5.202104806900024, "rewards_train/rejected": -6.590874671936035, "step": 239 }, { "epoch": 2.73, "logps_train/chosen": -178.8606719970703, "logps_train/ref_chosen": -158.0, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -203.78274536132812, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.051692008972168, "rewards_train/margins": 5.257539749145508, "rewards_train/rejected": -7.309231758117676, "step": 239 }, { "epoch": 2.73, "logps_train/chosen": -162.5720977783203, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -175.08572387695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2999839782714844, "rewards_train/margins": 4.601556777954102, "rewards_train/rejected": -6.901540756225586, "step": 239 }, { "epoch": 2.74, "learning_rate": 8.451717520127273e-07, "loss": 0.0755, "step": 240 }, { "epoch": 2.74, "logps_train/chosen": -180.50900268554688, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -180.9222412109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7348852157592773, "rewards_train/margins": 4.948159217834473, "rewards_train/rejected": -6.68304443359375, "step": 240 }, { "epoch": 2.74, "logps_train/chosen": -189.4424285888672, "logps_train/ref_chosen": -167.0, "logps_train/ref_rejected": -140.0, "logps_train/rejected": -208.51254272460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2249059677124023, "rewards_train/margins": 4.5786919593811035, "rewards_train/rejected": -6.803597927093506, "step": 240 }, { "epoch": 2.74, "logps_train/chosen": -154.6390380859375, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -124.5, "logps_train/rejected": -186.6710205078125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.6452138423919678, "rewards_train/margins": 4.573157072067261, "rewards_train/rejected": -6.2183709144592285, "step": 240 }, { "epoch": 2.74, "logps_train/chosen": -167.7239990234375, "logps_train/ref_chosen": -143.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -192.18130493164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.4205942153930664, "rewards_train/margins": 5.258059501647949, "rewards_train/rejected": -7.678653717041016, "step": 240 }, { "epoch": 2.75, "learning_rate": 7.670000942251287e-07, "loss": 0.0483, "step": 241 }, { "epoch": 2.75, "logps_train/chosen": -177.83676147460938, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -197.16542053222656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9257657527923584, "rewards_train/margins": 4.465288400650024, "rewards_train/rejected": -6.391054153442383, "step": 241 }, { "epoch": 2.75, "logps_train/chosen": -186.260498046875, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -185.99197387695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.358275890350342, "rewards_train/margins": 4.20889139175415, "rewards_train/rejected": -6.567167282104492, "step": 241 }, { "epoch": 2.75, "logps_train/chosen": -159.35272216796875, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -167.78614807128906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.3801445960998535, "rewards_train/margins": 4.245735168457031, "rewards_train/rejected": -6.625879764556885, "step": 241 }, { "epoch": 2.75, "logps_train/chosen": -162.38101196289062, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -118.0, "logps_train/rejected": -181.56993103027344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7641749382019043, "rewards_train/margins": 4.599165439605713, "rewards_train/rejected": -6.363340377807617, "step": 241 }, { "epoch": 2.77, "learning_rate": 6.92564896144493e-07, "loss": 0.0347, "step": 242 }, { "epoch": 2.77, "logps_train/chosen": -169.59744262695312, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -183.14720153808594, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.599441409111023, "rewards_train/margins": 4.840278744697571, "rewards_train/rejected": -6.439720153808594, "step": 242 }, { "epoch": 2.77, "logps_train/chosen": -156.12826538085938, "logps_train/ref_chosen": -133.0, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -175.92604064941406, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.272787570953369, "rewards_train/margins": 4.405375957489014, "rewards_train/rejected": -6.678163528442383, "step": 242 }, { "epoch": 2.77, "logps_train/chosen": -196.60855102539062, "logps_train/ref_chosen": -170.0, "logps_train/ref_rejected": -127.0, "logps_train/rejected": -193.12551879882812, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.7350728511810303, "rewards_train/margins": 3.8505260944366455, "rewards_train/rejected": -6.585598945617676, "step": 242 }, { "epoch": 2.77, "logps_train/chosen": -161.61679077148438, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -195.109130859375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.331406354904175, "rewards_train/margins": 4.148061990737915, "rewards_train/rejected": -6.47946834564209, "step": 242 }, { "epoch": 2.78, "learning_rate": 6.218776348524663e-07, "loss": 0.0655, "step": 243 }, { "epoch": 2.78, "logps_train/chosen": -182.87452697753906, "logps_train/ref_chosen": -163.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -194.0487060546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9816908836364746, "rewards_train/margins": 5.13365364074707, "rewards_train/rejected": -7.115344524383545, "step": 243 }, { "epoch": 2.78, "logps_train/chosen": -156.08993530273438, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -178.7113037109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.6412432193756104, "rewards_train/margins": 4.133403539657593, "rewards_train/rejected": -6.774646759033203, "step": 243 }, { "epoch": 2.78, "logps_train/chosen": -211.40530395507812, "logps_train/ref_chosen": -194.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -185.21580505371094, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.7184962034225464, "rewards_train/margins": 4.194722294807434, "rewards_train/rejected": -5.9132184982299805, "step": 243 }, { "epoch": 2.78, "logps_train/chosen": -179.36248779296875, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -175.9487762451172, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.673431396484375, "rewards_train/margins": 4.250351905822754, "rewards_train/rejected": -6.923783302307129, "step": 243 }, { "epoch": 2.79, "learning_rate": 5.549492095404202e-07, "loss": 0.0632, "step": 244 }, { "epoch": 2.79, "logps_train/chosen": -172.94845581054688, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -193.4479217529297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8069548606872559, "rewards_train/margins": 5.433931350708008, "rewards_train/rejected": -7.240886211395264, "step": 244 }, { "epoch": 2.79, "logps_train/chosen": -161.23968505859375, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -179.7657470703125, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.9360785484313965, "rewards_train/margins": 3.958073139190674, "rewards_train/rejected": -5.89415168762207, "step": 244 }, { "epoch": 2.79, "logps_train/chosen": -160.264892578125, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -171.85986328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.4065914154052734, "rewards_train/margins": 4.195216178894043, "rewards_train/rejected": -6.601807594299316, "step": 244 }, { "epoch": 2.79, "logps_train/chosen": -235.1175079345703, "logps_train/ref_chosen": -219.0, "logps_train/ref_rejected": -176.0, "logps_train/rejected": -239.72584533691406, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.5730059146881104, "rewards_train/margins": 4.821647882461548, "rewards_train/rejected": -6.394653797149658, "step": 244 }, { "epoch": 2.8, "learning_rate": 4.917899398289377e-07, "loss": 0.0628, "step": 245 }, { "epoch": 2.8, "logps_train/chosen": -171.4566650390625, "logps_train/ref_chosen": -150.0, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -161.31747436523438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.139416217803955, "rewards_train/margins": 3.9931116104125977, "rewards_train/rejected": -6.132527828216553, "step": 245 }, { "epoch": 2.8, "logps_train/chosen": -160.1334228515625, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -197.39840698242188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1715452671051025, "rewards_train/margins": 4.618248701095581, "rewards_train/rejected": -6.789793968200684, "step": 245 }, { "epoch": 2.8, "logps_train/chosen": -176.39114379882812, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -195.78604125976562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1625027656555176, "rewards_train/margins": 4.600754737854004, "rewards_train/rejected": -6.7632575035095215, "step": 245 }, { "epoch": 2.8, "logps_train/chosen": -156.68441772460938, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -160.67849731445312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8903658390045166, "rewards_train/margins": 4.183562994003296, "rewards_train/rejected": -6.0739288330078125, "step": 245 }, { "epoch": 2.81, "learning_rate": 4.324095641766168e-07, "loss": 0.0534, "step": 246 }, { "epoch": 2.81, "logps_train/chosen": -177.7391357421875, "logps_train/ref_chosen": -159.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -191.41738891601562, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.8221569061279297, "rewards_train/margins": 5.090432167053223, "rewards_train/rejected": -6.912589073181152, "step": 246 }, { "epoch": 2.81, "logps_train/chosen": -133.89585876464844, "logps_train/ref_chosen": -111.5, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -163.14993286132812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2561392784118652, "rewards_train/margins": 3.9409842491149902, "rewards_train/rejected": -6.1971235275268555, "step": 246 }, { "epoch": 2.81, "logps_train/chosen": -184.8391876220703, "logps_train/ref_chosen": -168.0, "logps_train/ref_rejected": -164.0, "logps_train/rejected": -227.5182342529297, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.6263017654418945, "rewards_train/margins": 4.725424289703369, "rewards_train/rejected": -6.351726055145264, "step": 246 }, { "epoch": 2.81, "logps_train/chosen": -177.89773559570312, "logps_train/ref_chosen": -157.0, "logps_train/ref_rejected": -138.0, "logps_train/rejected": -202.14389038085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0483689308166504, "rewards_train/margins": 4.4206223487854, "rewards_train/rejected": -6.468991279602051, "step": 246 }, { "epoch": 2.82, "learning_rate": 3.768172383785268e-07, "loss": 0.0606, "step": 247 }, { "epoch": 2.82, "logps_train/chosen": -166.12185668945312, "logps_train/ref_chosen": -145.0, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -160.4178466796875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.0884547233581543, "rewards_train/margins": 3.920907974243164, "rewards_train/rejected": -6.009362697601318, "step": 247 }, { "epoch": 2.82, "logps_train/chosen": -172.78741455078125, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -119.0, "logps_train/rejected": -189.86375427246094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.8326473236083984, "rewards_train/margins": 4.249040126800537, "rewards_train/rejected": -7.0816874504089355, "step": 247 }, { "epoch": 2.82, "logps_train/chosen": -185.13803100585938, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -178.22286987304688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.280600070953369, "rewards_train/margins": 4.309672832489014, "rewards_train/rejected": -6.590272903442383, "step": 247 }, { "epoch": 2.82, "logps_train/chosen": -153.21363830566406, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -171.15322875976562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.703980803489685, "rewards_train/margins": 4.131362557411194, "rewards_train/rejected": -5.835343360900879, "step": 247 }, { "epoch": 2.83, "learning_rate": 3.2502153415447656e-07, "loss": 0.0635, "step": 248 }, { "epoch": 2.83, "logps_train/chosen": -163.0394744873047, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -140.48609924316406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.749880313873291, "rewards_train/margins": 3.7217764854431152, "rewards_train/rejected": -6.471656799316406, "step": 248 }, { "epoch": 2.83, "logps_train/chosen": -166.8461151123047, "logps_train/ref_chosen": -141.0, "logps_train/ref_rejected": -129.0, "logps_train/rejected": -198.37579345703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.565225601196289, "rewards_train/margins": 4.396816253662109, "rewards_train/rejected": -6.962041854858398, "step": 248 }, { "epoch": 2.83, "logps_train/chosen": -207.8005828857422, "logps_train/ref_chosen": -190.0, "logps_train/ref_rejected": -153.0, "logps_train/rejected": -223.14645385742188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7702926397323608, "rewards_train/margins": 5.2681804895401, "rewards_train/rejected": -7.038473129272461, "step": 248 }, { "epoch": 2.83, "logps_train/chosen": -154.43231201171875, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -154.94419860839844, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.8259937763214111, "rewards_train/margins": 4.078387022018433, "rewards_train/rejected": -5.904380798339844, "step": 248 }, { "epoch": 2.85, "learning_rate": 2.770304378273553e-07, "loss": 0.0647, "step": 249 }, { "epoch": 2.85, "logps_train/chosen": -145.6243438720703, "logps_train/ref_chosen": -127.5, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -160.12098693847656, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.8032550811767578, "rewards_train/margins": 3.8635311126708984, "rewards_train/rejected": -5.666786193847656, "step": 249 }, { "epoch": 2.85, "logps_train/chosen": -179.10397338867188, "logps_train/ref_chosen": -157.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -195.87139892578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2270493507385254, "rewards_train/margins": 4.725861549377441, "rewards_train/rejected": -6.952910900115967, "step": 249 }, { "epoch": 2.85, "logps_train/chosen": -160.42538452148438, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -172.30838012695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.4366796016693115, "rewards_train/margins": 4.239910364151001, "rewards_train/rejected": -6.6765899658203125, "step": 249 }, { "epoch": 2.85, "logps_train/chosen": -163.18991088867188, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -120.5, "logps_train/rejected": -182.28509521484375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.6743369102478027, "rewards_train/margins": 4.484737873077393, "rewards_train/rejected": -6.159074783325195, "step": 249 }, { "epoch": 2.86, "learning_rate": 2.3285134909173112e-07, "loss": 0.0667, "step": 250 }, { "epoch": 2.86, "logps_train/chosen": -194.73187255859375, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -118.0, "logps_train/rejected": -188.28573608398438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9335401058197021, "rewards_train/margins": 5.130775690078735, "rewards_train/rejected": -7.0643157958984375, "step": 250 }, { "epoch": 2.86, "logps_train/chosen": -167.18817138671875, "logps_train/ref_chosen": -142.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -181.6185302734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.518524169921875, "rewards_train/margins": 4.051629066467285, "rewards_train/rejected": -6.57015323638916, "step": 250 }, { "epoch": 2.86, "logps_train/chosen": -194.84786987304688, "logps_train/ref_chosen": -176.0, "logps_train/ref_rejected": -162.0, "logps_train/rejected": -225.637451171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.871652364730835, "rewards_train/margins": 4.4969751834869385, "rewards_train/rejected": -6.368627548217773, "step": 250 }, { "epoch": 2.86, "logps_train/chosen": -161.91610717773438, "logps_train/ref_chosen": -140.0, "logps_train/ref_rejected": -124.5, "logps_train/rejected": -195.36077880859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1696693897247314, "rewards_train/margins": 4.929694890975952, "rewards_train/rejected": -7.099364280700684, "step": 250 }, { "epoch": 2.87, "learning_rate": 1.924910798728946e-07, "loss": 0.0365, "step": 251 }, { "epoch": 2.87, "logps_train/chosen": -176.8163299560547, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -183.77633666992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.261936664581299, "rewards_train/margins": 4.28908634185791, "rewards_train/rejected": -6.551023006439209, "step": 251 }, { "epoch": 2.87, "logps_train/chosen": -173.4698486328125, "logps_train/ref_chosen": -157.0, "logps_train/ref_rejected": -122.0, "logps_train/rejected": -184.99172973632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6786491870880127, "rewards_train/margins": 4.63419508934021, "rewards_train/rejected": -6.312844276428223, "step": 251 }, { "epoch": 2.87, "logps_train/chosen": -168.40023803710938, "logps_train/ref_chosen": -146.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -183.943603515625, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -2.20320725440979, "rewards_train/margins": 4.175772428512573, "rewards_train/rejected": -6.378979682922363, "step": 251 }, { "epoch": 2.87, "logps_train/chosen": -184.357666015625, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -185.84925842285156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7965086698532104, "rewards_train/margins": 4.701600193977356, "rewards_train/rejected": -6.498108863830566, "step": 251 }, { "epoch": 2.88, "learning_rate": 1.559558532765404e-07, "loss": 0.0579, "step": 252 }, { "epoch": 2.88, "logps_train/chosen": -133.98130798339844, "logps_train/ref_chosen": -114.0, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -166.95147705078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0248889923095703, "rewards_train/margins": 4.195259094238281, "rewards_train/rejected": -6.220148086547852, "step": 252 }, { "epoch": 2.88, "logps_train/chosen": -198.66932678222656, "logps_train/ref_chosen": -179.0, "logps_train/ref_rejected": -142.0, "logps_train/rejected": -210.49432373046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9354877471923828, "rewards_train/margins": 4.8838677406311035, "rewards_train/rejected": -6.819355487823486, "step": 252 }, { "epoch": 2.88, "logps_train/chosen": -173.53561401367188, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -179.3866729736328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9231053590774536, "rewards_train/margins": 4.311460375785828, "rewards_train/rejected": -6.234565734863281, "step": 252 }, { "epoch": 2.88, "logps_train/chosen": -192.16253662109375, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -188.32643127441406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.32460355758667, "rewards_train/margins": 4.6487135887146, "rewards_train/rejected": -6.9733171463012695, "step": 252 }, { "epoch": 2.89, "learning_rate": 1.23251302629232e-07, "loss": 0.0477, "step": 253 }, { "epoch": 2.89, "logps_train/chosen": -164.695068359375, "logps_train/ref_chosen": -149.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -183.24545288085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.560620665550232, "rewards_train/margins": 4.226998925209045, "rewards_train/rejected": -5.787619590759277, "step": 253 }, { "epoch": 2.89, "logps_train/chosen": -164.260986328125, "logps_train/ref_chosen": -139.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -173.06460571289062, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.5073485374450684, "rewards_train/margins": 4.087198734283447, "rewards_train/rejected": -6.594547271728516, "step": 253 }, { "epoch": 2.89, "logps_train/chosen": -151.60467529296875, "logps_train/ref_chosen": -128.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -179.28170776367188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.3181824684143066, "rewards_train/margins": 3.9910435676574707, "rewards_train/rejected": -6.309226036071777, "step": 253 }, { "epoch": 2.89, "logps_train/chosen": -191.03347778320312, "logps_train/ref_chosen": -166.0, "logps_train/ref_rejected": -124.0, "logps_train/rejected": -191.88462829589844, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.450467109680176, "rewards_train/margins": 4.3493733406066895, "rewards_train/rejected": -6.799840450286865, "step": 253 }, { "epoch": 2.9, "learning_rate": 9.438247060979955e-08, "loss": 0.0777, "step": 254 }, { "epoch": 2.9, "logps_train/chosen": -172.36068725585938, "logps_train/ref_chosen": -153.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -175.3718719482422, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.9357753992080688, "rewards_train/margins": 4.9021934270858765, "rewards_train/rejected": -6.837968826293945, "step": 254 }, { "epoch": 2.9, "logps_train/chosen": -179.45162963867188, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -117.5, "logps_train/rejected": -178.16188049316406, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.3817834854125977, "rewards_train/margins": 3.6969776153564453, "rewards_train/rejected": -6.078761100769043, "step": 254 }, { "epoch": 2.9, "logps_train/chosen": -167.01654052734375, "logps_train/ref_chosen": -141.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -179.34791564941406, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -2.5837817192077637, "rewards_train/margins": 4.215170383453369, "rewards_train/rejected": -6.798952102661133, "step": 254 }, { "epoch": 2.9, "logps_train/chosen": -170.49327087402344, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -165.78018188476562, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -2.2194879055023193, "rewards_train/margins": 4.055551767349243, "rewards_train/rejected": -6.2750396728515625, "step": 254 }, { "epoch": 2.91, "learning_rate": 6.935380847182815e-08, "loss": 0.0963, "step": 255 }, { "epoch": 2.91, "logps_train/chosen": -184.44064331054688, "logps_train/ref_chosen": -162.0, "logps_train/ref_rejected": -143.0, "logps_train/rejected": -211.0648956298828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2682833671569824, "rewards_train/margins": 4.527073860168457, "rewards_train/rejected": -6.7953572273254395, "step": 255 }, { "epoch": 2.91, "logps_train/chosen": -171.735595703125, "logps_train/ref_chosen": -148.0, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -186.7560272216797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.400291919708252, "rewards_train/margins": 4.154388904571533, "rewards_train/rejected": -6.554680824279785, "step": 255 }, { "epoch": 2.91, "logps_train/chosen": -175.62689208984375, "logps_train/ref_chosen": -151.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -202.15704345703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.4636664390563965, "rewards_train/margins": 4.4941277503967285, "rewards_train/rejected": -6.957794189453125, "step": 255 }, { "epoch": 2.91, "logps_train/chosen": -186.03994750976562, "logps_train/ref_chosen": -169.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -196.6202392578125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.696962833404541, "rewards_train/margins": 4.931369781494141, "rewards_train/rejected": -6.628332614898682, "step": 255 }, { "epoch": 2.93, "learning_rate": 4.816917535731547e-08, "loss": 0.0631, "step": 256 }, { "epoch": 2.93, "logps_train/chosen": -153.82058715820312, "logps_train/ref_chosen": -135.0, "logps_train/ref_rejected": -124.5, "logps_train/rejected": -184.61001586914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.905739426612854, "rewards_train/margins": 4.107263922691345, "rewards_train/rejected": -6.013003349304199, "step": 256 }, { "epoch": 2.93, "logps_train/chosen": -152.47775268554688, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -147.11019897460938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.2787318229675293, "rewards_train/margins": 3.5291028022766113, "rewards_train/rejected": -5.807834625244141, "step": 256 }, { "epoch": 2.93, "logps_train/chosen": -163.71652221679688, "logps_train/ref_chosen": -138.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -201.04147338867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.5193088054656982, "rewards_train/margins": 4.136402368545532, "rewards_train/rejected": -6.6557111740112305, "step": 256 }, { "epoch": 2.93, "logps_train/chosen": -191.13845825195312, "logps_train/ref_chosen": -172.0, "logps_train/ref_rejected": -122.5, "logps_train/rejected": -182.69961547851562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8818151950836182, "rewards_train/margins": 4.146349668502808, "rewards_train/rejected": -6.028164863586426, "step": 256 }, { "epoch": 2.94, "learning_rate": 3.083183770162812e-08, "loss": 0.0762, "step": 257 }, { "epoch": 2.94, "logps_train/chosen": -165.29769897460938, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -193.4224395751953, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.089618444442749, "rewards_train/margins": 3.9063212871551514, "rewards_train/rejected": -5.9959397315979, "step": 257 }, { "epoch": 2.94, "logps_train/chosen": -175.6675567626953, "logps_train/ref_chosen": -154.0, "logps_train/ref_rejected": -142.0, "logps_train/rejected": -212.16360473632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1210525035858154, "rewards_train/margins": 4.94413685798645, "rewards_train/rejected": -7.065189361572266, "step": 257 }, { "epoch": 2.94, "logps_train/chosen": -146.52691650390625, "logps_train/ref_chosen": -129.0, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -156.18231201171875, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.743121862411499, "rewards_train/margins": 3.6365344524383545, "rewards_train/rejected": -5.3796563148498535, "step": 257 }, { "epoch": 2.94, "logps_train/chosen": -187.83929443359375, "logps_train/ref_chosen": -167.0, "logps_train/ref_rejected": -122.5, "logps_train/rejected": -187.14642333984375, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.0775322914123535, "rewards_train/margins": 4.39111328125, "rewards_train/rejected": -6.4686455726623535, "step": 257 }, { "epoch": 2.95, "learning_rate": 1.73444687298685e-08, "loss": 0.077, "step": 258 }, { "epoch": 2.95, "logps_train/chosen": -177.5012969970703, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -168.67471313476562, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.313605308532715, "rewards_train/margins": 4.108870983123779, "rewards_train/rejected": -6.422476291656494, "step": 258 }, { "epoch": 2.95, "logps_train/chosen": -150.55606079101562, "logps_train/ref_chosen": -134.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -195.2097930908203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6292637586593628, "rewards_train/margins": 4.666714787483215, "rewards_train/rejected": -6.295978546142578, "step": 258 }, { "epoch": 2.95, "logps_train/chosen": -166.3050537109375, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -125.0, "logps_train/rejected": -192.82681274414062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.15824031829834, "rewards_train/margins": 4.585964679718018, "rewards_train/rejected": -6.744204998016357, "step": 258 }, { "epoch": 2.95, "logps_train/chosen": -156.59942626953125, "logps_train/ref_chosen": -133.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -170.30538940429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.3700997829437256, "rewards_train/margins": 4.345986604690552, "rewards_train/rejected": -6.716086387634277, "step": 258 }, { "epoch": 2.96, "learning_rate": 7.709148044679481e-09, "loss": 0.0518, "step": 259 }, { "epoch": 2.96, "logps_train/chosen": -173.42506408691406, "logps_train/ref_chosen": -155.0, "logps_train/ref_rejected": -125.5, "logps_train/rejected": -185.6907196044922, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": -1.8300071954727173, "rewards_train/margins": 4.202877402305603, "rewards_train/rejected": -6.03288459777832, "step": 259 }, { "epoch": 2.96, "logps_train/chosen": -176.98544311523438, "logps_train/ref_chosen": -156.0, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -179.13150024414062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.050905704498291, "rewards_train/margins": 4.533289432525635, "rewards_train/rejected": -6.584195137023926, "step": 259 }, { "epoch": 2.96, "logps_train/chosen": -193.09963989257812, "logps_train/ref_chosen": -171.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -172.74990844726562, "rewards_train/accuracies": 0.90625, "rewards_train/chosen": -2.1755151748657227, "rewards_train/margins": 3.523548126220703, "rewards_train/rejected": -5.699063301086426, "step": 259 }, { "epoch": 2.96, "logps_train/chosen": -161.4818115234375, "logps_train/ref_chosen": -136.0, "logps_train/ref_rejected": -126.0, "logps_train/rejected": -198.05450439453125, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.5415897369384766, "rewards_train/margins": 4.642866134643555, "rewards_train/rejected": -7.184455871582031, "step": 259 }, { "epoch": 2.97, "learning_rate": 1.9273613056008944e-09, "loss": 0.0788, "step": 260 }, { "epoch": 2.97, "logps_train/chosen": -185.74722290039062, "logps_train/ref_chosen": -164.0, "logps_train/ref_rejected": -123.0, "logps_train/rejected": -188.15093994140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.189370632171631, "rewards_train/margins": 4.313614845275879, "rewards_train/rejected": -6.50298547744751, "step": 260 }, { "epoch": 2.97, "logps_train/chosen": -200.0399932861328, "logps_train/ref_chosen": -180.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -180.8087615966797, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -2.0358355045318604, "rewards_train/margins": 4.2098352909088135, "rewards_train/rejected": -6.245670795440674, "step": 260 }, { "epoch": 2.97, "logps_train/chosen": -152.52565002441406, "logps_train/ref_chosen": -133.0, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -153.40609741210938, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": -1.9625990390777588, "rewards_train/margins": 3.9296958446502686, "rewards_train/rejected": -5.892294883728027, "step": 260 }, { "epoch": 2.97, "logps_train/chosen": -162.84246826171875, "logps_train/ref_chosen": -144.0, "logps_train/ref_rejected": -133.0, "logps_train/rejected": -196.57708740234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8697924613952637, "rewards_train/margins": 4.492800235748291, "rewards_train/rejected": -6.362592697143555, "step": 260 }, { "epoch": 2.98, "learning_rate": 0.0, "loss": 0.06, "step": 261 }, { "epoch": 2.98, "step": 261, "total_flos": 0.0, "train_loss": 0.26605752715662523, "train_runtime": 15002.1853, "train_samples_per_second": 2.236, "train_steps_per_second": 0.017 } ], "max_steps": 261, "num_train_epochs": 3, "total_flos": 0.0, "trial_name": null, "trial_params": null }