BraylonDash's picture
Model save
e460f00 verified
raw
history blame contribute delete
No virus
105 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997382884061764,
"eval_steps": 100,
"global_step": 1910,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 2.617801047120419e-08,
"logits/chosen": -0.22574472427368164,
"logits/rejected": -0.2384113073348999,
"logps/chosen": -1586.180908203125,
"logps/rejected": -1626.5421142578125,
"loss": 0.0638,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 2.617801047120419e-07,
"logits/chosen": -0.1639188826084137,
"logits/rejected": -0.1851254105567932,
"logps/chosen": -2052.12841796875,
"logps/rejected": -1800.1533203125,
"loss": 0.0588,
"rewards/accuracies": 0.4513888955116272,
"rewards/chosen": 6.274010956985876e-05,
"rewards/margins": -1.1924101272597909e-05,
"rewards/rejected": 7.466421811841428e-05,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 5.235602094240838e-07,
"logits/chosen": -0.21358470618724823,
"logits/rejected": -0.1908903419971466,
"logps/chosen": -2196.85498046875,
"logps/rejected": -1773.3756103515625,
"loss": 0.0627,
"rewards/accuracies": 0.3812499940395355,
"rewards/chosen": 0.00044371531112119555,
"rewards/margins": 9.080490417545661e-05,
"rewards/rejected": 0.00035291039966978133,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 7.853403141361258e-07,
"logits/chosen": -0.2191818505525589,
"logits/rejected": -0.22062306106090546,
"logps/chosen": -2141.364501953125,
"logps/rejected": -1710.662353515625,
"loss": 0.0522,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0025672917254269123,
"rewards/margins": 0.0005076726665720344,
"rewards/rejected": 0.0020596194081008434,
"step": 30
},
{
"epoch": 0.02,
"learning_rate": 1.0471204188481676e-06,
"logits/chosen": -0.2520692050457001,
"logits/rejected": -0.22583802044391632,
"logps/chosen": -2189.7646484375,
"logps/rejected": -1715.2425537109375,
"loss": 0.0495,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.00766522204503417,
"rewards/margins": 0.0016571322921663523,
"rewards/rejected": 0.0060080899856984615,
"step": 40
},
{
"epoch": 0.03,
"learning_rate": 1.3089005235602096e-06,
"logits/chosen": -0.17123639583587646,
"logits/rejected": -0.19555726647377014,
"logps/chosen": -2526.5703125,
"logps/rejected": -2165.141845703125,
"loss": 0.0538,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.02015666291117668,
"rewards/margins": 0.0033235768787562847,
"rewards/rejected": 0.01683308556675911,
"step": 50
},
{
"epoch": 0.03,
"learning_rate": 1.5706806282722515e-06,
"logits/chosen": -0.18598869442939758,
"logits/rejected": -0.20677652955055237,
"logps/chosen": -2151.3115234375,
"logps/rejected": -1970.6624755859375,
"loss": 0.0505,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": 0.029178302735090256,
"rewards/margins": 0.0026255736593157053,
"rewards/rejected": 0.026552731171250343,
"step": 60
},
{
"epoch": 0.04,
"learning_rate": 1.8324607329842933e-06,
"logits/chosen": -0.18310071527957916,
"logits/rejected": -0.20503754913806915,
"logps/chosen": -1844.6480712890625,
"logps/rejected": -1762.2308349609375,
"loss": 0.056,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.028074929490685463,
"rewards/margins": 0.001591854146681726,
"rewards/rejected": 0.026483073830604553,
"step": 70
},
{
"epoch": 0.04,
"learning_rate": 2.094240837696335e-06,
"logits/chosen": -0.22824080288410187,
"logits/rejected": -0.24587313830852509,
"logps/chosen": -1901.586181640625,
"logps/rejected": -1624.0626220703125,
"loss": 0.064,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.031114792451262474,
"rewards/margins": 0.005912109278142452,
"rewards/rejected": 0.025202685967087746,
"step": 80
},
{
"epoch": 0.05,
"learning_rate": 2.356020942408377e-06,
"logits/chosen": -0.2366272509098053,
"logits/rejected": -0.22877153754234314,
"logps/chosen": -1691.4013671875,
"logps/rejected": -1524.5679931640625,
"loss": 0.0481,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.035714153200387955,
"rewards/margins": 0.0030426979064941406,
"rewards/rejected": 0.032671455293893814,
"step": 90
},
{
"epoch": 0.05,
"learning_rate": 2.617801047120419e-06,
"logits/chosen": -0.22739839553833008,
"logits/rejected": -0.24034900963306427,
"logps/chosen": -2141.99365234375,
"logps/rejected": -2006.7513427734375,
"loss": 0.0519,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.038611847907304764,
"rewards/margins": 0.0053280796855688095,
"rewards/rejected": 0.033283766359090805,
"step": 100
},
{
"epoch": 0.05,
"eval_logits/chosen": -0.25320005416870117,
"eval_logits/rejected": -0.25199252367019653,
"eval_logps/chosen": -2183.76953125,
"eval_logps/rejected": -1849.702880859375,
"eval_loss": 0.052377186715602875,
"eval_rewards/accuracies": 0.5254999995231628,
"eval_rewards/chosen": 0.03263631835579872,
"eval_rewards/margins": 0.00592681672424078,
"eval_rewards/rejected": 0.026709498837590218,
"eval_runtime": 510.4972,
"eval_samples_per_second": 3.918,
"eval_steps_per_second": 0.979,
"step": 100
},
{
"epoch": 0.06,
"learning_rate": 2.8795811518324613e-06,
"logits/chosen": -0.2320372760295868,
"logits/rejected": -0.27123022079467773,
"logps/chosen": -1939.3607177734375,
"logps/rejected": -1764.5439453125,
"loss": 0.0502,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": 0.027445796877145767,
"rewards/margins": 0.00373500632122159,
"rewards/rejected": 0.023710791021585464,
"step": 110
},
{
"epoch": 0.06,
"learning_rate": 3.141361256544503e-06,
"logits/chosen": -0.2857373058795929,
"logits/rejected": -0.26925256848335266,
"logps/chosen": -2433.180419921875,
"logps/rejected": -2053.70361328125,
"loss": 0.0785,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.03826409578323364,
"rewards/margins": 0.007337054703384638,
"rewards/rejected": 0.030927041545510292,
"step": 120
},
{
"epoch": 0.07,
"learning_rate": 3.403141361256545e-06,
"logits/chosen": -0.27496081590652466,
"logits/rejected": -0.30028867721557617,
"logps/chosen": -2130.792236328125,
"logps/rejected": -1784.03125,
"loss": 0.0549,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.05773577094078064,
"rewards/margins": 0.011168297380208969,
"rewards/rejected": 0.04656747728586197,
"step": 130
},
{
"epoch": 0.07,
"learning_rate": 3.6649214659685865e-06,
"logits/chosen": -0.31289300322532654,
"logits/rejected": -0.31437715888023376,
"logps/chosen": -2071.06982421875,
"logps/rejected": -1879.8802490234375,
"loss": 0.055,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.07026473432779312,
"rewards/margins": 0.007077778223901987,
"rewards/rejected": 0.06318695098161697,
"step": 140
},
{
"epoch": 0.08,
"learning_rate": 3.926701570680629e-06,
"logits/chosen": -0.29269808530807495,
"logits/rejected": -0.3180951476097107,
"logps/chosen": -2014.0640869140625,
"logps/rejected": -1808.185302734375,
"loss": 0.0543,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.060369331389665604,
"rewards/margins": 0.007043222431093454,
"rewards/rejected": 0.05332610756158829,
"step": 150
},
{
"epoch": 0.08,
"learning_rate": 4.18848167539267e-06,
"logits/chosen": -0.2773135304450989,
"logits/rejected": -0.2673946022987366,
"logps/chosen": -2283.48779296875,
"logps/rejected": -1938.6422119140625,
"loss": 0.0524,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.05114240199327469,
"rewards/margins": 0.008266921155154705,
"rewards/rejected": 0.04287547618150711,
"step": 160
},
{
"epoch": 0.09,
"learning_rate": 4.450261780104713e-06,
"logits/chosen": -0.2700185179710388,
"logits/rejected": -0.26662972569465637,
"logps/chosen": -2404.58984375,
"logps/rejected": -1977.1859130859375,
"loss": 0.0624,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.05304870009422302,
"rewards/margins": 0.011285845190286636,
"rewards/rejected": 0.04176285117864609,
"step": 170
},
{
"epoch": 0.09,
"learning_rate": 4.712041884816754e-06,
"logits/chosen": -0.2975671887397766,
"logits/rejected": -0.2988983690738678,
"logps/chosen": -2047.671630859375,
"logps/rejected": -1742.282470703125,
"loss": 0.0418,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.05823253467679024,
"rewards/margins": 0.01046661101281643,
"rewards/rejected": 0.04776592180132866,
"step": 180
},
{
"epoch": 0.1,
"learning_rate": 4.9738219895287965e-06,
"logits/chosen": -0.2745932936668396,
"logits/rejected": -0.2855191230773926,
"logps/chosen": -2184.26220703125,
"logps/rejected": -1788.6656494140625,
"loss": 0.0408,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.06112230569124222,
"rewards/margins": 0.012786999344825745,
"rewards/rejected": 0.04833530634641647,
"step": 190
},
{
"epoch": 0.1,
"learning_rate": 4.999661831436499e-06,
"logits/chosen": -0.27325528860092163,
"logits/rejected": -0.2756146490573883,
"logps/chosen": -2187.59130859375,
"logps/rejected": -2025.250732421875,
"loss": 0.0379,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.05468825250864029,
"rewards/margins": 0.006374381482601166,
"rewards/rejected": 0.04831386357545853,
"step": 200
},
{
"epoch": 0.1,
"eval_logits/chosen": -0.27396515011787415,
"eval_logits/rejected": -0.2760486304759979,
"eval_logps/chosen": -2172.962890625,
"eval_logps/rejected": -1842.2476806640625,
"eval_loss": 0.051403772085905075,
"eval_rewards/accuracies": 0.5389999747276306,
"eval_rewards/chosen": 0.043442659080028534,
"eval_rewards/margins": 0.009277699515223503,
"eval_rewards/rejected": 0.03416495770215988,
"eval_runtime": 510.5925,
"eval_samples_per_second": 3.917,
"eval_steps_per_second": 0.979,
"step": 200
},
{
"epoch": 0.11,
"learning_rate": 4.9984929711403395e-06,
"logits/chosen": -0.24565927684307098,
"logits/rejected": -0.24346761405467987,
"logps/chosen": -2105.339111328125,
"logps/rejected": -1993.477294921875,
"loss": 0.0456,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.03817785158753395,
"rewards/margins": 0.0046168239787220955,
"rewards/rejected": 0.03356102854013443,
"step": 210
},
{
"epoch": 0.12,
"learning_rate": 4.996489634487865e-06,
"logits/chosen": -0.2854730486869812,
"logits/rejected": -0.27373185753822327,
"logps/chosen": -2071.35595703125,
"logps/rejected": -1617.6314697265625,
"loss": 0.0471,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.03769981488585472,
"rewards/margins": 0.012330549769103527,
"rewards/rejected": 0.02536926604807377,
"step": 220
},
{
"epoch": 0.12,
"learning_rate": 4.9936524905772466e-06,
"logits/chosen": -0.2610529661178589,
"logits/rejected": -0.28053849935531616,
"logps/chosen": -1956.2564697265625,
"logps/rejected": -1615.5814208984375,
"loss": 0.0735,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.04565655067563057,
"rewards/margins": 0.011204726994037628,
"rewards/rejected": 0.03445183113217354,
"step": 230
},
{
"epoch": 0.13,
"learning_rate": 4.9899824869915e-06,
"logits/chosen": -0.24108798801898956,
"logits/rejected": -0.2399587333202362,
"logps/chosen": -1775.907470703125,
"logps/rejected": -1713.854736328125,
"loss": 0.0715,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.03222992643713951,
"rewards/margins": 0.008830582723021507,
"rewards/rejected": 0.023399341851472855,
"step": 240
},
{
"epoch": 0.13,
"learning_rate": 4.985480849482012e-06,
"logits/chosen": -0.20024847984313965,
"logits/rejected": -0.22306282818317413,
"logps/chosen": -2255.089599609375,
"logps/rejected": -1934.8642578125,
"loss": 0.0577,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.030694425106048584,
"rewards/margins": 0.01082837488502264,
"rewards/rejected": 0.01986604928970337,
"step": 250
},
{
"epoch": 0.14,
"learning_rate": 4.980149081559142e-06,
"logits/chosen": -0.21732480823993683,
"logits/rejected": -0.24718734622001648,
"logps/chosen": -1957.7998046875,
"logps/rejected": -1881.0550537109375,
"loss": 0.056,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.05905503034591675,
"rewards/margins": 0.0030602319166064262,
"rewards/rejected": 0.05599479004740715,
"step": 260
},
{
"epoch": 0.14,
"learning_rate": 4.9739889639900655e-06,
"logits/chosen": -0.24414131045341492,
"logits/rejected": -0.22118325531482697,
"logps/chosen": -1925.445556640625,
"logps/rejected": -1909.0667724609375,
"loss": 0.0539,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.05300917103886604,
"rewards/margins": 0.006324948277324438,
"rewards/rejected": 0.04668421670794487,
"step": 270
},
{
"epoch": 0.15,
"learning_rate": 4.967002554204009e-06,
"logits/chosen": -0.25582337379455566,
"logits/rejected": -0.2471769154071808,
"logps/chosen": -2269.031982421875,
"logps/rejected": -2033.6907958984375,
"loss": 0.0621,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.01642546057701111,
"rewards/margins": 0.0032355361618101597,
"rewards/rejected": 0.013189923949539661,
"step": 280
},
{
"epoch": 0.15,
"learning_rate": 4.959192185605089e-06,
"logits/chosen": -0.30079659819602966,
"logits/rejected": -0.28022244572639465,
"logps/chosen": -1992.6142578125,
"logps/rejected": -1820.0687255859375,
"loss": 0.0584,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.02403336763381958,
"rewards/margins": 0.00603306433185935,
"rewards/rejected": 0.018000302836298943,
"step": 290
},
{
"epoch": 0.16,
"learning_rate": 4.950560466792969e-06,
"logits/chosen": -0.28634804487228394,
"logits/rejected": -0.2918199896812439,
"logps/chosen": -2390.38623046875,
"logps/rejected": -1984.9703369140625,
"loss": 0.0425,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.035278573632240295,
"rewards/margins": 0.012735734693706036,
"rewards/rejected": 0.022542843595147133,
"step": 300
},
{
"epoch": 0.16,
"eval_logits/chosen": -0.29014500975608826,
"eval_logits/rejected": -0.28990820050239563,
"eval_logps/chosen": -2182.04541015625,
"eval_logps/rejected": -1851.862060546875,
"eval_loss": 0.05131419003009796,
"eval_rewards/accuracies": 0.5630000233650208,
"eval_rewards/chosen": 0.03436028212308884,
"eval_rewards/margins": 0.009809814393520355,
"eval_rewards/rejected": 0.02455046772956848,
"eval_runtime": 510.7215,
"eval_samples_per_second": 3.916,
"eval_steps_per_second": 0.979,
"step": 300
},
{
"epoch": 0.16,
"learning_rate": 4.9411102806916185e-06,
"logits/chosen": -0.2964246869087219,
"logits/rejected": -0.3249427080154419,
"logps/chosen": -2153.874267578125,
"logps/rejected": -1754.1324462890625,
"loss": 0.0521,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.04025361314415932,
"rewards/margins": 0.009616317227482796,
"rewards/rejected": 0.03063729964196682,
"step": 310
},
{
"epoch": 0.17,
"learning_rate": 4.930844783586424e-06,
"logits/chosen": -0.26167505979537964,
"logits/rejected": -0.2782900929450989,
"logps/chosen": -2090.10986328125,
"logps/rejected": -1866.400146484375,
"loss": 0.0581,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.044223010540008545,
"rewards/margins": 0.0066053010523319244,
"rewards/rejected": 0.03761770576238632,
"step": 320
},
{
"epoch": 0.17,
"learning_rate": 4.919767404070033e-06,
"logits/chosen": -0.2866571545600891,
"logits/rejected": -0.2904338836669922,
"logps/chosen": -2089.8603515625,
"logps/rejected": -1703.691650390625,
"loss": 0.0577,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.0637887567281723,
"rewards/margins": 0.019382018595933914,
"rewards/rejected": 0.04440673440694809,
"step": 330
},
{
"epoch": 0.18,
"learning_rate": 4.907881841897216e-06,
"logits/chosen": -0.2776980698108673,
"logits/rejected": -0.2663383185863495,
"logps/chosen": -1941.0628662109375,
"logps/rejected": -1724.725830078125,
"loss": 0.057,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.07600688189268112,
"rewards/margins": 0.014780363067984581,
"rewards/rejected": 0.06122652441263199,
"step": 340
},
{
"epoch": 0.18,
"learning_rate": 4.89519206674919e-06,
"logits/chosen": -0.28663453459739685,
"logits/rejected": -0.2781517803668976,
"logps/chosen": -2123.11865234375,
"logps/rejected": -1684.65625,
"loss": 0.0578,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.07864506542682648,
"rewards/margins": 0.023925408720970154,
"rewards/rejected": 0.05471965670585632,
"step": 350
},
{
"epoch": 0.19,
"learning_rate": 4.881702316907769e-06,
"logits/chosen": -0.2786110043525696,
"logits/rejected": -0.2901211082935333,
"logps/chosen": -2082.64208984375,
"logps/rejected": -1863.649169921875,
"loss": 0.067,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.052565790712833405,
"rewards/margins": 0.014038707129657269,
"rewards/rejected": 0.038527075201272964,
"step": 360
},
{
"epoch": 0.19,
"learning_rate": 4.86741709783982e-06,
"logits/chosen": -0.34627729654312134,
"logits/rejected": -0.33580657839775085,
"logps/chosen": -1979.3060302734375,
"logps/rejected": -1685.088134765625,
"loss": 0.0563,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.017150847241282463,
"rewards/margins": 0.008625769056379795,
"rewards/rejected": 0.008525079116225243,
"step": 370
},
{
"epoch": 0.2,
"learning_rate": 4.852341180692471e-06,
"logits/chosen": -0.28853368759155273,
"logits/rejected": -0.33309391140937805,
"logps/chosen": -2051.138671875,
"logps/rejected": -1604.300537109375,
"loss": 0.0629,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.03238735720515251,
"rewards/margins": 0.011523480527102947,
"rewards/rejected": 0.02086387760937214,
"step": 380
},
{
"epoch": 0.2,
"learning_rate": 4.836479600699579e-06,
"logits/chosen": -0.2653834819793701,
"logits/rejected": -0.27924028038978577,
"logps/chosen": -2167.791748046875,
"logps/rejected": -1883.7181396484375,
"loss": 0.0567,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.07641658931970596,
"rewards/margins": 0.014170339331030846,
"rewards/rejected": 0.06224624067544937,
"step": 390
},
{
"epoch": 0.21,
"learning_rate": 4.819837655500014e-06,
"logits/chosen": -0.23558492958545685,
"logits/rejected": -0.252250611782074,
"logps/chosen": -2008.2281494140625,
"logps/rejected": -1735.037109375,
"loss": 0.0522,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.08481944352388382,
"rewards/margins": 0.018725356087088585,
"rewards/rejected": 0.06609407812356949,
"step": 400
},
{
"epoch": 0.21,
"eval_logits/chosen": -0.26833415031433105,
"eval_logits/rejected": -0.27769944071769714,
"eval_logps/chosen": -2134.577880859375,
"eval_logps/rejected": -1810.503662109375,
"eval_loss": 0.052033666521310806,
"eval_rewards/accuracies": 0.5249999761581421,
"eval_rewards/chosen": 0.08182776719331741,
"eval_rewards/margins": 0.01591898687183857,
"eval_rewards/rejected": 0.06590878218412399,
"eval_runtime": 510.467,
"eval_samples_per_second": 3.918,
"eval_steps_per_second": 0.979,
"step": 400
},
{
"epoch": 0.21,
"learning_rate": 4.802420903368286e-06,
"logits/chosen": -0.22416555881500244,
"logits/rejected": -0.23775295913219452,
"logps/chosen": -2305.072265625,
"logps/rejected": -2017.150390625,
"loss": 0.055,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.07892463356256485,
"rewards/margins": 0.010944006033241749,
"rewards/rejected": 0.06798062473535538,
"step": 410
},
{
"epoch": 0.22,
"learning_rate": 4.784235161358124e-06,
"logits/chosen": -0.24204190075397491,
"logits/rejected": -0.24225695431232452,
"logps/chosen": -1825.3125,
"logps/rejected": -1693.0045166015625,
"loss": 0.0497,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.05524778366088867,
"rewards/margins": 0.006465147249400616,
"rewards/rejected": 0.048782628029584885,
"step": 420
},
{
"epoch": 0.23,
"learning_rate": 4.765286503359632e-06,
"logits/chosen": -0.23344504833221436,
"logits/rejected": -0.27365198731422424,
"logps/chosen": -2049.459716796875,
"logps/rejected": -1840.787841796875,
"loss": 0.0565,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.04559114947915077,
"rewards/margins": 0.006585550494492054,
"rewards/rejected": 0.039005596190690994,
"step": 430
},
{
"epoch": 0.23,
"learning_rate": 4.745581258070654e-06,
"logits/chosen": -0.27591726183891296,
"logits/rejected": -0.2608277499675751,
"logps/chosen": -1806.8870849609375,
"logps/rejected": -1811.4437255859375,
"loss": 0.0541,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": 0.033870596438646317,
"rewards/margins": 0.0030527892522513866,
"rewards/rejected": 0.030817802995443344,
"step": 440
},
{
"epoch": 0.24,
"learning_rate": 4.725126006883047e-06,
"logits/chosen": -0.2728896141052246,
"logits/rejected": -0.2633044123649597,
"logps/chosen": -2298.3818359375,
"logps/rejected": -2048.328125,
"loss": 0.052,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.027686957269906998,
"rewards/margins": 0.0040281787514686584,
"rewards/rejected": 0.02365877851843834,
"step": 450
},
{
"epoch": 0.24,
"learning_rate": 4.70392758168454e-06,
"logits/chosen": -0.2538016438484192,
"logits/rejected": -0.25012341141700745,
"logps/chosen": -2255.5146484375,
"logps/rejected": -1954.8531494140625,
"loss": 0.0536,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.03767388314008713,
"rewards/margins": 0.008208373561501503,
"rewards/rejected": 0.029465511441230774,
"step": 460
},
{
"epoch": 0.25,
"learning_rate": 4.68199306257695e-06,
"logits/chosen": -0.2599068284034729,
"logits/rejected": -0.26421061158180237,
"logps/chosen": -2130.776123046875,
"logps/rejected": -1925.4456787109375,
"loss": 0.0521,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.05753815174102783,
"rewards/margins": 0.011079727672040462,
"rewards/rejected": 0.04645842686295509,
"step": 470
},
{
"epoch": 0.25,
"learning_rate": 4.659329775511478e-06,
"logits/chosen": -0.27710121870040894,
"logits/rejected": -0.2857569754123688,
"logps/chosen": -2018.772705078125,
"logps/rejected": -1903.8472900390625,
"loss": 0.0537,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.07305508106946945,
"rewards/margins": 0.009427006356418133,
"rewards/rejected": 0.06362807750701904,
"step": 480
},
{
"epoch": 0.26,
"learning_rate": 4.635945289841902e-06,
"logits/chosen": -0.28116849064826965,
"logits/rejected": -0.2983720004558563,
"logps/chosen": -1921.1497802734375,
"logps/rejected": -1723.8843994140625,
"loss": 0.0443,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.0624106340110302,
"rewards/margins": 0.009529463946819305,
"rewards/rejected": 0.05288117378950119,
"step": 490
},
{
"epoch": 0.26,
"learning_rate": 4.611847415796476e-06,
"logits/chosen": -0.27481353282928467,
"logits/rejected": -0.29158735275268555,
"logps/chosen": -2325.54345703125,
"logps/rejected": -2043.1536865234375,
"loss": 0.0559,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.0638991966843605,
"rewards/margins": 0.01127773616462946,
"rewards/rejected": 0.052621446549892426,
"step": 500
},
{
"epoch": 0.26,
"eval_logits/chosen": -0.29030030965805054,
"eval_logits/rejected": -0.29912662506103516,
"eval_logps/chosen": -2155.4169921875,
"eval_logps/rejected": -1828.8736572265625,
"eval_loss": 0.05023103952407837,
"eval_rewards/accuracies": 0.5625,
"eval_rewards/chosen": 0.060988761484622955,
"eval_rewards/margins": 0.013449816033244133,
"eval_rewards/rejected": 0.04753894358873367,
"eval_runtime": 510.5382,
"eval_samples_per_second": 3.917,
"eval_steps_per_second": 0.979,
"step": 500
},
{
"epoch": 0.27,
"learning_rate": 4.587044201869378e-06,
"logits/chosen": -0.2749403417110443,
"logits/rejected": -0.28757306933403015,
"logps/chosen": -2167.8203125,
"logps/rejected": -1664.1771240234375,
"loss": 0.0518,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.06645651906728745,
"rewards/margins": 0.021903514862060547,
"rewards/rejected": 0.0445530042052269,
"step": 510
},
{
"epoch": 0.27,
"learning_rate": 4.561543932132574e-06,
"logits/chosen": -0.3093597888946533,
"logits/rejected": -0.3130527138710022,
"logps/chosen": -2028.697509765625,
"logps/rejected": -1775.0302734375,
"loss": 0.0559,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.07832999527454376,
"rewards/margins": 0.014924841932952404,
"rewards/rejected": 0.06340514868497849,
"step": 520
},
{
"epoch": 0.28,
"learning_rate": 4.535355123469009e-06,
"logits/chosen": -0.32513946294784546,
"logits/rejected": -0.34443390369415283,
"logps/chosen": -2135.48974609375,
"logps/rejected": -1824.90625,
"loss": 0.0565,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.06139357015490532,
"rewards/margins": 0.012111430056393147,
"rewards/rejected": 0.04928214102983475,
"step": 530
},
{
"epoch": 0.28,
"learning_rate": 4.508486522728037e-06,
"logits/chosen": -0.34302735328674316,
"logits/rejected": -0.36917632818222046,
"logps/chosen": -2007.6627197265625,
"logps/rejected": -1699.0699462890625,
"loss": 0.0676,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.04311789572238922,
"rewards/margins": 0.012735480442643166,
"rewards/rejected": 0.030382419005036354,
"step": 540
},
{
"epoch": 0.29,
"learning_rate": 4.480947103804044e-06,
"logits/chosen": -0.35971927642822266,
"logits/rejected": -0.36432451009750366,
"logps/chosen": -2163.0068359375,
"logps/rejected": -2066.22509765625,
"loss": 0.0428,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.04490477591753006,
"rewards/margins": 0.006312023848295212,
"rewards/rejected": 0.03859275206923485,
"step": 550
},
{
"epoch": 0.29,
"learning_rate": 4.452746064639239e-06,
"logits/chosen": -0.38384127616882324,
"logits/rejected": -0.3922134339809418,
"logps/chosen": -2226.274658203125,
"logps/rejected": -1989.887451171875,
"loss": 0.0582,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.057973384857177734,
"rewards/margins": 0.015295244753360748,
"rewards/rejected": 0.042678140103816986,
"step": 560
},
{
"epoch": 0.3,
"learning_rate": 4.423892824151617e-06,
"logits/chosen": -0.37657466530799866,
"logits/rejected": -0.38766008615493774,
"logps/chosen": -1836.3118896484375,
"logps/rejected": -1472.295654296875,
"loss": 0.0701,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.07644511014223099,
"rewards/margins": 0.02132570371031761,
"rewards/rejected": 0.05511941760778427,
"step": 570
},
{
"epoch": 0.3,
"learning_rate": 4.3943970190891164e-06,
"logits/chosen": -0.37011387944221497,
"logits/rejected": -0.42118391394615173,
"logps/chosen": -2419.860107421875,
"logps/rejected": -1769.7777099609375,
"loss": 0.0626,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.12023104727268219,
"rewards/margins": 0.03250167518854141,
"rewards/rejected": 0.08772937208414078,
"step": 580
},
{
"epoch": 0.31,
"learning_rate": 4.364268500811025e-06,
"logits/chosen": -0.35418859124183655,
"logits/rejected": -0.37661364674568176,
"logps/chosen": -1887.2279052734375,
"logps/rejected": -1624.3062744140625,
"loss": 0.072,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.08441803604364395,
"rewards/margins": 0.01872970722615719,
"rewards/rejected": 0.06568832695484161,
"step": 590
},
{
"epoch": 0.31,
"learning_rate": 4.333517331997704e-06,
"logits/chosen": -0.36238303780555725,
"logits/rejected": -0.36792057752609253,
"logps/chosen": -1933.2572021484375,
"logps/rejected": -1661.876953125,
"loss": 0.0546,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.06127943471074104,
"rewards/margins": 0.016731832176446915,
"rewards/rejected": 0.044547609984874725,
"step": 600
},
{
"epoch": 0.31,
"eval_logits/chosen": -0.37191054224967957,
"eval_logits/rejected": -0.38397690653800964,
"eval_logps/chosen": -2167.108642578125,
"eval_logps/rejected": -1839.52685546875,
"eval_loss": 0.05038134753704071,
"eval_rewards/accuracies": 0.5525000095367432,
"eval_rewards/chosen": 0.049297019839286804,
"eval_rewards/margins": 0.01241131592541933,
"eval_rewards/rejected": 0.03688570857048035,
"eval_runtime": 510.5837,
"eval_samples_per_second": 3.917,
"eval_steps_per_second": 0.979,
"step": 600
},
{
"epoch": 0.32,
"learning_rate": 4.302153783289737e-06,
"logits/chosen": -0.3634631633758545,
"logits/rejected": -0.37499555945396423,
"logps/chosen": -2023.001220703125,
"logps/rejected": -1739.4332275390625,
"loss": 0.0544,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.041168130934238434,
"rewards/margins": 0.010140376165509224,
"rewards/rejected": 0.03102775290608406,
"step": 610
},
{
"epoch": 0.32,
"learning_rate": 4.270188329857613e-06,
"logits/chosen": -0.3298744261264801,
"logits/rejected": -0.32282137870788574,
"logps/chosen": -2020.5091552734375,
"logps/rejected": -1689.3531494140625,
"loss": 0.047,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.050172846764326096,
"rewards/margins": 0.009077770635485649,
"rewards/rejected": 0.041095077991485596,
"step": 620
},
{
"epoch": 0.33,
"learning_rate": 4.237631647903115e-06,
"logits/chosen": -0.321160227060318,
"logits/rejected": -0.34205105900764465,
"logps/chosen": -1793.309326171875,
"logps/rejected": -1498.567626953125,
"loss": 0.049,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.0489073321223259,
"rewards/margins": 0.012575352564454079,
"rewards/rejected": 0.03633198142051697,
"step": 630
},
{
"epoch": 0.33,
"learning_rate": 4.204494611093548e-06,
"logits/chosen": -0.32717442512512207,
"logits/rejected": -0.34008845686912537,
"logps/chosen": -1978.6207275390625,
"logps/rejected": -1785.7669677734375,
"loss": 0.0617,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": 0.05437788367271423,
"rewards/margins": 0.005566168110817671,
"rewards/rejected": 0.048811715096235275,
"step": 640
},
{
"epoch": 0.34,
"learning_rate": 4.170788286930024e-06,
"logits/chosen": -0.3271678388118744,
"logits/rejected": -0.3383072018623352,
"logps/chosen": -2002.5355224609375,
"logps/rejected": -1623.6373291015625,
"loss": 0.0429,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.056448131799697876,
"rewards/margins": 0.013460059650242329,
"rewards/rejected": 0.04298807680606842,
"step": 650
},
{
"epoch": 0.35,
"learning_rate": 4.136523933051005e-06,
"logits/chosen": -0.28324219584465027,
"logits/rejected": -0.2753041982650757,
"logps/chosen": -1772.493896484375,
"logps/rejected": -1581.4808349609375,
"loss": 0.047,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.04456415772438049,
"rewards/margins": 0.007894165813922882,
"rewards/rejected": 0.03666999563574791,
"step": 660
},
{
"epoch": 0.35,
"learning_rate": 4.101712993472348e-06,
"logits/chosen": -0.286260187625885,
"logits/rejected": -0.3045397698879242,
"logps/chosen": -1830.4456787109375,
"logps/rejected": -1603.759521484375,
"loss": 0.0541,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.05162501335144043,
"rewards/margins": 0.011338387615978718,
"rewards/rejected": 0.040286630392074585,
"step": 670
},
{
"epoch": 0.36,
"learning_rate": 4.066367094765091e-06,
"logits/chosen": -0.2880704998970032,
"logits/rejected": -0.2942127585411072,
"logps/chosen": -2038.3916015625,
"logps/rejected": -1857.885498046875,
"loss": 0.0472,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.06582482159137726,
"rewards/margins": 0.009796356782317162,
"rewards/rejected": 0.05602846294641495,
"step": 680
},
{
"epoch": 0.36,
"learning_rate": 4.030498042172277e-06,
"logits/chosen": -0.29781144857406616,
"logits/rejected": -0.3116939663887024,
"logps/chosen": -2132.72802734375,
"logps/rejected": -1934.0364990234375,
"loss": 0.0439,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.06504924595355988,
"rewards/margins": 0.00774806085973978,
"rewards/rejected": 0.05730118602514267,
"step": 690
},
{
"epoch": 0.37,
"learning_rate": 3.994117815666095e-06,
"logits/chosen": -0.3007664084434509,
"logits/rejected": -0.29853954911231995,
"logps/chosen": -1988.636962890625,
"logps/rejected": -1707.418212890625,
"loss": 0.0443,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.05466890335083008,
"rewards/margins": 0.013832475058734417,
"rewards/rejected": 0.040836431086063385,
"step": 700
},
{
"epoch": 0.37,
"eval_logits/chosen": -0.3144506812095642,
"eval_logits/rejected": -0.3237921893596649,
"eval_logps/chosen": -2163.694091796875,
"eval_logps/rejected": -1836.1396484375,
"eval_loss": 0.05007108300924301,
"eval_rewards/accuracies": 0.5669999718666077,
"eval_rewards/chosen": 0.052711814641952515,
"eval_rewards/margins": 0.012439063750207424,
"eval_rewards/rejected": 0.04027275741100311,
"eval_runtime": 510.3528,
"eval_samples_per_second": 3.919,
"eval_steps_per_second": 0.98,
"step": 700
},
{
"epoch": 0.37,
"learning_rate": 3.957238565946672e-06,
"logits/chosen": -0.28171759843826294,
"logits/rejected": -0.3016406297683716,
"logps/chosen": -1951.7197265625,
"logps/rejected": -1821.9302978515625,
"loss": 0.0746,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.046342235058546066,
"rewards/margins": 0.00639796257019043,
"rewards/rejected": 0.03994427248835564,
"step": 710
},
{
"epoch": 0.38,
"learning_rate": 3.919872610383831e-06,
"logits/chosen": -0.30082041025161743,
"logits/rejected": -0.3195782005786896,
"logps/chosen": -2009.2193603515625,
"logps/rejected": -1790.225830078125,
"loss": 0.0684,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.04109364002943039,
"rewards/margins": 0.007526120636612177,
"rewards/rejected": 0.03356752544641495,
"step": 720
},
{
"epoch": 0.38,
"learning_rate": 3.882032428903195e-06,
"logits/chosen": -0.3266572058200836,
"logits/rejected": -0.3410620093345642,
"logps/chosen": -2097.94140625,
"logps/rejected": -1642.9635009765625,
"loss": 0.0475,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.06069540977478027,
"rewards/margins": 0.01798270270228386,
"rewards/rejected": 0.042712707072496414,
"step": 730
},
{
"epoch": 0.39,
"learning_rate": 3.84373065981799e-06,
"logits/chosen": -0.29377710819244385,
"logits/rejected": -0.2976624369621277,
"logps/chosen": -2122.676513671875,
"logps/rejected": -1956.9495849609375,
"loss": 0.0456,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.0653495341539383,
"rewards/margins": 0.013613177463412285,
"rewards/rejected": 0.051736362278461456,
"step": 740
},
{
"epoch": 0.39,
"learning_rate": 3.8049800956079552e-06,
"logits/chosen": -0.33634868264198303,
"logits/rejected": -0.3460080027580261,
"logps/chosen": -1977.577392578125,
"logps/rejected": -1716.836669921875,
"loss": 0.0618,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.05900438502430916,
"rewards/margins": 0.01787043735384941,
"rewards/rejected": 0.04113394767045975,
"step": 750
},
{
"epoch": 0.4,
"learning_rate": 3.765793678646753e-06,
"logits/chosen": -0.3246403634548187,
"logits/rejected": -0.3240343928337097,
"logps/chosen": -2022.0374755859375,
"logps/rejected": -1934.3929443359375,
"loss": 0.0499,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.056883443146944046,
"rewards/margins": 0.010135297663509846,
"rewards/rejected": 0.046748142689466476,
"step": 760
},
{
"epoch": 0.4,
"learning_rate": 3.726184496879323e-06,
"logits/chosen": -0.32194751501083374,
"logits/rejected": -0.3437530994415283,
"logps/chosen": -2066.994873046875,
"logps/rejected": -1785.517333984375,
"loss": 0.0618,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.06701908260583878,
"rewards/margins": 0.017347043380141258,
"rewards/rejected": 0.04967203736305237,
"step": 770
},
{
"epoch": 0.41,
"learning_rate": 3.686165779450619e-06,
"logits/chosen": -0.32135313749313354,
"logits/rejected": -0.33263832330703735,
"logps/chosen": -2046.4375,
"logps/rejected": -1752.5133056640625,
"loss": 0.0629,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.07863454520702362,
"rewards/margins": 0.013034949079155922,
"rewards/rejected": 0.06559960544109344,
"step": 780
},
{
"epoch": 0.41,
"learning_rate": 3.645750892287178e-06,
"logits/chosen": -0.30609697103500366,
"logits/rejected": -0.3328899145126343,
"logps/chosen": -2209.924560546875,
"logps/rejected": -1803.526123046875,
"loss": 0.06,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.08860823512077332,
"rewards/margins": 0.02287045121192932,
"rewards/rejected": 0.065737783908844,
"step": 790
},
{
"epoch": 0.42,
"learning_rate": 3.604953333633009e-06,
"logits/chosen": -0.301249623298645,
"logits/rejected": -0.3167082369327545,
"logps/chosen": -1958.112548828125,
"logps/rejected": -1749.8187255859375,
"loss": 0.0583,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.0594819113612175,
"rewards/margins": 0.00925761554390192,
"rewards/rejected": 0.050224293023347855,
"step": 800
},
{
"epoch": 0.42,
"eval_logits/chosen": -0.2990359365940094,
"eval_logits/rejected": -0.3079277575016022,
"eval_logps/chosen": -2160.533447265625,
"eval_logps/rejected": -1833.001220703125,
"eval_loss": 0.05018917843699455,
"eval_rewards/accuracies": 0.5625,
"eval_rewards/chosen": 0.055872511118650436,
"eval_rewards/margins": 0.012461244128644466,
"eval_rewards/rejected": 0.043411269783973694,
"eval_runtime": 510.4542,
"eval_samples_per_second": 3.918,
"eval_steps_per_second": 0.98,
"step": 800
},
{
"epoch": 0.42,
"learning_rate": 3.56378672954129e-06,
"logits/chosen": -0.2567403316497803,
"logits/rejected": -0.3088562786579132,
"logps/chosen": -1969.7041015625,
"logps/rejected": -1587.13134765625,
"loss": 0.0589,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.05050656199455261,
"rewards/margins": 0.01574171707034111,
"rewards/rejected": 0.0347648449242115,
"step": 810
},
{
"epoch": 0.43,
"learning_rate": 3.5222648293233806e-06,
"logits/chosen": -0.3206945061683655,
"logits/rejected": -0.32324275374412537,
"logps/chosen": -2125.584228515625,
"logps/rejected": -1908.9595947265625,
"loss": 0.0508,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.07730694115161896,
"rewards/margins": 0.020585492253303528,
"rewards/rejected": 0.05672144889831543,
"step": 820
},
{
"epoch": 0.43,
"learning_rate": 3.4804015009566573e-06,
"logits/chosen": -0.30177921056747437,
"logits/rejected": -0.30555492639541626,
"logps/chosen": -2047.0084228515625,
"logps/rejected": -1866.709228515625,
"loss": 0.0529,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.080367811024189,
"rewards/margins": 0.01644848845899105,
"rewards/rejected": 0.0639193207025528,
"step": 830
},
{
"epoch": 0.44,
"learning_rate": 3.4382107264527244e-06,
"logits/chosen": -0.2914479076862335,
"logits/rejected": -0.3034920394420624,
"logps/chosen": -2094.360595703125,
"logps/rejected": -1812.98046875,
"loss": 0.0469,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.08005829900503159,
"rewards/margins": 0.010312746278941631,
"rewards/rejected": 0.06974555552005768,
"step": 840
},
{
"epoch": 0.44,
"learning_rate": 3.3957065971875387e-06,
"logits/chosen": -0.3109249472618103,
"logits/rejected": -0.32668763399124146,
"logps/chosen": -2224.466796875,
"logps/rejected": -1824.3785400390625,
"loss": 0.0493,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.06907899677753448,
"rewards/margins": 0.012219742871820927,
"rewards/rejected": 0.056859247386455536,
"step": 850
},
{
"epoch": 0.45,
"learning_rate": 3.352903309194999e-06,
"logits/chosen": -0.29552769660949707,
"logits/rejected": -0.30279669165611267,
"logps/chosen": -2010.127685546875,
"logps/rejected": -1726.2581787109375,
"loss": 0.0523,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": 0.05864205211400986,
"rewards/margins": 0.011820727959275246,
"rewards/rejected": 0.04682133346796036,
"step": 860
},
{
"epoch": 0.46,
"learning_rate": 3.309815158425591e-06,
"logits/chosen": -0.30413001775741577,
"logits/rejected": -0.317624032497406,
"logps/chosen": -2200.095947265625,
"logps/rejected": -1815.937744140625,
"loss": 0.0634,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.07513566315174103,
"rewards/margins": 0.01764606684446335,
"rewards/rejected": 0.057489603757858276,
"step": 870
},
{
"epoch": 0.46,
"learning_rate": 3.266456535971654e-06,
"logits/chosen": -0.2624972462654114,
"logits/rejected": -0.28810930252075195,
"logps/chosen": -2114.169189453125,
"logps/rejected": -1792.790771484375,
"loss": 0.0522,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0786682516336441,
"rewards/margins": 0.01509961299598217,
"rewards/rejected": 0.06356863677501678,
"step": 880
},
{
"epoch": 0.47,
"learning_rate": 3.2228419232608692e-06,
"logits/chosen": -0.2368161380290985,
"logits/rejected": -0.24519118666648865,
"logps/chosen": -1967.4462890625,
"logps/rejected": -1798.807373046875,
"loss": 0.0492,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.07654932141304016,
"rewards/margins": 0.007422330789268017,
"rewards/rejected": 0.06912699341773987,
"step": 890
},
{
"epoch": 0.47,
"learning_rate": 3.1789858872195888e-06,
"logits/chosen": -0.21885935962200165,
"logits/rejected": -0.24373655021190643,
"logps/chosen": -2283.5390625,
"logps/rejected": -1840.8265380859375,
"loss": 0.0432,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.09643899649381638,
"rewards/margins": 0.018616409972310066,
"rewards/rejected": 0.07782258838415146,
"step": 900
},
{
"epoch": 0.47,
"eval_logits/chosen": -0.24547961354255676,
"eval_logits/rejected": -0.25286465883255005,
"eval_logps/chosen": -2129.181884765625,
"eval_logps/rejected": -1806.208740234375,
"eval_loss": 0.050007544457912445,
"eval_rewards/accuracies": 0.5485000014305115,
"eval_rewards/chosen": 0.0872238427400589,
"eval_rewards/margins": 0.017020048573613167,
"eval_rewards/rejected": 0.07020379602909088,
"eval_runtime": 510.5362,
"eval_samples_per_second": 3.917,
"eval_steps_per_second": 0.979,
"step": 900
},
{
"epoch": 0.48,
"learning_rate": 3.1349030754075945e-06,
"logits/chosen": -0.22288069128990173,
"logits/rejected": -0.2447211742401123,
"logps/chosen": -2140.885498046875,
"logps/rejected": -1654.9674072265625,
"loss": 0.0626,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.09542791545391083,
"rewards/margins": 0.02674751542508602,
"rewards/rejected": 0.06868041306734085,
"step": 910
},
{
"epoch": 0.48,
"learning_rate": 3.0906082111259313e-06,
"logits/chosen": -0.2237463891506195,
"logits/rejected": -0.2500147521495819,
"logps/chosen": -2403.999267578125,
"logps/rejected": -1815.796142578125,
"loss": 0.0436,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.09730223566293716,
"rewards/margins": 0.025161966681480408,
"rewards/rejected": 0.07214026153087616,
"step": 920
},
{
"epoch": 0.49,
"learning_rate": 3.046116088499449e-06,
"logits/chosen": -0.2534050941467285,
"logits/rejected": -0.27334827184677124,
"logps/chosen": -2099.66064453125,
"logps/rejected": -1671.605712890625,
"loss": 0.0409,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.09167732298374176,
"rewards/margins": 0.01841827854514122,
"rewards/rejected": 0.07325904071331024,
"step": 930
},
{
"epoch": 0.49,
"learning_rate": 3.0014415675356813e-06,
"logits/chosen": -0.24992087483406067,
"logits/rejected": -0.2547626495361328,
"logps/chosen": -2133.786376953125,
"logps/rejected": -1852.0260009765625,
"loss": 0.0401,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.09754703938961029,
"rewards/margins": 0.015503397211432457,
"rewards/rejected": 0.08204366266727448,
"step": 940
},
{
"epoch": 0.5,
"learning_rate": 2.9565995691617242e-06,
"logits/chosen": -0.23162353038787842,
"logits/rejected": -0.24423262476921082,
"logps/chosen": -1878.375732421875,
"logps/rejected": -1515.8773193359375,
"loss": 0.047,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.07727902382612228,
"rewards/margins": 0.018643613904714584,
"rewards/rejected": 0.058635413646698,
"step": 950
},
{
"epoch": 0.5,
"learning_rate": 2.9116050702407706e-06,
"logits/chosen": -0.2648778259754181,
"logits/rejected": -0.2825019359588623,
"logps/chosen": -2200.065185546875,
"logps/rejected": -1777.919677734375,
"loss": 0.0385,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.07928620278835297,
"rewards/margins": 0.01838754117488861,
"rewards/rejected": 0.06089866906404495,
"step": 960
},
{
"epoch": 0.51,
"learning_rate": 2.8664730985699537e-06,
"logits/chosen": -0.23890802264213562,
"logits/rejected": -0.2561323344707489,
"logps/chosen": -2259.957763671875,
"logps/rejected": -1922.253173828125,
"loss": 0.0508,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.06860624998807907,
"rewards/margins": 0.010691315867006779,
"rewards/rejected": 0.05791493132710457,
"step": 970
},
{
"epoch": 0.51,
"learning_rate": 2.8212187278611907e-06,
"logits/chosen": -0.2580435872077942,
"logits/rejected": -0.2608950734138489,
"logps/chosen": -2114.46044921875,
"logps/rejected": -1843.2421875,
"loss": 0.0502,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.060202427208423615,
"rewards/margins": 0.0087806461378932,
"rewards/rejected": 0.05142177268862724,
"step": 980
},
{
"epoch": 0.52,
"learning_rate": 2.7758570727066843e-06,
"logits/chosen": -0.26601457595825195,
"logits/rejected": -0.27015531063079834,
"logps/chosen": -1853.6099853515625,
"logps/rejected": -1555.1754150390625,
"loss": 0.0605,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.060611844062805176,
"rewards/margins": 0.013232124969363213,
"rewards/rejected": 0.047379713505506516,
"step": 990
},
{
"epoch": 0.52,
"learning_rate": 2.730403283530767e-06,
"logits/chosen": -0.24036483466625214,
"logits/rejected": -0.2455415278673172,
"logps/chosen": -1859.8070068359375,
"logps/rejected": -1691.760498046875,
"loss": 0.0538,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.06643722951412201,
"rewards/margins": 0.01359983254224062,
"rewards/rejected": 0.052837394177913666,
"step": 1000
},
{
"epoch": 0.52,
"eval_logits/chosen": -0.2564674913883209,
"eval_logits/rejected": -0.25929296016693115,
"eval_logps/chosen": -2156.65283203125,
"eval_logps/rejected": -1829.5831298828125,
"eval_loss": 0.04961266368627548,
"eval_rewards/accuracies": 0.5649999976158142,
"eval_rewards/chosen": 0.05975308269262314,
"eval_rewards/margins": 0.012923642992973328,
"eval_rewards/rejected": 0.04682943597435951,
"eval_runtime": 510.5574,
"eval_samples_per_second": 3.917,
"eval_steps_per_second": 0.979,
"step": 1000
},
{
"epoch": 0.53,
"learning_rate": 2.6848725415297888e-06,
"logits/chosen": -0.251176655292511,
"logits/rejected": -0.255452036857605,
"logps/chosen": -2234.47119140625,
"logps/rejected": -1848.127685546875,
"loss": 0.0405,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.06075858324766159,
"rewards/margins": 0.016493605449795723,
"rewards/rejected": 0.04426497966051102,
"step": 1010
},
{
"epoch": 0.53,
"learning_rate": 2.639280053601719e-06,
"logits/chosen": -0.2578621506690979,
"logits/rejected": -0.26309382915496826,
"logps/chosen": -2132.969970703125,
"logps/rejected": -1790.293212890625,
"loss": 0.034,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.06318069994449615,
"rewards/margins": 0.01207827776670456,
"rewards/rejected": 0.0511024184525013,
"step": 1020
},
{
"epoch": 0.54,
"learning_rate": 2.59364104726716e-06,
"logits/chosen": -0.25486692786216736,
"logits/rejected": -0.24112336337566376,
"logps/chosen": -1739.303955078125,
"logps/rejected": -1716.464599609375,
"loss": 0.0597,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.053387343883514404,
"rewards/margins": 0.00615662382915616,
"rewards/rejected": 0.04723071679472923,
"step": 1030
},
{
"epoch": 0.54,
"learning_rate": 2.547970765583491e-06,
"logits/chosen": -0.23829662799835205,
"logits/rejected": -0.2544878125190735,
"logps/chosen": -2119.11474609375,
"logps/rejected": -1764.7984619140625,
"loss": 0.0518,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.06005573272705078,
"rewards/margins": 0.01636183261871338,
"rewards/rejected": 0.0436939001083374,
"step": 1040
},
{
"epoch": 0.55,
"learning_rate": 2.502284462053799e-06,
"logits/chosen": -0.2548423409461975,
"logits/rejected": -0.24885638058185577,
"logps/chosen": -2136.72998046875,
"logps/rejected": -1792.2359619140625,
"loss": 0.0579,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.0639239102602005,
"rewards/margins": 0.014888137578964233,
"rewards/rejected": 0.04903577268123627,
"step": 1050
},
{
"epoch": 0.55,
"learning_rate": 2.456597395532338e-06,
"logits/chosen": -0.2554526925086975,
"logits/rejected": -0.29498496651649475,
"logps/chosen": -1785.2249755859375,
"logps/rejected": -1444.23291015625,
"loss": 0.0467,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.051430024206638336,
"rewards/margins": 0.014165714383125305,
"rewards/rejected": 0.03726430982351303,
"step": 1060
},
{
"epoch": 0.56,
"learning_rate": 2.4109248251281953e-06,
"logits/chosen": -0.25295186042785645,
"logits/rejected": -0.2443423569202423,
"logps/chosen": -2214.04541015625,
"logps/rejected": -1779.729248046875,
"loss": 0.0427,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.07067938894033432,
"rewards/margins": 0.0159921832382679,
"rewards/rejected": 0.05468720197677612,
"step": 1070
},
{
"epoch": 0.57,
"learning_rate": 2.365282005108875e-06,
"logits/chosen": -0.21839866042137146,
"logits/rejected": -0.22934658825397491,
"logps/chosen": -2245.37646484375,
"logps/rejected": -2051.3115234375,
"loss": 0.0387,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.07870273292064667,
"rewards/margins": 0.015385419130325317,
"rewards/rejected": 0.06331731379032135,
"step": 1080
},
{
"epoch": 0.57,
"learning_rate": 2.319684179805491e-06,
"logits/chosen": -0.2654665410518646,
"logits/rejected": -0.2958211302757263,
"logps/chosen": -2201.913818359375,
"logps/rejected": -1623.56298828125,
"loss": 0.0428,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.08239830285310745,
"rewards/margins": 0.022424213588237762,
"rewards/rejected": 0.05997408553957939,
"step": 1090
},
{
"epoch": 0.58,
"learning_rate": 2.2741465785212905e-06,
"logits/chosen": -0.24744835495948792,
"logits/rejected": -0.27335745096206665,
"logps/chosen": -2357.655029296875,
"logps/rejected": -1799.583740234375,
"loss": 0.0545,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.09354601800441742,
"rewards/margins": 0.019701533019542694,
"rewards/rejected": 0.07384449243545532,
"step": 1100
},
{
"epoch": 0.58,
"eval_logits/chosen": -0.23114541172981262,
"eval_logits/rejected": -0.2394075095653534,
"eval_logps/chosen": -2124.1083984375,
"eval_logps/rejected": -1802.593505859375,
"eval_loss": 0.04950037598609924,
"eval_rewards/accuracies": 0.5559999942779541,
"eval_rewards/chosen": 0.0922975018620491,
"eval_rewards/margins": 0.018478482961654663,
"eval_rewards/rejected": 0.07381902635097504,
"eval_runtime": 510.4268,
"eval_samples_per_second": 3.918,
"eval_steps_per_second": 0.98,
"step": 1100
},
{
"epoch": 0.58,
"learning_rate": 2.2286844104451848e-06,
"logits/chosen": -0.2077624499797821,
"logits/rejected": -0.2412451207637787,
"logps/chosen": -2270.83935546875,
"logps/rejected": -1800.899169921875,
"loss": 0.049,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.09902598708868027,
"rewards/margins": 0.026153406128287315,
"rewards/rejected": 0.0728725865483284,
"step": 1110
},
{
"epoch": 0.59,
"learning_rate": 2.183312859572008e-06,
"logits/chosen": -0.20635256171226501,
"logits/rejected": -0.19912874698638916,
"logps/chosen": -2253.06689453125,
"logps/rejected": -1995.773193359375,
"loss": 0.0598,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.08980433642864227,
"rewards/margins": 0.015359434299170971,
"rewards/rejected": 0.07444489747285843,
"step": 1120
},
{
"epoch": 0.59,
"learning_rate": 2.1380470796311843e-06,
"logits/chosen": -0.21904154121875763,
"logits/rejected": -0.24687853455543518,
"logps/chosen": -2060.12109375,
"logps/rejected": -1746.0015869140625,
"loss": 0.0457,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.0698038712143898,
"rewards/margins": 0.01698939874768257,
"rewards/rejected": 0.05281447246670723,
"step": 1130
},
{
"epoch": 0.6,
"learning_rate": 2.092902189025507e-06,
"logits/chosen": -0.2082248479127884,
"logits/rejected": -0.21504366397857666,
"logps/chosen": -2301.1181640625,
"logps/rejected": -1757.7796630859375,
"loss": 0.0446,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.0717499703168869,
"rewards/margins": 0.022804908454418182,
"rewards/rejected": 0.04894506186246872,
"step": 1140
},
{
"epoch": 0.6,
"learning_rate": 2.0478932657817105e-06,
"logits/chosen": -0.21141843497753143,
"logits/rejected": -0.2168281078338623,
"logps/chosen": -2182.329345703125,
"logps/rejected": -1772.6962890625,
"loss": 0.0492,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.07858923077583313,
"rewards/margins": 0.014738768339157104,
"rewards/rejected": 0.06385046243667603,
"step": 1150
},
{
"epoch": 0.61,
"learning_rate": 2.0030353425145376e-06,
"logits/chosen": -0.21451938152313232,
"logits/rejected": -0.23753699660301208,
"logps/chosen": -2020.727783203125,
"logps/rejected": -1757.3990478515625,
"loss": 0.0512,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.07901870459318161,
"rewards/margins": 0.01762666180729866,
"rewards/rejected": 0.06139205023646355,
"step": 1160
},
{
"epoch": 0.61,
"learning_rate": 1.958343401405964e-06,
"logits/chosen": -0.18361331522464752,
"logits/rejected": -0.1837645322084427,
"logps/chosen": -2371.175537109375,
"logps/rejected": -1958.777099609375,
"loss": 0.0508,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0850948616862297,
"rewards/margins": 0.020093852654099464,
"rewards/rejected": 0.06500101089477539,
"step": 1170
},
{
"epoch": 0.62,
"learning_rate": 1.9138323692012734e-06,
"logits/chosen": -0.22541293501853943,
"logits/rejected": -0.23021379113197327,
"logps/chosen": -2021.7099609375,
"logps/rejected": -1991.3853759765625,
"loss": 0.0582,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.06989626586437225,
"rewards/margins": 0.008970921859145164,
"rewards/rejected": 0.060925353318452835,
"step": 1180
},
{
"epoch": 0.62,
"learning_rate": 1.8695171122236443e-06,
"logits/chosen": -0.19789089262485504,
"logits/rejected": -0.21101799607276917,
"logps/chosen": -2177.219970703125,
"logps/rejected": -1758.7890625,
"loss": 0.0571,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.06635448336601257,
"rewards/margins": 0.014402633532881737,
"rewards/rejected": 0.05195184424519539,
"step": 1190
},
{
"epoch": 0.63,
"learning_rate": 1.8254124314089225e-06,
"logits/chosen": -0.2192670851945877,
"logits/rejected": -0.20262674987316132,
"logps/chosen": -2045.339111328125,
"logps/rejected": -1922.4957275390625,
"loss": 0.0481,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.06084597855806351,
"rewards/margins": 0.0031062946654856205,
"rewards/rejected": 0.05773968622088432,
"step": 1200
},
{
"epoch": 0.63,
"eval_logits/chosen": -0.21473824977874756,
"eval_logits/rejected": -0.2180851548910141,
"eval_logps/chosen": -2155.742919921875,
"eval_logps/rejected": -1829.7305908203125,
"eval_loss": 0.04951399564743042,
"eval_rewards/accuracies": 0.5684999823570251,
"eval_rewards/chosen": 0.06066294014453888,
"eval_rewards/margins": 0.013980962336063385,
"eval_rewards/rejected": 0.046681977808475494,
"eval_runtime": 510.4546,
"eval_samples_per_second": 3.918,
"eval_steps_per_second": 0.98,
"step": 1200
},
{
"epoch": 0.63,
"learning_rate": 1.781533057362221e-06,
"logits/chosen": -0.23097166419029236,
"logits/rejected": -0.2347377985715866,
"logps/chosen": -1948.361328125,
"logps/rejected": -1586.739013671875,
"loss": 0.0511,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.05206098034977913,
"rewards/margins": 0.012983322143554688,
"rewards/rejected": 0.03907765448093414,
"step": 1210
},
{
"epoch": 0.64,
"learning_rate": 1.7378936454380277e-06,
"logits/chosen": -0.20886722207069397,
"logits/rejected": -0.21028542518615723,
"logps/chosen": -2190.082763671875,
"logps/rejected": -1998.083984375,
"loss": 0.0517,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.060171376913785934,
"rewards/margins": 0.008495164103806019,
"rewards/rejected": 0.05167621374130249,
"step": 1220
},
{
"epoch": 0.64,
"learning_rate": 1.6945087708454273e-06,
"logits/chosen": -0.18295393884181976,
"logits/rejected": -0.1880742609500885,
"logps/chosen": -2117.80908203125,
"logps/rejected": -1735.502197265625,
"loss": 0.0499,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.060192208737134933,
"rewards/margins": 0.009002082981169224,
"rewards/rejected": 0.05119013041257858,
"step": 1230
},
{
"epoch": 0.65,
"learning_rate": 1.651392923780105e-06,
"logits/chosen": -0.19351014494895935,
"logits/rejected": -0.20447520911693573,
"logps/chosen": -2093.595703125,
"logps/rejected": -1855.68359375,
"loss": 0.0491,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.06083123758435249,
"rewards/margins": 0.008342139422893524,
"rewards/rejected": 0.05248909443616867,
"step": 1240
},
{
"epoch": 0.65,
"learning_rate": 1.608560504584737e-06,
"logits/chosen": -0.20279578864574432,
"logits/rejected": -0.21171894669532776,
"logps/chosen": -2053.47412109375,
"logps/rejected": -1884.202392578125,
"loss": 0.0516,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.06217293068766594,
"rewards/margins": 0.01359265111386776,
"rewards/rejected": 0.04858027398586273,
"step": 1250
},
{
"epoch": 0.66,
"learning_rate": 1.5660258189393945e-06,
"logits/chosen": -0.2138860523700714,
"logits/rejected": -0.20899005234241486,
"logps/chosen": -2356.08447265625,
"logps/rejected": -2091.6435546875,
"loss": 0.0455,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.06469549238681793,
"rewards/margins": 0.009515106678009033,
"rewards/rejected": 0.055180393159389496,
"step": 1260
},
{
"epoch": 0.66,
"learning_rate": 1.5238030730835578e-06,
"logits/chosen": -0.22949472069740295,
"logits/rejected": -0.22715874016284943,
"logps/chosen": -2223.537353515625,
"logps/rejected": -1747.806640625,
"loss": 0.0535,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.06524848937988281,
"rewards/margins": 0.01758180931210518,
"rewards/rejected": 0.047666680067777634,
"step": 1270
},
{
"epoch": 0.67,
"learning_rate": 1.4819063690713565e-06,
"logits/chosen": -0.19447948038578033,
"logits/rejected": -0.2098011076450348,
"logps/chosen": -1938.2740478515625,
"logps/rejected": -1719.427734375,
"loss": 0.0583,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.057724129408597946,
"rewards/margins": 0.012677346356213093,
"rewards/rejected": 0.04504678025841713,
"step": 1280
},
{
"epoch": 0.68,
"learning_rate": 1.4403497000615885e-06,
"logits/chosen": -0.20776407420635223,
"logits/rejected": -0.1970272809267044,
"logps/chosen": -1996.076171875,
"logps/rejected": -1722.409423828125,
"loss": 0.0624,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.06180752441287041,
"rewards/margins": 0.012057540938258171,
"rewards/rejected": 0.049749989062547684,
"step": 1290
},
{
"epoch": 0.68,
"learning_rate": 1.3991469456441273e-06,
"logits/chosen": -0.19028015434741974,
"logits/rejected": -0.18771126866340637,
"logps/chosen": -2132.365966796875,
"logps/rejected": -1654.494140625,
"loss": 0.0441,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.053263597190380096,
"rewards/margins": 0.01780819520354271,
"rewards/rejected": 0.03545539826154709,
"step": 1300
},
{
"epoch": 0.68,
"eval_logits/chosen": -0.21745455265045166,
"eval_logits/rejected": -0.22021788358688354,
"eval_logps/chosen": -2159.675537109375,
"eval_logps/rejected": -1833.5484619140625,
"eval_loss": 0.04945502430200577,
"eval_rewards/accuracies": 0.5690000057220459,
"eval_rewards/chosen": 0.05673002824187279,
"eval_rewards/margins": 0.013866120018064976,
"eval_rewards/rejected": 0.042863909155130386,
"eval_runtime": 510.5607,
"eval_samples_per_second": 3.917,
"eval_steps_per_second": 0.979,
"step": 1300
},
{
"epoch": 0.69,
"learning_rate": 1.3583118672042441e-06,
"logits/chosen": -0.20240898430347443,
"logits/rejected": -0.23169991374015808,
"logps/chosen": -2309.421630859375,
"logps/rejected": -1885.686279296875,
"loss": 0.0545,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.06426791846752167,
"rewards/margins": 0.016114329919219017,
"rewards/rejected": 0.0481535978615284,
"step": 1310
},
{
"epoch": 0.69,
"learning_rate": 1.3178581033264218e-06,
"logits/chosen": -0.2012084424495697,
"logits/rejected": -0.23439760506153107,
"logps/chosen": -1958.7874755859375,
"logps/rejected": -1563.35302734375,
"loss": 0.0578,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.05583029240369797,
"rewards/margins": 0.016821032389998436,
"rewards/rejected": 0.03900925815105438,
"step": 1320
},
{
"epoch": 0.7,
"learning_rate": 1.2777991652391757e-06,
"logits/chosen": -0.2176096886396408,
"logits/rejected": -0.23196351528167725,
"logps/chosen": -2121.034912109375,
"logps/rejected": -1711.7109375,
"loss": 0.0395,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.06722841411828995,
"rewards/margins": 0.011516690254211426,
"rewards/rejected": 0.055711716413497925,
"step": 1330
},
{
"epoch": 0.7,
"learning_rate": 1.2381484323024178e-06,
"logits/chosen": -0.19108158349990845,
"logits/rejected": -0.2023816853761673,
"logps/chosen": -2302.89697265625,
"logps/rejected": -1892.548095703125,
"loss": 0.055,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.08529181778430939,
"rewards/margins": 0.020634423941373825,
"rewards/rejected": 0.06465739011764526,
"step": 1340
},
{
"epoch": 0.71,
"learning_rate": 1.1989191475388518e-06,
"logits/chosen": -0.2374308556318283,
"logits/rejected": -0.2234220951795578,
"logps/chosen": -2113.50146484375,
"logps/rejected": -1874.0279541015625,
"loss": 0.0679,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.07649590075016022,
"rewards/margins": 0.014174291864037514,
"rewards/rejected": 0.06232162192463875,
"step": 1350
},
{
"epoch": 0.71,
"learning_rate": 1.160124413210918e-06,
"logits/chosen": -0.24203363060951233,
"logits/rejected": -0.2396487444639206,
"logps/chosen": -2022.40625,
"logps/rejected": -1880.9495849609375,
"loss": 0.0406,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.07169513404369354,
"rewards/margins": 0.015681343153119087,
"rewards/rejected": 0.05601378530263901,
"step": 1360
},
{
"epoch": 0.72,
"learning_rate": 1.1217771864447396e-06,
"logits/chosen": -0.2442229688167572,
"logits/rejected": -0.2381734549999237,
"logps/chosen": -2145.31689453125,
"logps/rejected": -1823.568359375,
"loss": 0.0528,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.059424418956041336,
"rewards/margins": 0.017248233780264854,
"rewards/rejected": 0.042176179587841034,
"step": 1370
},
{
"epoch": 0.72,
"learning_rate": 1.08389027490255e-06,
"logits/chosen": -0.22975793480873108,
"logits/rejected": -0.2311103641986847,
"logps/chosen": -2078.482421875,
"logps/rejected": -1920.7864990234375,
"loss": 0.0372,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.05944003537297249,
"rewards/margins": 0.012296736240386963,
"rewards/rejected": 0.04714329540729523,
"step": 1380
},
{
"epoch": 0.73,
"learning_rate": 1.046476332505036e-06,
"logits/chosen": -0.22442571818828583,
"logits/rejected": -0.24409636855125427,
"logps/chosen": -1815.8922119140625,
"logps/rejected": -1615.753173828125,
"loss": 0.0427,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": 0.0425129197537899,
"rewards/margins": 0.005890417378395796,
"rewards/rejected": 0.03662250563502312,
"step": 1390
},
{
"epoch": 0.73,
"learning_rate": 1.0095478552050348e-06,
"logits/chosen": -0.23266033828258514,
"logits/rejected": -0.22884194552898407,
"logps/chosen": -2215.30078125,
"logps/rejected": -1916.711669921875,
"loss": 0.0524,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.05568776652216911,
"rewards/margins": 0.016158053651452065,
"rewards/rejected": 0.0395297110080719,
"step": 1400
},
{
"epoch": 0.73,
"eval_logits/chosen": -0.24221491813659668,
"eval_logits/rejected": -0.24745041131973267,
"eval_logps/chosen": -2163.659912109375,
"eval_logps/rejected": -1837.5037841796875,
"eval_loss": 0.04962093383073807,
"eval_rewards/accuracies": 0.5684999823570251,
"eval_rewards/chosen": 0.05274572595953941,
"eval_rewards/margins": 0.013837032951414585,
"eval_rewards/rejected": 0.0389086939394474,
"eval_runtime": 510.7706,
"eval_samples_per_second": 3.916,
"eval_steps_per_second": 0.979,
"step": 1400
},
{
"epoch": 0.74,
"learning_rate": 9.731171768139808e-07,
"logits/chosen": -0.2232085019350052,
"logits/rejected": -0.2379104197025299,
"logps/chosen": -2600.76953125,
"logps/rejected": -2177.601806640625,
"loss": 0.0462,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.06885553896427155,
"rewards/margins": 0.017164334654808044,
"rewards/rejected": 0.0516912117600441,
"step": 1410
},
{
"epoch": 0.74,
"learning_rate": 9.371964648825221e-07,
"logits/chosen": -0.2546294629573822,
"logits/rejected": -0.2645355761051178,
"logps/chosen": -1949.8623046875,
"logps/rejected": -1700.5302734375,
"loss": 0.0471,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.05458490923047066,
"rewards/margins": 0.015815045684576035,
"rewards/rejected": 0.03876986354589462,
"step": 1420
},
{
"epoch": 0.75,
"learning_rate": 9.017977166366445e-07,
"logits/chosen": -0.2653118669986725,
"logits/rejected": -0.26530537009239197,
"logps/chosen": -2068.649169921875,
"logps/rejected": -1767.718017578125,
"loss": 0.0417,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.065969318151474,
"rewards/margins": 0.021549370139837265,
"rewards/rejected": 0.044419944286346436,
"step": 1430
},
{
"epoch": 0.75,
"learning_rate": 8.669327549707096e-07,
"logits/chosen": -0.24164719879627228,
"logits/rejected": -0.2417771816253662,
"logps/chosen": -2113.025390625,
"logps/rejected": -1844.6246337890625,
"loss": 0.037,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.07654988765716553,
"rewards/margins": 0.013212883844971657,
"rewards/rejected": 0.06333700567483902,
"step": 1440
},
{
"epoch": 0.76,
"learning_rate": 8.326132244986932e-07,
"logits/chosen": -0.22752514481544495,
"logits/rejected": -0.23347719013690948,
"logps/chosen": -1921.048828125,
"logps/rejected": -1598.9993896484375,
"loss": 0.0546,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.06702348589897156,
"rewards/margins": 0.018387358635663986,
"rewards/rejected": 0.048636119812726974,
"step": 1450
},
{
"epoch": 0.76,
"learning_rate": 7.988505876649863e-07,
"logits/chosen": -0.22021660208702087,
"logits/rejected": -0.21698196232318878,
"logps/chosen": -2022.852783203125,
"logps/rejected": -1766.4072265625,
"loss": 0.0638,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.06179197505116463,
"rewards/margins": 0.01092799287289381,
"rewards/rejected": 0.05086398124694824,
"step": 1460
},
{
"epoch": 0.77,
"learning_rate": 7.656561209160248e-07,
"logits/chosen": -0.22338895499706268,
"logits/rejected": -0.2199423760175705,
"logps/chosen": -1993.245361328125,
"logps/rejected": -1980.831787109375,
"loss": 0.0596,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.06557862460613251,
"rewards/margins": 0.014192071743309498,
"rewards/rejected": 0.051386553794145584,
"step": 1470
},
{
"epoch": 0.77,
"learning_rate": 7.330409109340563e-07,
"logits/chosen": -0.21173898875713348,
"logits/rejected": -0.23590870201587677,
"logps/chosen": -2017.1929931640625,
"logps/rejected": -1657.1859130859375,
"loss": 0.0577,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.055760689079761505,
"rewards/margins": 0.013670523650944233,
"rewards/rejected": 0.04209016636013985,
"step": 1480
},
{
"epoch": 0.78,
"learning_rate": 7.010158509342682e-07,
"logits/chosen": -0.23114773631095886,
"logits/rejected": -0.23729057610034943,
"logps/chosen": -2105.003173828125,
"logps/rejected": -1778.635498046875,
"loss": 0.0626,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.06233568117022514,
"rewards/margins": 0.0166020505130291,
"rewards/rejected": 0.04573363438248634,
"step": 1490
},
{
"epoch": 0.79,
"learning_rate": 6.695916370265529e-07,
"logits/chosen": -0.2337017059326172,
"logits/rejected": -0.234249085187912,
"logps/chosen": -2289.919189453125,
"logps/rejected": -2049.482177734375,
"loss": 0.0425,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.06871043145656586,
"rewards/margins": 0.010876113548874855,
"rewards/rejected": 0.05783431604504585,
"step": 1500
},
{
"epoch": 0.79,
"eval_logits/chosen": -0.22742050886154175,
"eval_logits/rejected": -0.23352740705013275,
"eval_logps/chosen": -2154.34033203125,
"eval_logps/rejected": -1829.7928466796875,
"eval_loss": 0.04929284378886223,
"eval_rewards/accuracies": 0.5674999952316284,
"eval_rewards/chosen": 0.06206566095352173,
"eval_rewards/margins": 0.015445946715772152,
"eval_rewards/rejected": 0.0466197207570076,
"eval_runtime": 510.6117,
"eval_samples_per_second": 3.917,
"eval_steps_per_second": 0.979,
"step": 1500
},
{
"epoch": 0.79,
"learning_rate": 6.387787646430854e-07,
"logits/chosen": -0.22113287448883057,
"logits/rejected": -0.21311786770820618,
"logps/chosen": -2256.49951171875,
"logps/rejected": -1961.322509765625,
"loss": 0.0569,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.06963467597961426,
"rewards/margins": 0.015023264102637768,
"rewards/rejected": 0.05461140722036362,
"step": 1510
},
{
"epoch": 0.8,
"learning_rate": 6.085875250329401e-07,
"logits/chosen": -0.24658381938934326,
"logits/rejected": -0.2602604925632477,
"logps/chosen": -1738.8958740234375,
"logps/rejected": -1530.5950927734375,
"loss": 0.0426,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.0541699044406414,
"rewards/margins": 0.007052128203213215,
"rewards/rejected": 0.04711777716875076,
"step": 1520
},
{
"epoch": 0.8,
"learning_rate": 5.79028001824894e-07,
"logits/chosen": -0.23341718316078186,
"logits/rejected": -0.2346893846988678,
"logps/chosen": -2004.2154541015625,
"logps/rejected": -1676.0980224609375,
"loss": 0.0508,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.06295724958181381,
"rewards/margins": 0.0180866289883852,
"rewards/rejected": 0.04487061873078346,
"step": 1530
},
{
"epoch": 0.81,
"learning_rate": 5.501100676595761e-07,
"logits/chosen": -0.23593036830425262,
"logits/rejected": -0.2430458515882492,
"logps/chosen": -2228.714111328125,
"logps/rejected": -1928.757080078125,
"loss": 0.0463,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.07435286045074463,
"rewards/margins": 0.01624133810400963,
"rewards/rejected": 0.0581115186214447,
"step": 1540
},
{
"epoch": 0.81,
"learning_rate": 5.218433808920884e-07,
"logits/chosen": -0.22192791104316711,
"logits/rejected": -0.23522309958934784,
"logps/chosen": -2106.592041015625,
"logps/rejected": -1706.7427978515625,
"loss": 0.0499,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.0745842233300209,
"rewards/margins": 0.023264039307832718,
"rewards/rejected": 0.05132018402218819,
"step": 1550
},
{
"epoch": 0.82,
"learning_rate": 4.942373823661928e-07,
"logits/chosen": -0.22204573452472687,
"logits/rejected": -0.22397270798683167,
"logps/chosen": -1921.546875,
"logps/rejected": -1686.9296875,
"loss": 0.0498,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.060963042080402374,
"rewards/margins": 0.011381834745407104,
"rewards/rejected": 0.04958119988441467,
"step": 1560
},
{
"epoch": 0.82,
"learning_rate": 4.6730129226114363e-07,
"logits/chosen": -0.22597956657409668,
"logits/rejected": -0.24938449263572693,
"logps/chosen": -1889.419189453125,
"logps/rejected": -1659.8782958984375,
"loss": 0.0636,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.06332211196422577,
"rewards/margins": 0.013857582584023476,
"rewards/rejected": 0.04946453124284744,
"step": 1570
},
{
"epoch": 0.83,
"learning_rate": 4.4104410701222703e-07,
"logits/chosen": -0.21659445762634277,
"logits/rejected": -0.22937150299549103,
"logps/chosen": -2046.7357177734375,
"logps/rejected": -1769.0198974609375,
"loss": 0.0451,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.06872855126857758,
"rewards/margins": 0.014504766091704369,
"rewards/rejected": 0.05422378331422806,
"step": 1580
},
{
"epoch": 0.83,
"learning_rate": 4.154745963060197e-07,
"logits/chosen": -0.21276862919330597,
"logits/rejected": -0.2097276896238327,
"logps/chosen": -2136.18505859375,
"logps/rejected": -2049.29931640625,
"loss": 0.0536,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": 0.07395701855421066,
"rewards/margins": 0.006907849106937647,
"rewards/rejected": 0.06704917550086975,
"step": 1590
},
{
"epoch": 0.84,
"learning_rate": 3.9060130015138863e-07,
"logits/chosen": -0.22508184611797333,
"logits/rejected": -0.23881450295448303,
"logps/chosen": -1998.213623046875,
"logps/rejected": -1759.222412109375,
"loss": 0.0387,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.06664810329675674,
"rewards/margins": 0.011612234637141228,
"rewards/rejected": 0.05503587797284126,
"step": 1600
},
{
"epoch": 0.84,
"eval_logits/chosen": -0.22299662232398987,
"eval_logits/rejected": -0.2297811657190323,
"eval_logps/chosen": -2145.159423828125,
"eval_logps/rejected": -1821.890869140625,
"eval_loss": 0.049171119928359985,
"eval_rewards/accuracies": 0.5705000162124634,
"eval_rewards/chosen": 0.07124640792608261,
"eval_rewards/margins": 0.01672479324042797,
"eval_rewards/rejected": 0.05452162027359009,
"eval_runtime": 510.5649,
"eval_samples_per_second": 3.917,
"eval_steps_per_second": 0.979,
"step": 1600
},
{
"epoch": 0.84,
"learning_rate": 3.664325260271953e-07,
"logits/chosen": -0.21083417534828186,
"logits/rejected": -0.21836061775684357,
"logps/chosen": -1921.0771484375,
"logps/rejected": -1615.395263671875,
"loss": 0.0511,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.06516659259796143,
"rewards/margins": 0.010829558596014977,
"rewards/rejected": 0.0543370358645916,
"step": 1610
},
{
"epoch": 0.85,
"learning_rate": 3.429763461076677e-07,
"logits/chosen": -0.24559417366981506,
"logits/rejected": -0.23943760991096497,
"logps/chosen": -1962.534423828125,
"logps/rejected": -1884.5029296875,
"loss": 0.0509,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.06632138788700104,
"rewards/margins": 0.007880722172558308,
"rewards/rejected": 0.0584406740963459,
"step": 1620
},
{
"epoch": 0.85,
"learning_rate": 3.202405945663556e-07,
"logits/chosen": -0.1999385952949524,
"logits/rejected": -0.22029852867126465,
"logps/chosen": -2002.773193359375,
"logps/rejected": -1645.963623046875,
"loss": 0.0417,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.06611990928649902,
"rewards/margins": 0.018743688240647316,
"rewards/rejected": 0.04737623408436775,
"step": 1630
},
{
"epoch": 0.86,
"learning_rate": 2.982328649595856e-07,
"logits/chosen": -0.23098058998584747,
"logits/rejected": -0.2513691782951355,
"logps/chosen": -2234.155517578125,
"logps/rejected": -1985.674560546875,
"loss": 0.0346,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.08107596635818481,
"rewards/margins": 0.014828977175056934,
"rewards/rejected": 0.06624698638916016,
"step": 1640
},
{
"epoch": 0.86,
"learning_rate": 2.7696050769026954e-07,
"logits/chosen": -0.18273136019706726,
"logits/rejected": -0.18867138028144836,
"logps/chosen": -2058.58349609375,
"logps/rejected": -1610.1470947265625,
"loss": 0.054,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.06953487545251846,
"rewards/margins": 0.014529886655509472,
"rewards/rejected": 0.055004991590976715,
"step": 1650
},
{
"epoch": 0.87,
"learning_rate": 2.564306275529341e-07,
"logits/chosen": -0.21245570480823517,
"logits/rejected": -0.23336832225322723,
"logps/chosen": -1910.864013671875,
"logps/rejected": -1668.5755615234375,
"loss": 0.0652,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.06216276437044144,
"rewards/margins": 0.015415112487971783,
"rewards/rejected": 0.04674764350056648,
"step": 1660
},
{
"epoch": 0.87,
"learning_rate": 2.3665008136077332e-07,
"logits/chosen": -0.2325417697429657,
"logits/rejected": -0.2111097276210785,
"logps/chosen": -2226.126220703125,
"logps/rejected": -2146.695556640625,
"loss": 0.0541,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.07398100197315216,
"rewards/margins": 0.01466774009168148,
"rewards/rejected": 0.05931326001882553,
"step": 1670
},
{
"epoch": 0.88,
"learning_rate": 2.1762547565553293e-07,
"logits/chosen": -0.23489132523536682,
"logits/rejected": -0.26181578636169434,
"logps/chosen": -2163.59130859375,
"logps/rejected": -1799.791015625,
"loss": 0.045,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0649464800953865,
"rewards/margins": 0.013443303294479847,
"rewards/rejected": 0.051503174006938934,
"step": 1680
},
{
"epoch": 0.88,
"learning_rate": 1.993631645009747e-07,
"logits/chosen": -0.2324393093585968,
"logits/rejected": -0.2406429946422577,
"logps/chosen": -2243.14990234375,
"logps/rejected": -1793.9261474609375,
"loss": 0.0459,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.07493821531534195,
"rewards/margins": 0.016798479482531548,
"rewards/rejected": 0.05813973397016525,
"step": 1690
},
{
"epoch": 0.89,
"learning_rate": 1.818692473606748e-07,
"logits/chosen": -0.2428218573331833,
"logits/rejected": -0.22781512141227722,
"logps/chosen": -1967.1754150390625,
"logps/rejected": -1719.901123046875,
"loss": 0.0556,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.05938352271914482,
"rewards/margins": 0.014253886416554451,
"rewards/rejected": 0.04512963443994522,
"step": 1700
},
{
"epoch": 0.89,
"eval_logits/chosen": -0.21960072219371796,
"eval_logits/rejected": -0.22588692605495453,
"eval_logps/chosen": -2149.13818359375,
"eval_logps/rejected": -1825.278564453125,
"eval_loss": 0.04918248951435089,
"eval_rewards/accuracies": 0.5674999952316284,
"eval_rewards/chosen": 0.06726768612861633,
"eval_rewards/margins": 0.016133680939674377,
"eval_rewards/rejected": 0.05113400146365166,
"eval_runtime": 510.6325,
"eval_samples_per_second": 3.917,
"eval_steps_per_second": 0.979,
"step": 1700
},
{
"epoch": 0.9,
"learning_rate": 1.6514956706084885e-07,
"logits/chosen": -0.1846579611301422,
"logits/rejected": -0.21179255843162537,
"logps/chosen": -2128.083740234375,
"logps/rejected": -1733.182861328125,
"loss": 0.0404,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.06305380910634995,
"rewards/margins": 0.013125176541507244,
"rewards/rejected": 0.049928631633520126,
"step": 1710
},
{
"epoch": 0.9,
"learning_rate": 1.4920970783889737e-07,
"logits/chosen": -0.19069206714630127,
"logits/rejected": -0.2094193696975708,
"logps/chosen": -2153.69580078125,
"logps/rejected": -1683.8616943359375,
"loss": 0.053,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.06470336019992828,
"rewards/margins": 0.014464011415839195,
"rewards/rejected": 0.05023934692144394,
"step": 1720
},
{
"epoch": 0.91,
"learning_rate": 1.340549934783164e-07,
"logits/chosen": -0.20567326247692108,
"logits/rejected": -0.22026868164539337,
"logps/chosen": -2285.531005859375,
"logps/rejected": -1900.779052734375,
"loss": 0.0393,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.07268913835287094,
"rewards/margins": 0.022747965529561043,
"rewards/rejected": 0.04994116351008415,
"step": 1730
},
{
"epoch": 0.91,
"learning_rate": 1.196904855305961e-07,
"logits/chosen": -0.21483811736106873,
"logits/rejected": -0.23524871468544006,
"logps/chosen": -2039.6002197265625,
"logps/rejected": -1728.5648193359375,
"loss": 0.0567,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.06144179031252861,
"rewards/margins": 0.012769539840519428,
"rewards/rejected": 0.04867224767804146,
"step": 1740
},
{
"epoch": 0.92,
"learning_rate": 1.0612098162470302e-07,
"logits/chosen": -0.209224671125412,
"logits/rejected": -0.22041518986225128,
"logps/chosen": -1966.912353515625,
"logps/rejected": -1770.239990234375,
"loss": 0.0473,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.0644414871931076,
"rewards/margins": 0.014524770900607109,
"rewards/rejected": 0.049916718155145645,
"step": 1750
},
{
"epoch": 0.92,
"learning_rate": 9.335101386471285e-08,
"logits/chosen": -0.1998235136270523,
"logits/rejected": -0.205234095454216,
"logps/chosen": -2081.581787109375,
"logps/rejected": -1733.4261474609375,
"loss": 0.0385,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.07012965530157089,
"rewards/margins": 0.018411414697766304,
"rewards/rejected": 0.05171824246644974,
"step": 1760
},
{
"epoch": 0.93,
"learning_rate": 8.138484731612273e-08,
"logits/chosen": -0.21281655132770538,
"logits/rejected": -0.2385600358247757,
"logps/chosen": -2208.79736328125,
"logps/rejected": -1743.3955078125,
"loss": 0.0508,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.06378593295812607,
"rewards/margins": 0.012727012857794762,
"rewards/rejected": 0.051058925688266754,
"step": 1770
},
{
"epoch": 0.93,
"learning_rate": 7.022647858135501e-08,
"logits/chosen": -0.22884276509284973,
"logits/rejected": -0.2317463457584381,
"logps/chosen": -2076.50048828125,
"logps/rejected": -1827.575439453125,
"loss": 0.0497,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.06807791441679001,
"rewards/margins": 0.011940672062337399,
"rewards/rejected": 0.05613725259900093,
"step": 1780
},
{
"epoch": 0.94,
"learning_rate": 5.987963446492384e-08,
"logits/chosen": -0.20104601979255676,
"logits/rejected": -0.19782570004463196,
"logps/chosen": -1880.1734619140625,
"logps/rejected": -1685.8695068359375,
"loss": 0.0577,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.05542607977986336,
"rewards/margins": 0.011624794453382492,
"rewards/rejected": 0.04380128160119057,
"step": 1790
},
{
"epoch": 0.94,
"learning_rate": 5.034777072871394e-08,
"logits/chosen": -0.1951800137758255,
"logits/rejected": -0.21924810111522675,
"logps/chosen": -1900.6998291015625,
"logps/rejected": -1563.48876953125,
"loss": 0.0519,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.05453474447131157,
"rewards/margins": 0.008965181186795235,
"rewards/rejected": 0.045569561421871185,
"step": 1800
},
{
"epoch": 0.94,
"eval_logits/chosen": -0.21762163937091827,
"eval_logits/rejected": -0.2241181582212448,
"eval_logps/chosen": -2148.109619140625,
"eval_logps/rejected": -1824.3348388671875,
"eval_loss": 0.04916713759303093,
"eval_rewards/accuracies": 0.5690000057220459,
"eval_rewards/chosen": 0.0682961568236351,
"eval_rewards/margins": 0.016218481585383415,
"eval_rewards/rejected": 0.05207766965031624,
"eval_runtime": 510.6606,
"eval_samples_per_second": 3.916,
"eval_steps_per_second": 0.979,
"step": 1800
},
{
"epoch": 0.95,
"learning_rate": 4.163407093778243e-08,
"logits/chosen": -0.1938626617193222,
"logits/rejected": -0.21109886467456818,
"logps/chosen": -1975.074951171875,
"logps/rejected": -1626.160888671875,
"loss": 0.0487,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.06530580669641495,
"rewards/margins": 0.017866965383291245,
"rewards/rejected": 0.0474388413131237,
"step": 1810
},
{
"epoch": 0.95,
"learning_rate": 3.37414453970758e-08,
"logits/chosen": -0.20253758132457733,
"logits/rejected": -0.21794748306274414,
"logps/chosen": -2259.384765625,
"logps/rejected": -2048.346435546875,
"loss": 0.055,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.07413917034864426,
"rewards/margins": 0.02017979882657528,
"rewards/rejected": 0.053959377110004425,
"step": 1820
},
{
"epoch": 0.96,
"learning_rate": 2.6672530179410183e-08,
"logits/chosen": -0.19098524749279022,
"logits/rejected": -0.19925786554813385,
"logps/chosen": -2076.921630859375,
"logps/rejected": -1773.484619140625,
"loss": 0.0567,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.06547501683235168,
"rewards/margins": 0.016652025282382965,
"rewards/rejected": 0.04882299154996872,
"step": 1830
},
{
"epoch": 0.96,
"learning_rate": 2.04296862450451e-08,
"logits/chosen": -0.20272760093212128,
"logits/rejected": -0.23647110164165497,
"logps/chosen": -2243.960205078125,
"logps/rejected": -2028.1578369140625,
"loss": 0.0483,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.07614084333181381,
"rewards/margins": 0.01566244289278984,
"rewards/rejected": 0.06047840043902397,
"step": 1840
},
{
"epoch": 0.97,
"learning_rate": 1.501499865314171e-08,
"logits/chosen": -0.22630052268505096,
"logits/rejected": -0.20891804993152618,
"logps/chosen": -1954.311279296875,
"logps/rejected": -1862.181640625,
"loss": 0.0493,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.06258489936590195,
"rewards/margins": 0.01086291205137968,
"rewards/rejected": 0.05172199010848999,
"step": 1850
},
{
"epoch": 0.97,
"learning_rate": 1.0430275865371265e-08,
"logits/chosen": -0.21302291750907898,
"logits/rejected": -0.22670722007751465,
"logps/chosen": -1873.691650390625,
"logps/rejected": -1755.8060302734375,
"loss": 0.0559,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.05816579982638359,
"rewards/margins": 0.013311423361301422,
"rewards/rejected": 0.04485438019037247,
"step": 1860
},
{
"epoch": 0.98,
"learning_rate": 6.677049141901315e-09,
"logits/chosen": -0.1987680345773697,
"logits/rejected": -0.2282913625240326,
"logps/chosen": -1935.877197265625,
"logps/rejected": -1745.5570068359375,
"loss": 0.0648,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.06320817768573761,
"rewards/margins": 0.013358126394450665,
"rewards/rejected": 0.049850039184093475,
"step": 1870
},
{
"epoch": 0.98,
"learning_rate": 3.756572029968708e-09,
"logits/chosen": -0.21312955021858215,
"logits/rejected": -0.22611579298973083,
"logps/chosen": -2255.653564453125,
"logps/rejected": -1939.3330078125,
"loss": 0.0459,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.07814554870128632,
"rewards/margins": 0.018966957926750183,
"rewards/rejected": 0.05917859077453613,
"step": 1880
},
{
"epoch": 0.99,
"learning_rate": 1.6698199452053199e-09,
"logits/chosen": -0.2335490882396698,
"logits/rejected": -0.22842903435230255,
"logps/chosen": -2171.2451171875,
"logps/rejected": -1897.8079833984375,
"loss": 0.0534,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.07153952866792679,
"rewards/margins": 0.017695123329758644,
"rewards/rejected": 0.053844403475522995,
"step": 1890
},
{
"epoch": 0.99,
"learning_rate": 4.1748984585560094e-10,
"logits/chosen": -0.18400521576404572,
"logits/rejected": -0.20925450325012207,
"logps/chosen": -2247.09765625,
"logps/rejected": -1860.405517578125,
"loss": 0.05,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0739186555147171,
"rewards/margins": 0.018731053918600082,
"rewards/rejected": 0.05518760159611702,
"step": 1900
},
{
"epoch": 0.99,
"eval_logits/chosen": -0.21872195601463318,
"eval_logits/rejected": -0.22539223730564117,
"eval_logps/chosen": -2148.457763671875,
"eval_logps/rejected": -1824.6458740234375,
"eval_loss": 0.04917627200484276,
"eval_rewards/accuracies": 0.5669999718666077,
"eval_rewards/chosen": 0.06794830411672592,
"eval_rewards/margins": 0.016181621700525284,
"eval_rewards/rejected": 0.05176668241620064,
"eval_runtime": 511.1542,
"eval_samples_per_second": 3.913,
"eval_steps_per_second": 0.978,
"step": 1900
},
{
"epoch": 1.0,
"learning_rate": 0.0,
"logits/chosen": -0.21931472420692444,
"logits/rejected": -0.21789617836475372,
"logps/chosen": -2259.647216796875,
"logps/rejected": -1940.0595703125,
"loss": 0.0525,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.07358353585004807,
"rewards/margins": 0.015202896669507027,
"rewards/rejected": 0.05838064104318619,
"step": 1910
},
{
"epoch": 1.0,
"step": 1910,
"total_flos": 0.0,
"train_loss": 0.05238237046290443,
"train_runtime": 26355.2814,
"train_samples_per_second": 1.16,
"train_steps_per_second": 0.072
}
],
"logging_steps": 10,
"max_steps": 1910,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}