Zenith-7B-dpo-v2 / checkpoint-500 /trainer_state.json
gagan3012's picture
Upload folder using huggingface_hub
bedefbe verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5215803885773895,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 5.000000000000001e-07,
"logits/chosen": -2.2717783451080322,
"logits/rejected": -2.2640371322631836,
"logps/chosen": -200.07493591308594,
"logps/rejected": -200.70086669921875,
"loss": 0.6789,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.023946668952703476,
"rewards/margins": 0.029492639005184174,
"rewards/rejected": -0.005545974709093571,
"step": 1
},
{
"epoch": 0.0,
"learning_rate": 1.0000000000000002e-06,
"logits/chosen": -2.2359213829040527,
"logits/rejected": -2.2241828441619873,
"logps/chosen": -188.74400329589844,
"logps/rejected": -181.30078125,
"loss": 0.7042,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.022877119481563568,
"rewards/margins": -0.02025613933801651,
"rewards/rejected": -0.0026209834031760693,
"step": 2
},
{
"epoch": 0.0,
"learning_rate": 1.5e-06,
"logits/chosen": -2.2504844665527344,
"logits/rejected": -2.2917656898498535,
"logps/chosen": -182.1482391357422,
"logps/rejected": -201.4050750732422,
"loss": 0.6934,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.007386707700788975,
"rewards/margins": 0.0019398471340537071,
"rewards/rejected": 0.005446866154670715,
"step": 3
},
{
"epoch": 0.0,
"learning_rate": 2.0000000000000003e-06,
"logits/chosen": -2.166776180267334,
"logits/rejected": -2.0744781494140625,
"logps/chosen": -173.78936767578125,
"logps/rejected": -150.8326416015625,
"loss": 0.6891,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0020218612626194954,
"rewards/margins": 0.009007596410810947,
"rewards/rejected": -0.011029457673430443,
"step": 4
},
{
"epoch": 0.01,
"learning_rate": 2.5e-06,
"logits/chosen": -2.1799259185791016,
"logits/rejected": -2.3425800800323486,
"logps/chosen": -137.8708953857422,
"logps/rejected": -148.37060546875,
"loss": 0.6906,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.012697315774857998,
"rewards/margins": 0.00599064864218235,
"rewards/rejected": 0.006706667132675648,
"step": 5
},
{
"epoch": 0.01,
"learning_rate": 3e-06,
"logits/chosen": -2.1913797855377197,
"logits/rejected": -2.1852920055389404,
"logps/chosen": -127.57758331298828,
"logps/rejected": -138.31591796875,
"loss": 0.7037,
"rewards/accuracies": 0.125,
"rewards/chosen": -0.010897636413574219,
"rewards/margins": -0.02074580080807209,
"rewards/rejected": 0.009848165325820446,
"step": 6
},
{
"epoch": 0.01,
"learning_rate": 3.5000000000000004e-06,
"logits/chosen": -2.1357011795043945,
"logits/rejected": -2.136214256286621,
"logps/chosen": -139.83346557617188,
"logps/rejected": -145.2589111328125,
"loss": 0.6845,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0056040287017822266,
"rewards/margins": 0.01838543452322483,
"rewards/rejected": -0.01278140489012003,
"step": 7
},
{
"epoch": 0.01,
"learning_rate": 4.000000000000001e-06,
"logits/chosen": -2.105727195739746,
"logits/rejected": -2.0614218711853027,
"logps/chosen": -195.34869384765625,
"logps/rejected": -206.2098388671875,
"loss": 0.6963,
"rewards/accuracies": 0.1875,
"rewards/chosen": 0.030789854004979134,
"rewards/margins": -0.005677317269146442,
"rewards/rejected": 0.03646716848015785,
"step": 8
},
{
"epoch": 0.01,
"learning_rate": 4.5e-06,
"logits/chosen": -2.3017406463623047,
"logits/rejected": -2.3441271781921387,
"logps/chosen": -174.49053955078125,
"logps/rejected": -197.5611572265625,
"loss": 0.6875,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.006907796021550894,
"rewards/margins": 0.012369632720947266,
"rewards/rejected": -0.019277429208159447,
"step": 9
},
{
"epoch": 0.01,
"learning_rate": 5e-06,
"logits/chosen": -2.257244825363159,
"logits/rejected": -2.2989351749420166,
"logps/chosen": -182.47164916992188,
"logps/rejected": -170.6004180908203,
"loss": 0.6925,
"rewards/accuracies": 0.375,
"rewards/chosen": 0.009080934338271618,
"rewards/margins": 0.0034890654496848583,
"rewards/rejected": 0.005591869354248047,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 5.500000000000001e-06,
"logits/chosen": -2.084881544113159,
"logits/rejected": -2.1664023399353027,
"logps/chosen": -147.18572998046875,
"logps/rejected": -154.4085693359375,
"loss": 0.7014,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.014512108638882637,
"rewards/margins": -0.0151824951171875,
"rewards/rejected": 0.0006703853141516447,
"step": 11
},
{
"epoch": 0.01,
"learning_rate": 6e-06,
"logits/chosen": -2.122969150543213,
"logits/rejected": -2.114781379699707,
"logps/chosen": -238.08493041992188,
"logps/rejected": -221.2827606201172,
"loss": 0.6798,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.003618289018049836,
"rewards/margins": 0.028017427772283554,
"rewards/rejected": -0.02439913898706436,
"step": 12
},
{
"epoch": 0.01,
"learning_rate": 6.5000000000000004e-06,
"logits/chosen": -2.3176565170288086,
"logits/rejected": -2.26943039894104,
"logps/chosen": -166.89556884765625,
"logps/rejected": -156.35850524902344,
"loss": 0.6901,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.013709260150790215,
"rewards/margins": 0.007927654311060905,
"rewards/rejected": 0.00578160397708416,
"step": 13
},
{
"epoch": 0.01,
"learning_rate": 7.000000000000001e-06,
"logits/chosen": -2.2808492183685303,
"logits/rejected": -2.295313596725464,
"logps/chosen": -158.31381225585938,
"logps/rejected": -165.48663330078125,
"loss": 0.6953,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.01445994433015585,
"rewards/margins": -0.002848696894943714,
"rewards/rejected": 0.017308639362454414,
"step": 14
},
{
"epoch": 0.02,
"learning_rate": 7.5e-06,
"logits/chosen": -2.428576707839966,
"logits/rejected": -2.4046826362609863,
"logps/chosen": -198.4075164794922,
"logps/rejected": -199.75180053710938,
"loss": 0.6883,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.012678648345172405,
"rewards/margins": 0.010429286397993565,
"rewards/rejected": 0.0022493600845336914,
"step": 15
},
{
"epoch": 0.02,
"learning_rate": 8.000000000000001e-06,
"logits/chosen": -2.102433681488037,
"logits/rejected": -2.098867893218994,
"logps/chosen": -130.66616821289062,
"logps/rejected": -129.43551635742188,
"loss": 0.6897,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.00463492888957262,
"rewards/margins": 0.007227444555610418,
"rewards/rejected": -0.0025925161316990852,
"step": 16
},
{
"epoch": 0.02,
"learning_rate": 8.500000000000002e-06,
"logits/chosen": -2.2135839462280273,
"logits/rejected": -2.2183382511138916,
"logps/chosen": -157.07391357421875,
"logps/rejected": -173.192138671875,
"loss": 0.7028,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.00783538818359375,
"rewards/margins": -0.018341876566410065,
"rewards/rejected": 0.010506488382816315,
"step": 17
},
{
"epoch": 0.02,
"learning_rate": 9e-06,
"logits/chosen": -2.2919342517852783,
"logits/rejected": -2.3105809688568115,
"logps/chosen": -212.9804229736328,
"logps/rejected": -213.6470947265625,
"loss": 0.6944,
"rewards/accuracies": 0.375,
"rewards/chosen": 0.017333555966615677,
"rewards/margins": -0.0019711018539965153,
"rewards/rejected": 0.019304655492305756,
"step": 18
},
{
"epoch": 0.02,
"learning_rate": 9.5e-06,
"logits/chosen": -2.150813579559326,
"logits/rejected": -2.1184804439544678,
"logps/chosen": -164.96514892578125,
"logps/rejected": -159.76754760742188,
"loss": 0.688,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.016628170385956764,
"rewards/margins": 0.01211006660014391,
"rewards/rejected": -0.02873823791742325,
"step": 19
},
{
"epoch": 0.02,
"learning_rate": 1e-05,
"logits/chosen": -2.357393264770508,
"logits/rejected": -2.284986734390259,
"logps/chosen": -206.59085083007812,
"logps/rejected": -196.2120819091797,
"loss": 0.6848,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.006941366009414196,
"rewards/margins": 0.018102647736668587,
"rewards/rejected": -0.011161278933286667,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 1.05e-05,
"logits/chosen": -2.1450111865997314,
"logits/rejected": -2.2165582180023193,
"logps/chosen": -157.14804077148438,
"logps/rejected": -169.00897216796875,
"loss": 0.6747,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.01565234735608101,
"rewards/margins": 0.03814287483692169,
"rewards/rejected": -0.022490523755550385,
"step": 21
},
{
"epoch": 0.02,
"learning_rate": 1.1000000000000001e-05,
"logits/chosen": -2.194695234298706,
"logits/rejected": -2.1587462425231934,
"logps/chosen": -180.54441833496094,
"logps/rejected": -179.88087463378906,
"loss": 0.6891,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.004329085350036621,
"rewards/margins": 0.008360721170902252,
"rewards/rejected": -0.0040316348895430565,
"step": 22
},
{
"epoch": 0.02,
"learning_rate": 1.1500000000000002e-05,
"logits/chosen": -2.224414825439453,
"logits/rejected": -2.2185750007629395,
"logps/chosen": -172.73263549804688,
"logps/rejected": -164.4583282470703,
"loss": 0.6916,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.007931852713227272,
"rewards/margins": 0.0039233677089214325,
"rewards/rejected": -0.011855222284793854,
"step": 23
},
{
"epoch": 0.03,
"learning_rate": 1.2e-05,
"logits/chosen": -2.2102274894714355,
"logits/rejected": -2.2018349170684814,
"logps/chosen": -187.68124389648438,
"logps/rejected": -195.62225341796875,
"loss": 0.7161,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.016138983890414238,
"rewards/margins": -0.04391060024499893,
"rewards/rejected": 0.027771614491939545,
"step": 24
},
{
"epoch": 0.03,
"learning_rate": 1.25e-05,
"logits/chosen": -2.124497413635254,
"logits/rejected": -2.180361270904541,
"logps/chosen": -173.36505126953125,
"logps/rejected": -188.89918518066406,
"loss": 0.7035,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.021524429321289062,
"rewards/margins": -0.02002444490790367,
"rewards/rejected": -0.0014999869745224714,
"step": 25
},
{
"epoch": 0.03,
"learning_rate": 1.3000000000000001e-05,
"logits/chosen": -2.154703140258789,
"logits/rejected": -2.2054295539855957,
"logps/chosen": -161.19815063476562,
"logps/rejected": -172.0135040283203,
"loss": 0.6995,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.007346701342612505,
"rewards/margins": -0.011799763888120651,
"rewards/rejected": 0.004453063011169434,
"step": 26
},
{
"epoch": 0.03,
"learning_rate": 1.3500000000000001e-05,
"logits/chosen": -2.2279651165008545,
"logits/rejected": -2.360706329345703,
"logps/chosen": -134.864501953125,
"logps/rejected": -170.91477966308594,
"loss": 0.6831,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.017622999846935272,
"rewards/margins": 0.021318625658750534,
"rewards/rejected": -0.03894162178039551,
"step": 27
},
{
"epoch": 0.03,
"learning_rate": 1.4000000000000001e-05,
"logits/chosen": -2.119718551635742,
"logits/rejected": -2.1303789615631104,
"logps/chosen": -153.42706298828125,
"logps/rejected": -149.59426879882812,
"loss": 0.7052,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.022306587547063828,
"rewards/margins": -0.02277979999780655,
"rewards/rejected": 0.00047321245074272156,
"step": 28
},
{
"epoch": 0.03,
"learning_rate": 1.45e-05,
"logits/chosen": -2.1661736965179443,
"logits/rejected": -2.200699806213379,
"logps/chosen": -134.8897705078125,
"logps/rejected": -173.7844696044922,
"loss": 0.6753,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.013965796679258347,
"rewards/margins": 0.03791213408112526,
"rewards/rejected": -0.05187792703509331,
"step": 29
},
{
"epoch": 0.03,
"learning_rate": 1.5e-05,
"logits/chosen": -2.1092920303344727,
"logits/rejected": -2.1575889587402344,
"logps/chosen": -156.42156982421875,
"logps/rejected": -184.9061737060547,
"loss": 0.6931,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.02043609507381916,
"rewards/margins": 0.0014666561037302017,
"rewards/rejected": -0.02190275304019451,
"step": 30
},
{
"epoch": 0.03,
"learning_rate": 1.55e-05,
"logits/chosen": -2.1494805812835693,
"logits/rejected": -2.235766887664795,
"logps/chosen": -147.58779907226562,
"logps/rejected": -176.71292114257812,
"loss": 0.6972,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.020243335515260696,
"rewards/margins": -0.007363701239228249,
"rewards/rejected": -0.012879634276032448,
"step": 31
},
{
"epoch": 0.03,
"learning_rate": 1.6000000000000003e-05,
"logits/chosen": -2.0302445888519287,
"logits/rejected": -2.072943687438965,
"logps/chosen": -161.97325134277344,
"logps/rejected": -169.5047149658203,
"loss": 0.7081,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.033078648149967194,
"rewards/margins": -0.026705406606197357,
"rewards/rejected": -0.006373238749802113,
"step": 32
},
{
"epoch": 0.03,
"learning_rate": 1.65e-05,
"logits/chosen": -2.1231038570404053,
"logits/rejected": -2.164695978164673,
"logps/chosen": -177.8040313720703,
"logps/rejected": -184.5164794921875,
"loss": 0.7067,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.02300238609313965,
"rewards/margins": -0.026141025125980377,
"rewards/rejected": 0.003138638101518154,
"step": 33
},
{
"epoch": 0.04,
"learning_rate": 1.7000000000000003e-05,
"logits/chosen": -2.297323703765869,
"logits/rejected": -2.2965850830078125,
"logps/chosen": -138.19195556640625,
"logps/rejected": -146.45855712890625,
"loss": 0.6963,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.01872837543487549,
"rewards/margins": -0.003586245933547616,
"rewards/rejected": -0.01514213066548109,
"step": 34
},
{
"epoch": 0.04,
"learning_rate": 1.75e-05,
"logits/chosen": -1.9681299924850464,
"logits/rejected": -2.0026988983154297,
"logps/chosen": -148.55194091796875,
"logps/rejected": -154.24107360839844,
"loss": 0.6965,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.027061177417635918,
"rewards/margins": -0.003932238090783358,
"rewards/rejected": -0.023128939792513847,
"step": 35
},
{
"epoch": 0.04,
"learning_rate": 1.8e-05,
"logits/chosen": -2.351780414581299,
"logits/rejected": -2.4137086868286133,
"logps/chosen": -206.77256774902344,
"logps/rejected": -187.95159912109375,
"loss": 0.6941,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.015692900866270065,
"rewards/margins": -0.0006634213495999575,
"rewards/rejected": -0.015029479749500751,
"step": 36
},
{
"epoch": 0.04,
"learning_rate": 1.85e-05,
"logits/chosen": -2.145651340484619,
"logits/rejected": -2.1530380249023438,
"logps/chosen": -174.49249267578125,
"logps/rejected": -170.29107666015625,
"loss": 0.6981,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.05813932418823242,
"rewards/margins": -0.007088134065270424,
"rewards/rejected": -0.05105118826031685,
"step": 37
},
{
"epoch": 0.04,
"learning_rate": 1.9e-05,
"logits/chosen": -2.119550943374634,
"logits/rejected": -2.1285836696624756,
"logps/chosen": -158.59103393554688,
"logps/rejected": -161.4877471923828,
"loss": 0.6626,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.004540919791907072,
"rewards/margins": 0.06354211270809174,
"rewards/rejected": -0.0680830180644989,
"step": 38
},
{
"epoch": 0.04,
"learning_rate": 1.9500000000000003e-05,
"logits/chosen": -2.0205461978912354,
"logits/rejected": -2.0473952293395996,
"logps/chosen": -132.5583953857422,
"logps/rejected": -158.61367797851562,
"loss": 0.7152,
"rewards/accuracies": 0.1875,
"rewards/chosen": -0.05132186412811279,
"rewards/margins": -0.04252650961279869,
"rewards/rejected": -0.008795355446636677,
"step": 39
},
{
"epoch": 0.04,
"learning_rate": 2e-05,
"logits/chosen": -2.1021435260772705,
"logits/rejected": -2.056175708770752,
"logps/chosen": -164.51136779785156,
"logps/rejected": -140.69076538085938,
"loss": 0.704,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.042324475944042206,
"rewards/margins": -0.01991286501288414,
"rewards/rejected": -0.022411609068512917,
"step": 40
},
{
"epoch": 0.04,
"learning_rate": 2.05e-05,
"logits/chosen": -2.0000710487365723,
"logits/rejected": -2.0445573329925537,
"logps/chosen": -144.15602111816406,
"logps/rejected": -158.8379364013672,
"loss": 0.687,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.007117677479982376,
"rewards/margins": 0.014511799439787865,
"rewards/rejected": -0.007394121494144201,
"step": 41
},
{
"epoch": 0.04,
"learning_rate": 2.1e-05,
"logits/chosen": -2.1619393825531006,
"logits/rejected": -2.2450132369995117,
"logps/chosen": -147.3644561767578,
"logps/rejected": -152.8286895751953,
"loss": 0.6965,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.050475671887397766,
"rewards/margins": -0.003598665352910757,
"rewards/rejected": -0.04687700420618057,
"step": 42
},
{
"epoch": 0.04,
"learning_rate": 2.15e-05,
"logits/chosen": -2.103001356124878,
"logits/rejected": -2.1376430988311768,
"logps/chosen": -197.96510314941406,
"logps/rejected": -183.20042419433594,
"loss": 0.6886,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.02689351886510849,
"rewards/margins": 0.01161129493266344,
"rewards/rejected": -0.038504816591739655,
"step": 43
},
{
"epoch": 0.05,
"learning_rate": 2.2000000000000003e-05,
"logits/chosen": -2.249006748199463,
"logits/rejected": -2.2560691833496094,
"logps/chosen": -183.13180541992188,
"logps/rejected": -191.20266723632812,
"loss": 0.6994,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0708339661359787,
"rewards/margins": -0.009988496080040932,
"rewards/rejected": -0.06084546819329262,
"step": 44
},
{
"epoch": 0.05,
"learning_rate": 2.25e-05,
"logits/chosen": -2.076373815536499,
"logits/rejected": -2.1558804512023926,
"logps/chosen": -173.52224731445312,
"logps/rejected": -189.98086547851562,
"loss": 0.6965,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.061136387288570404,
"rewards/margins": -0.0023721233010292053,
"rewards/rejected": -0.0587642677128315,
"step": 45
},
{
"epoch": 0.05,
"learning_rate": 2.3000000000000003e-05,
"logits/chosen": -2.2398018836975098,
"logits/rejected": -2.1674294471740723,
"logps/chosen": -162.8035125732422,
"logps/rejected": -156.17276000976562,
"loss": 0.7007,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.06224403530359268,
"rewards/margins": -0.008054491132497787,
"rewards/rejected": -0.054189540445804596,
"step": 46
},
{
"epoch": 0.05,
"learning_rate": 2.35e-05,
"logits/chosen": -2.152360200881958,
"logits/rejected": -2.221158742904663,
"logps/chosen": -241.5851593017578,
"logps/rejected": -263.8956298828125,
"loss": 0.6805,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.04438929632306099,
"rewards/margins": 0.027727916836738586,
"rewards/rejected": -0.07211720943450928,
"step": 47
},
{
"epoch": 0.05,
"learning_rate": 2.4e-05,
"logits/chosen": -2.180600166320801,
"logits/rejected": -2.184905529022217,
"logps/chosen": -146.00100708007812,
"logps/rejected": -172.64227294921875,
"loss": 0.7003,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.04197859391570091,
"rewards/margins": -0.011492157354950905,
"rewards/rejected": -0.030486440286040306,
"step": 48
},
{
"epoch": 0.05,
"learning_rate": 2.45e-05,
"logits/chosen": -2.113537549972534,
"logits/rejected": -2.1188883781433105,
"logps/chosen": -145.5101318359375,
"logps/rejected": -145.72979736328125,
"loss": 0.6969,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09581132233142853,
"rewards/margins": -0.0029689306393265724,
"rewards/rejected": -0.09284238517284393,
"step": 49
},
{
"epoch": 0.05,
"learning_rate": 2.5e-05,
"logits/chosen": -2.2486584186553955,
"logits/rejected": -2.2213432788848877,
"logps/chosen": -165.0408172607422,
"logps/rejected": -172.3516845703125,
"loss": 0.672,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07465384155511856,
"rewards/margins": 0.04853079840540886,
"rewards/rejected": -0.12318463623523712,
"step": 50
},
{
"epoch": 0.05,
"learning_rate": 2.5500000000000003e-05,
"logits/chosen": -2.0694539546966553,
"logits/rejected": -2.0400142669677734,
"logps/chosen": -177.0946502685547,
"logps/rejected": -164.363037109375,
"loss": 0.734,
"rewards/accuracies": 0.1875,
"rewards/chosen": -0.11727481335401535,
"rewards/margins": -0.07744203507900238,
"rewards/rejected": -0.03983278200030327,
"step": 51
},
{
"epoch": 0.05,
"learning_rate": 2.6000000000000002e-05,
"logits/chosen": -2.2477283477783203,
"logits/rejected": -2.322450876235962,
"logps/chosen": -180.88040161132812,
"logps/rejected": -194.59298706054688,
"loss": 0.7052,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.05896530672907829,
"rewards/margins": -0.019788123667240143,
"rewards/rejected": -0.03917717933654785,
"step": 52
},
{
"epoch": 0.06,
"learning_rate": 2.6500000000000004e-05,
"logits/chosen": -2.28759765625,
"logits/rejected": -2.2965030670166016,
"logps/chosen": -166.57757568359375,
"logps/rejected": -168.3147430419922,
"loss": 0.7066,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.11757899075746536,
"rewards/margins": -0.023945949971675873,
"rewards/rejected": -0.09363303333520889,
"step": 53
},
{
"epoch": 0.06,
"learning_rate": 2.7000000000000002e-05,
"logits/chosen": -2.1356818675994873,
"logits/rejected": -2.077864170074463,
"logps/chosen": -163.74700927734375,
"logps/rejected": -186.17466735839844,
"loss": 0.6966,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0876019299030304,
"rewards/margins": -0.0037088878452777863,
"rewards/rejected": -0.08389303088188171,
"step": 54
},
{
"epoch": 0.06,
"learning_rate": 2.7500000000000004e-05,
"logits/chosen": -2.3550398349761963,
"logits/rejected": -2.3566269874572754,
"logps/chosen": -201.12591552734375,
"logps/rejected": -196.99490356445312,
"loss": 0.6809,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.14813652634620667,
"rewards/margins": 0.03020734339952469,
"rewards/rejected": -0.17834386229515076,
"step": 55
},
{
"epoch": 0.06,
"learning_rate": 2.8000000000000003e-05,
"logits/chosen": -2.1153206825256348,
"logits/rejected": -2.172855854034424,
"logps/chosen": -176.15061950683594,
"logps/rejected": -198.8987579345703,
"loss": 0.7242,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.1643160581588745,
"rewards/margins": -0.05339653417468071,
"rewards/rejected": -0.1109195277094841,
"step": 56
},
{
"epoch": 0.06,
"learning_rate": 2.8499999999999998e-05,
"logits/chosen": -2.0671656131744385,
"logits/rejected": -2.146867036819458,
"logps/chosen": -190.34786987304688,
"logps/rejected": -229.0176544189453,
"loss": 0.6826,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.12184757739305496,
"rewards/margins": 0.025761937722563744,
"rewards/rejected": -0.14760951697826385,
"step": 57
},
{
"epoch": 0.06,
"learning_rate": 2.9e-05,
"logits/chosen": -2.1337296962738037,
"logits/rejected": -2.1995086669921875,
"logps/chosen": -152.1367950439453,
"logps/rejected": -201.97044372558594,
"loss": 0.6778,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1797308623790741,
"rewards/margins": 0.04084575176239014,
"rewards/rejected": -0.22057661414146423,
"step": 58
},
{
"epoch": 0.06,
"learning_rate": 2.95e-05,
"logits/chosen": -1.802043080329895,
"logits/rejected": -1.7019507884979248,
"logps/chosen": -136.04141235351562,
"logps/rejected": -147.78366088867188,
"loss": 0.6867,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.14073634147644043,
"rewards/margins": 0.02230207994580269,
"rewards/rejected": -0.163038432598114,
"step": 59
},
{
"epoch": 0.06,
"learning_rate": 3e-05,
"logits/chosen": -2.1223766803741455,
"logits/rejected": -2.229828357696533,
"logps/chosen": -163.0341033935547,
"logps/rejected": -186.51254272460938,
"loss": 0.6933,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.15651509165763855,
"rewards/margins": 0.003519295249134302,
"rewards/rejected": -0.16003437340259552,
"step": 60
},
{
"epoch": 0.06,
"learning_rate": 3.05e-05,
"logits/chosen": -2.0530059337615967,
"logits/rejected": -2.076582908630371,
"logps/chosen": -140.6927490234375,
"logps/rejected": -159.9715576171875,
"loss": 0.7026,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1471368670463562,
"rewards/margins": -0.013904569670557976,
"rewards/rejected": -0.13323231041431427,
"step": 61
},
{
"epoch": 0.06,
"learning_rate": 3.1e-05,
"logits/chosen": -2.2076821327209473,
"logits/rejected": -2.2303273677825928,
"logps/chosen": -162.5172882080078,
"logps/rejected": -172.63409423828125,
"loss": 0.7205,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.21913205087184906,
"rewards/margins": -0.046682070940732956,
"rewards/rejected": -0.1724499762058258,
"step": 62
},
{
"epoch": 0.07,
"learning_rate": 3.15e-05,
"logits/chosen": -2.1623778343200684,
"logits/rejected": -2.109609365463257,
"logps/chosen": -190.25259399414062,
"logps/rejected": -181.3391876220703,
"loss": 0.6934,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2230222225189209,
"rewards/margins": 0.014317656867206097,
"rewards/rejected": -0.23733988404273987,
"step": 63
},
{
"epoch": 0.07,
"learning_rate": 3.2000000000000005e-05,
"logits/chosen": -2.1411020755767822,
"logits/rejected": -2.087689161300659,
"logps/chosen": -149.68978881835938,
"logps/rejected": -144.92745971679688,
"loss": 0.68,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.18927860260009766,
"rewards/margins": 0.03361104428768158,
"rewards/rejected": -0.22288964688777924,
"step": 64
},
{
"epoch": 0.07,
"learning_rate": 3.2500000000000004e-05,
"logits/chosen": -2.1835758686065674,
"logits/rejected": -2.1907880306243896,
"logps/chosen": -164.97738647460938,
"logps/rejected": -180.56930541992188,
"loss": 0.6891,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.20788520574569702,
"rewards/margins": 0.012470051646232605,
"rewards/rejected": -0.22035524249076843,
"step": 65
},
{
"epoch": 0.07,
"learning_rate": 3.3e-05,
"logits/chosen": -2.3262267112731934,
"logits/rejected": -2.2733724117279053,
"logps/chosen": -179.5419921875,
"logps/rejected": -180.0846710205078,
"loss": 0.7209,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.21241910755634308,
"rewards/margins": -0.04504784941673279,
"rewards/rejected": -0.16737127304077148,
"step": 66
},
{
"epoch": 0.07,
"learning_rate": 3.35e-05,
"logits/chosen": -2.183920383453369,
"logits/rejected": -2.155302047729492,
"logps/chosen": -165.67642211914062,
"logps/rejected": -179.09642028808594,
"loss": 0.6755,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.18955731391906738,
"rewards/margins": 0.04666180908679962,
"rewards/rejected": -0.2362191379070282,
"step": 67
},
{
"epoch": 0.07,
"learning_rate": 3.4000000000000007e-05,
"logits/chosen": -2.099074602127075,
"logits/rejected": -2.144994020462036,
"logps/chosen": -146.17672729492188,
"logps/rejected": -151.28427124023438,
"loss": 0.737,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.26217174530029297,
"rewards/margins": -0.07887978851795197,
"rewards/rejected": -0.1832919716835022,
"step": 68
},
{
"epoch": 0.07,
"learning_rate": 3.45e-05,
"logits/chosen": -2.2945265769958496,
"logits/rejected": -2.2669451236724854,
"logps/chosen": -189.3528594970703,
"logps/rejected": -166.80990600585938,
"loss": 0.6892,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2909608483314514,
"rewards/margins": 0.024565648287534714,
"rewards/rejected": -0.31552648544311523,
"step": 69
},
{
"epoch": 0.07,
"learning_rate": 3.5e-05,
"logits/chosen": -2.0787875652313232,
"logits/rejected": -2.090841054916382,
"logps/chosen": -192.3681640625,
"logps/rejected": -224.5333251953125,
"loss": 0.7097,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.29522770643234253,
"rewards/margins": -0.02591385506093502,
"rewards/rejected": -0.26931384205818176,
"step": 70
},
{
"epoch": 0.07,
"learning_rate": 3.55e-05,
"logits/chosen": -2.1215460300445557,
"logits/rejected": -2.0683701038360596,
"logps/chosen": -144.15745544433594,
"logps/rejected": -141.385498046875,
"loss": 0.7234,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.34091001749038696,
"rewards/margins": -0.048917919397354126,
"rewards/rejected": -0.2919921278953552,
"step": 71
},
{
"epoch": 0.08,
"learning_rate": 3.6e-05,
"logits/chosen": -2.1100878715515137,
"logits/rejected": -2.112070083618164,
"logps/chosen": -154.8167724609375,
"logps/rejected": -158.28297424316406,
"loss": 0.6744,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.2560737431049347,
"rewards/margins": 0.04427195340394974,
"rewards/rejected": -0.30034565925598145,
"step": 72
},
{
"epoch": 0.08,
"learning_rate": 3.65e-05,
"logits/chosen": -2.129936933517456,
"logits/rejected": -2.2630691528320312,
"logps/chosen": -158.00205993652344,
"logps/rejected": -176.6663055419922,
"loss": 0.6952,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.27465441823005676,
"rewards/margins": 0.013333894312381744,
"rewards/rejected": -0.2879883348941803,
"step": 73
},
{
"epoch": 0.08,
"learning_rate": 3.7e-05,
"logits/chosen": -2.2137179374694824,
"logits/rejected": -2.219494342803955,
"logps/chosen": -175.63868713378906,
"logps/rejected": -165.37460327148438,
"loss": 0.7134,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.30476734042167664,
"rewards/margins": -0.019104812294244766,
"rewards/rejected": -0.28566253185272217,
"step": 74
},
{
"epoch": 0.08,
"learning_rate": 3.7500000000000003e-05,
"logits/chosen": -2.021366596221924,
"logits/rejected": -1.953249216079712,
"logps/chosen": -148.9893341064453,
"logps/rejected": -147.3232421875,
"loss": 0.6865,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2916088402271271,
"rewards/margins": 0.026065416634082794,
"rewards/rejected": -0.31767427921295166,
"step": 75
},
{
"epoch": 0.08,
"learning_rate": 3.8e-05,
"logits/chosen": -2.0788702964782715,
"logits/rejected": -2.081282377243042,
"logps/chosen": -142.72647094726562,
"logps/rejected": -155.7174835205078,
"loss": 0.6944,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.33365076780319214,
"rewards/margins": 0.008806329220533371,
"rewards/rejected": -0.342457115650177,
"step": 76
},
{
"epoch": 0.08,
"learning_rate": 3.85e-05,
"logits/chosen": -2.141986608505249,
"logits/rejected": -2.1325931549072266,
"logps/chosen": -166.86537170410156,
"logps/rejected": -197.6178436279297,
"loss": 0.7052,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.34245288372039795,
"rewards/margins": 0.011411521583795547,
"rewards/rejected": -0.3538644015789032,
"step": 77
},
{
"epoch": 0.08,
"learning_rate": 3.9000000000000006e-05,
"logits/chosen": -2.215611457824707,
"logits/rejected": -2.212402582168579,
"logps/chosen": -161.9552459716797,
"logps/rejected": -160.0537109375,
"loss": 0.6606,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.28822606801986694,
"rewards/margins": 0.07802614569664001,
"rewards/rejected": -0.36625221371650696,
"step": 78
},
{
"epoch": 0.08,
"learning_rate": 3.9500000000000005e-05,
"logits/chosen": -2.1117148399353027,
"logits/rejected": -2.152738094329834,
"logps/chosen": -187.3416290283203,
"logps/rejected": -185.2106170654297,
"loss": 0.7461,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.38387393951416016,
"rewards/margins": -0.09052477777004242,
"rewards/rejected": -0.2933492064476013,
"step": 79
},
{
"epoch": 0.08,
"learning_rate": 4e-05,
"logits/chosen": -2.081353187561035,
"logits/rejected": -2.1454594135284424,
"logps/chosen": -148.2335968017578,
"logps/rejected": -169.42164611816406,
"loss": 0.7009,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.34989604353904724,
"rewards/margins": 0.00529644638299942,
"rewards/rejected": -0.35519248247146606,
"step": 80
},
{
"epoch": 0.08,
"learning_rate": 4.05e-05,
"logits/chosen": -2.1243531703948975,
"logits/rejected": -2.1677510738372803,
"logps/chosen": -164.04400634765625,
"logps/rejected": -185.00840759277344,
"loss": 0.6988,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.3187049627304077,
"rewards/margins": 0.011884737759828568,
"rewards/rejected": -0.3305897116661072,
"step": 81
},
{
"epoch": 0.09,
"learning_rate": 4.1e-05,
"logits/chosen": -2.0324885845184326,
"logits/rejected": -1.9073715209960938,
"logps/chosen": -171.0150604248047,
"logps/rejected": -178.9537811279297,
"loss": 0.6898,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.262698233127594,
"rewards/margins": 0.012961791828274727,
"rewards/rejected": -0.27566003799438477,
"step": 82
},
{
"epoch": 0.09,
"learning_rate": 4.15e-05,
"logits/chosen": -2.1523077487945557,
"logits/rejected": -2.1646199226379395,
"logps/chosen": -164.2784881591797,
"logps/rejected": -170.58087158203125,
"loss": 0.6565,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2823156714439392,
"rewards/margins": 0.08943505585193634,
"rewards/rejected": -0.37175074219703674,
"step": 83
},
{
"epoch": 0.09,
"learning_rate": 4.2e-05,
"logits/chosen": -2.132960081100464,
"logits/rejected": -2.170064926147461,
"logps/chosen": -217.71466064453125,
"logps/rejected": -224.87718200683594,
"loss": 0.6449,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.338445246219635,
"rewards/margins": 0.1073441132903099,
"rewards/rejected": -0.4457893371582031,
"step": 84
},
{
"epoch": 0.09,
"learning_rate": 4.25e-05,
"logits/chosen": -2.263223648071289,
"logits/rejected": -2.3083293437957764,
"logps/chosen": -156.03329467773438,
"logps/rejected": -174.6391143798828,
"loss": 0.6873,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.22993193566799164,
"rewards/margins": 0.018939830362796783,
"rewards/rejected": -0.24887175858020782,
"step": 85
},
{
"epoch": 0.09,
"learning_rate": 4.3e-05,
"logits/chosen": -2.265439510345459,
"logits/rejected": -2.2670176029205322,
"logps/chosen": -166.77760314941406,
"logps/rejected": -166.4705352783203,
"loss": 0.737,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3399352431297302,
"rewards/margins": -0.06634283810853958,
"rewards/rejected": -0.27359241247177124,
"step": 86
},
{
"epoch": 0.09,
"learning_rate": 4.35e-05,
"logits/chosen": -2.210916757583618,
"logits/rejected": -2.2296037673950195,
"logps/chosen": -142.21340942382812,
"logps/rejected": -161.62600708007812,
"loss": 0.6545,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.301052987575531,
"rewards/margins": 0.09594573080539703,
"rewards/rejected": -0.39699873328208923,
"step": 87
},
{
"epoch": 0.09,
"learning_rate": 4.4000000000000006e-05,
"logits/chosen": -2.220825433731079,
"logits/rejected": -2.1372034549713135,
"logps/chosen": -156.62326049804688,
"logps/rejected": -150.1629638671875,
"loss": 0.6818,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2567201852798462,
"rewards/margins": 0.031475357711315155,
"rewards/rejected": -0.28819552063941956,
"step": 88
},
{
"epoch": 0.09,
"learning_rate": 4.4500000000000004e-05,
"logits/chosen": -2.080606698989868,
"logits/rejected": -2.16312575340271,
"logps/chosen": -162.34506225585938,
"logps/rejected": -175.51246643066406,
"loss": 0.7598,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.4455791115760803,
"rewards/margins": -0.09895279258489609,
"rewards/rejected": -0.34662631154060364,
"step": 89
},
{
"epoch": 0.09,
"learning_rate": 4.5e-05,
"logits/chosen": -2.22636342048645,
"logits/rejected": -2.32147479057312,
"logps/chosen": -152.72509765625,
"logps/rejected": -210.8067169189453,
"loss": 0.6536,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2957713007926941,
"rewards/margins": 0.11437603086233139,
"rewards/rejected": -0.4101472795009613,
"step": 90
},
{
"epoch": 0.09,
"learning_rate": 4.55e-05,
"logits/chosen": -2.264246702194214,
"logits/rejected": -2.322878122329712,
"logps/chosen": -116.30667114257812,
"logps/rejected": -136.15834045410156,
"loss": 0.6652,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.21190989017486572,
"rewards/margins": 0.09101668000221252,
"rewards/rejected": -0.30292657017707825,
"step": 91
},
{
"epoch": 0.1,
"learning_rate": 4.600000000000001e-05,
"logits/chosen": -2.2319533824920654,
"logits/rejected": -2.272087335586548,
"logps/chosen": -209.2500762939453,
"logps/rejected": -213.80284118652344,
"loss": 0.7243,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5155326128005981,
"rewards/margins": -0.025821635499596596,
"rewards/rejected": -0.4897109568119049,
"step": 92
},
{
"epoch": 0.1,
"learning_rate": 4.6500000000000005e-05,
"logits/chosen": -2.3007993698120117,
"logits/rejected": -2.233243227005005,
"logps/chosen": -163.1545867919922,
"logps/rejected": -162.89089965820312,
"loss": 0.7095,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.4529023766517639,
"rewards/margins": -0.020960787311196327,
"rewards/rejected": -0.43194156885147095,
"step": 93
},
{
"epoch": 0.1,
"learning_rate": 4.7e-05,
"logits/chosen": -2.0963847637176514,
"logits/rejected": -2.120976209640503,
"logps/chosen": -180.0958709716797,
"logps/rejected": -205.76585388183594,
"loss": 0.6049,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.30844229459762573,
"rewards/margins": 0.21104586124420166,
"rewards/rejected": -0.5194881558418274,
"step": 94
},
{
"epoch": 0.1,
"learning_rate": 4.75e-05,
"logits/chosen": -2.2864556312561035,
"logits/rejected": -2.241337299346924,
"logps/chosen": -202.34237670898438,
"logps/rejected": -200.288330078125,
"loss": 0.6701,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.35101816058158875,
"rewards/margins": 0.08223339170217514,
"rewards/rejected": -0.4332515299320221,
"step": 95
},
{
"epoch": 0.1,
"learning_rate": 4.8e-05,
"logits/chosen": -2.1984472274780273,
"logits/rejected": -2.230916976928711,
"logps/chosen": -176.45132446289062,
"logps/rejected": -189.4639892578125,
"loss": 0.7,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3947378993034363,
"rewards/margins": 0.01168506033718586,
"rewards/rejected": -0.4064229726791382,
"step": 96
},
{
"epoch": 0.1,
"learning_rate": 4.85e-05,
"logits/chosen": -2.1936614513397217,
"logits/rejected": -2.178769588470459,
"logps/chosen": -172.03759765625,
"logps/rejected": -184.67947387695312,
"loss": 0.7077,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5262473821640015,
"rewards/margins": -0.006830459460616112,
"rewards/rejected": -0.5194169282913208,
"step": 97
},
{
"epoch": 0.1,
"learning_rate": 4.9e-05,
"logits/chosen": -2.1688649654388428,
"logits/rejected": -2.1433870792388916,
"logps/chosen": -158.25515747070312,
"logps/rejected": -165.4871368408203,
"loss": 0.7229,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3400518298149109,
"rewards/margins": -0.027561640366911888,
"rewards/rejected": -0.31249016523361206,
"step": 98
},
{
"epoch": 0.1,
"learning_rate": 4.9500000000000004e-05,
"logits/chosen": -2.25595760345459,
"logits/rejected": -2.2448055744171143,
"logps/chosen": -175.17568969726562,
"logps/rejected": -179.33013916015625,
"loss": 0.7008,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.4599221646785736,
"rewards/margins": 0.01116972602903843,
"rewards/rejected": -0.4710919260978699,
"step": 99
},
{
"epoch": 0.1,
"learning_rate": 5e-05,
"logits/chosen": -2.0946810245513916,
"logits/rejected": -2.0803956985473633,
"logps/chosen": -118.56491088867188,
"logps/rejected": -113.53369140625,
"loss": 0.6508,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2681715190410614,
"rewards/margins": 0.11040147393941879,
"rewards/rejected": -0.3785730004310608,
"step": 100
},
{
"epoch": 0.11,
"learning_rate": 4.9999832415172185e-05,
"logits/chosen": -2.1622209548950195,
"logits/rejected": -2.251312732696533,
"logps/chosen": -148.2783966064453,
"logps/rejected": -232.58053588867188,
"loss": 0.6485,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.36020269989967346,
"rewards/margins": 0.13017883896827698,
"rewards/rejected": -0.49038150906562805,
"step": 101
},
{
"epoch": 0.11,
"learning_rate": 4.9999329662935534e-05,
"logits/chosen": -2.1302261352539062,
"logits/rejected": -2.146063804626465,
"logps/chosen": -182.8227081298828,
"logps/rejected": -189.34913635253906,
"loss": 0.6736,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6098363995552063,
"rewards/margins": 0.08228112757205963,
"rewards/rejected": -0.6921175122261047,
"step": 102
},
{
"epoch": 0.11,
"learning_rate": 4.9998491750030315e-05,
"logits/chosen": -2.0695760250091553,
"logits/rejected": -2.151846170425415,
"logps/chosen": -157.34878540039062,
"logps/rejected": -173.14401245117188,
"loss": 0.7008,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5831299424171448,
"rewards/margins": 0.029685884714126587,
"rewards/rejected": -0.612815797328949,
"step": 103
},
{
"epoch": 0.11,
"learning_rate": 4.999731868769027e-05,
"logits/chosen": -2.1843104362487793,
"logits/rejected": -2.107654094696045,
"logps/chosen": -161.08914184570312,
"logps/rejected": -142.64749145507812,
"loss": 0.7995,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.6787440776824951,
"rewards/margins": -0.16208001971244812,
"rewards/rejected": -0.5166640877723694,
"step": 104
},
{
"epoch": 0.11,
"learning_rate": 4.999581049164237e-05,
"logits/chosen": -2.0645735263824463,
"logits/rejected": -2.147791862487793,
"logps/chosen": -149.2325897216797,
"logps/rejected": -173.02069091796875,
"loss": 0.623,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3364582061767578,
"rewards/margins": 0.1654883325099945,
"rewards/rejected": -0.5019465684890747,
"step": 105
},
{
"epoch": 0.11,
"learning_rate": 4.99939671821067e-05,
"logits/chosen": -2.238675832748413,
"logits/rejected": -2.308804988861084,
"logps/chosen": -187.11219787597656,
"logps/rejected": -192.90240478515625,
"loss": 0.6952,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4883981943130493,
"rewards/margins": 0.011134681291878223,
"rewards/rejected": -0.49953290820121765,
"step": 106
},
{
"epoch": 0.11,
"learning_rate": 4.999178878379611e-05,
"logits/chosen": -2.1448211669921875,
"logits/rejected": -2.163353681564331,
"logps/chosen": -151.56005859375,
"logps/rejected": -149.57290649414062,
"loss": 0.7033,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5375577807426453,
"rewards/margins": 0.005782928317785263,
"rewards/rejected": -0.5433407425880432,
"step": 107
},
{
"epoch": 0.11,
"learning_rate": 4.998927532591592e-05,
"logits/chosen": -2.1543734073638916,
"logits/rejected": -2.1483535766601562,
"logps/chosen": -168.912841796875,
"logps/rejected": -163.83612060546875,
"loss": 0.7009,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.40299952030181885,
"rewards/margins": 0.021744927391409874,
"rewards/rejected": -0.42474448680877686,
"step": 108
},
{
"epoch": 0.11,
"learning_rate": 4.9986426842163515e-05,
"logits/chosen": -2.196964979171753,
"logits/rejected": -2.1282408237457275,
"logps/chosen": -148.19366455078125,
"logps/rejected": -141.83212280273438,
"loss": 0.622,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.36278650164604187,
"rewards/margins": 0.18898257613182068,
"rewards/rejected": -0.5517690777778625,
"step": 109
},
{
"epoch": 0.11,
"learning_rate": 4.9983243370727914e-05,
"logits/chosen": -2.136972665786743,
"logits/rejected": -2.154928684234619,
"logps/chosen": -146.0774383544922,
"logps/rejected": -132.6929473876953,
"loss": 0.6987,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5064787268638611,
"rewards/margins": 0.05323922634124756,
"rewards/rejected": -0.5597178936004639,
"step": 110
},
{
"epoch": 0.12,
"learning_rate": 4.9979724954289244e-05,
"logits/chosen": -2.1273446083068848,
"logits/rejected": -2.1698343753814697,
"logps/chosen": -140.70408630371094,
"logps/rejected": -164.12890625,
"loss": 0.5856,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.27825814485549927,
"rewards/margins": 0.30408480763435364,
"rewards/rejected": -0.5823429822921753,
"step": 111
},
{
"epoch": 0.12,
"learning_rate": 4.9975871640018154e-05,
"logits/chosen": -2.156425952911377,
"logits/rejected": -2.149127244949341,
"logps/chosen": -209.48033142089844,
"logps/rejected": -183.49618530273438,
"loss": 0.6814,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3876263499259949,
"rewards/margins": 0.07094159722328186,
"rewards/rejected": -0.45856791734695435,
"step": 112
},
{
"epoch": 0.12,
"learning_rate": 4.99716834795752e-05,
"logits/chosen": -2.195077896118164,
"logits/rejected": -2.171152353286743,
"logps/chosen": -137.59030151367188,
"logps/rejected": -144.78488159179688,
"loss": 0.6117,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.201141357421875,
"rewards/margins": 0.24156084656715393,
"rewards/rejected": -0.4427022337913513,
"step": 113
},
{
"epoch": 0.12,
"learning_rate": 4.996716052911017e-05,
"logits/chosen": -2.0847880840301514,
"logits/rejected": -2.141540050506592,
"logps/chosen": -197.46426391601562,
"logps/rejected": -199.35997009277344,
"loss": 0.6728,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4001588821411133,
"rewards/margins": 0.0924140065908432,
"rewards/rejected": -0.4925729036331177,
"step": 114
},
{
"epoch": 0.12,
"learning_rate": 4.996230284926128e-05,
"logits/chosen": -1.791740894317627,
"logits/rejected": -1.7912871837615967,
"logps/chosen": -181.44020080566406,
"logps/rejected": -201.87872314453125,
"loss": 0.6437,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.40906965732574463,
"rewards/margins": 0.15568459033966064,
"rewards/rejected": -0.5647542476654053,
"step": 115
},
{
"epoch": 0.12,
"learning_rate": 4.99571105051544e-05,
"logits/chosen": -2.1246492862701416,
"logits/rejected": -2.122555732727051,
"logps/chosen": -175.01571655273438,
"logps/rejected": -176.2000732421875,
"loss": 0.7431,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.36241090297698975,
"rewards/margins": -0.04763410612940788,
"rewards/rejected": -0.314776748418808,
"step": 116
},
{
"epoch": 0.12,
"learning_rate": 4.99515835664022e-05,
"logits/chosen": -2.066060781478882,
"logits/rejected": -2.0761799812316895,
"logps/chosen": -135.93301391601562,
"logps/rejected": -162.29739379882812,
"loss": 0.8265,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.39111948013305664,
"rewards/margins": -0.14791490137577057,
"rewards/rejected": -0.2432045191526413,
"step": 117
},
{
"epoch": 0.12,
"learning_rate": 4.994572210710315e-05,
"logits/chosen": -2.0879368782043457,
"logits/rejected": -2.1556289196014404,
"logps/chosen": -197.7449951171875,
"logps/rejected": -203.45278930664062,
"loss": 0.6753,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2648780345916748,
"rewards/margins": 0.068690724670887,
"rewards/rejected": -0.3335687518119812,
"step": 118
},
{
"epoch": 0.12,
"learning_rate": 4.993952620584058e-05,
"logits/chosen": -2.2932987213134766,
"logits/rejected": -2.330873489379883,
"logps/chosen": -139.2769775390625,
"logps/rejected": -152.76504516601562,
"loss": 0.6608,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.23606640100479126,
"rewards/margins": 0.08653931319713593,
"rewards/rejected": -0.3226057291030884,
"step": 119
},
{
"epoch": 0.13,
"learning_rate": 4.993299594568163e-05,
"logits/chosen": -2.1717681884765625,
"logits/rejected": -2.176131010055542,
"logps/chosen": -171.66172790527344,
"logps/rejected": -188.26991271972656,
"loss": 0.6917,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.43707746267318726,
"rewards/margins": 0.05936805531382561,
"rewards/rejected": -0.49644553661346436,
"step": 120
},
{
"epoch": 0.13,
"learning_rate": 4.992613141417608e-05,
"logits/chosen": -2.1105880737304688,
"logits/rejected": -2.0103209018707275,
"logps/chosen": -155.08033752441406,
"logps/rejected": -141.55419921875,
"loss": 0.5905,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2041604220867157,
"rewards/margins": 0.32256096601486206,
"rewards/rejected": -0.5267213582992554,
"step": 121
},
{
"epoch": 0.13,
"learning_rate": 4.9918932703355256e-05,
"logits/chosen": -2.1467816829681396,
"logits/rejected": -2.115278720855713,
"logps/chosen": -152.28237915039062,
"logps/rejected": -133.01211547851562,
"loss": 0.7446,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.44335490465164185,
"rewards/margins": -0.022354908287525177,
"rewards/rejected": -0.42100000381469727,
"step": 122
},
{
"epoch": 0.13,
"learning_rate": 4.9911399909730714e-05,
"logits/chosen": -2.318913221359253,
"logits/rejected": -2.275251865386963,
"logps/chosen": -160.89625549316406,
"logps/rejected": -159.76943969726562,
"loss": 0.6249,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.21618026494979858,
"rewards/margins": 0.20579932630062103,
"rewards/rejected": -0.42197954654693604,
"step": 123
},
{
"epoch": 0.13,
"learning_rate": 4.990353313429303e-05,
"logits/chosen": -2.1120331287384033,
"logits/rejected": -2.0410468578338623,
"logps/chosen": -183.7588653564453,
"logps/rejected": -222.8588104248047,
"loss": 0.684,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4172906279563904,
"rewards/margins": 0.05181103199720383,
"rewards/rejected": -0.4691016674041748,
"step": 124
},
{
"epoch": 0.13,
"learning_rate": 4.989533248251037e-05,
"logits/chosen": -2.0413904190063477,
"logits/rejected": -2.050387382507324,
"logps/chosen": -221.38436889648438,
"logps/rejected": -220.9242401123047,
"loss": 0.5889,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.18506890535354614,
"rewards/margins": 0.2643589675426483,
"rewards/rejected": -0.44942787289619446,
"step": 125
},
{
"epoch": 0.13,
"learning_rate": 4.988679806432712e-05,
"logits/chosen": -2.119227170944214,
"logits/rejected": -2.140362024307251,
"logps/chosen": -164.545166015625,
"logps/rejected": -171.43540954589844,
"loss": 0.727,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.41472572088241577,
"rewards/margins": -0.024414831772446632,
"rewards/rejected": -0.3903109133243561,
"step": 126
},
{
"epoch": 0.13,
"learning_rate": 4.98779299941624e-05,
"logits/chosen": -2.008235454559326,
"logits/rejected": -2.0146234035491943,
"logps/chosen": -182.02488708496094,
"logps/rejected": -173.46522521972656,
"loss": 0.7665,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.39273571968078613,
"rewards/margins": -0.07521196454763412,
"rewards/rejected": -0.3175237476825714,
"step": 127
},
{
"epoch": 0.13,
"learning_rate": 4.9868728390908526e-05,
"logits/chosen": -2.194214105606079,
"logits/rejected": -2.183523654937744,
"logps/chosen": -144.79937744140625,
"logps/rejected": -145.96029663085938,
"loss": 0.71,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.15473532676696777,
"rewards/margins": 0.0015310226008296013,
"rewards/rejected": -0.15626637637615204,
"step": 128
},
{
"epoch": 0.13,
"learning_rate": 4.985919337792944e-05,
"logits/chosen": -2.0170881748199463,
"logits/rejected": -2.021702527999878,
"logps/chosen": -157.9308624267578,
"logps/rejected": -187.35256958007812,
"loss": 0.682,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.23143672943115234,
"rewards/margins": 0.07466793060302734,
"rewards/rejected": -0.3061046600341797,
"step": 129
},
{
"epoch": 0.14,
"learning_rate": 4.9849325083059e-05,
"logits/chosen": -2.345395088195801,
"logits/rejected": -2.294443130493164,
"logps/chosen": -202.0467987060547,
"logps/rejected": -186.38247680664062,
"loss": 0.692,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.19066600501537323,
"rewards/margins": 0.07181324809789658,
"rewards/rejected": -0.2624792754650116,
"step": 130
},
{
"epoch": 0.14,
"learning_rate": 4.983912363859935e-05,
"logits/chosen": -2.14298152923584,
"logits/rejected": -2.1677587032318115,
"logps/chosen": -125.33621215820312,
"logps/rejected": -137.17404174804688,
"loss": 0.6617,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.23150649666786194,
"rewards/margins": 0.10384988784790039,
"rewards/rejected": -0.33535638451576233,
"step": 131
},
{
"epoch": 0.14,
"learning_rate": 4.982858918131906e-05,
"logits/chosen": -2.190133810043335,
"logits/rejected": -2.247300863265991,
"logps/chosen": -167.1696014404297,
"logps/rejected": -168.60281372070312,
"loss": 0.7058,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.43472814559936523,
"rewards/margins": 0.03844255581498146,
"rewards/rejected": -0.473170667886734,
"step": 132
},
{
"epoch": 0.14,
"learning_rate": 4.981772185245135e-05,
"logits/chosen": -2.154466152191162,
"logits/rejected": -2.1573760509490967,
"logps/chosen": -168.56600952148438,
"logps/rejected": -188.00607299804688,
"loss": 0.6659,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.3921719491481781,
"rewards/margins": 0.10123846679925919,
"rewards/rejected": -0.4934104084968567,
"step": 133
},
{
"epoch": 0.14,
"learning_rate": 4.980652179769218e-05,
"logits/chosen": -2.139244318008423,
"logits/rejected": -2.170041561126709,
"logps/chosen": -194.27749633789062,
"logps/rejected": -194.87745666503906,
"loss": 0.6939,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.19869127869606018,
"rewards/margins": 0.15997806191444397,
"rewards/rejected": -0.3586694002151489,
"step": 134
},
{
"epoch": 0.14,
"learning_rate": 4.979498916719828e-05,
"logits/chosen": -2.0148520469665527,
"logits/rejected": -2.0170676708221436,
"logps/chosen": -178.43853759765625,
"logps/rejected": -177.73255920410156,
"loss": 0.6163,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.15721122920513153,
"rewards/margins": 0.2595069408416748,
"rewards/rejected": -0.41671818494796753,
"step": 135
},
{
"epoch": 0.14,
"learning_rate": 4.978312411558518e-05,
"logits/chosen": -2.2386107444763184,
"logits/rejected": -2.2416539192199707,
"logps/chosen": -153.1583251953125,
"logps/rejected": -165.6448516845703,
"loss": 0.679,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.48863306641578674,
"rewards/margins": 0.107185497879982,
"rewards/rejected": -0.5958185195922852,
"step": 136
},
{
"epoch": 0.14,
"learning_rate": 4.977092680192507e-05,
"logits/chosen": -1.9784085750579834,
"logits/rejected": -1.9997234344482422,
"logps/chosen": -148.26358032226562,
"logps/rejected": -131.2608642578125,
"loss": 0.7235,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.3870002031326294,
"rewards/margins": -0.03670747950673103,
"rewards/rejected": -0.35029271245002747,
"step": 137
},
{
"epoch": 0.14,
"learning_rate": 4.9758397389744734e-05,
"logits/chosen": -2.2231109142303467,
"logits/rejected": -2.1324055194854736,
"logps/chosen": -175.7447052001953,
"logps/rejected": -165.63980102539062,
"loss": 0.6117,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.3166625499725342,
"rewards/margins": 0.23041030764579773,
"rewards/rejected": -0.5470728278160095,
"step": 138
},
{
"epoch": 0.14,
"learning_rate": 4.9745536047023324e-05,
"logits/chosen": -2.1007936000823975,
"logits/rejected": -2.2264654636383057,
"logps/chosen": -205.53456115722656,
"logps/rejected": -180.32980346679688,
"loss": 0.6427,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.45254045724868774,
"rewards/margins": 0.15192103385925293,
"rewards/rejected": -0.6044614911079407,
"step": 139
},
{
"epoch": 0.15,
"learning_rate": 4.973234294619011e-05,
"logits/chosen": -1.936387062072754,
"logits/rejected": -2.0128026008605957,
"logps/chosen": -145.3013916015625,
"logps/rejected": -161.27244567871094,
"loss": 0.6728,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3800917863845825,
"rewards/margins": 0.11550942063331604,
"rewards/rejected": -0.49560117721557617,
"step": 140
},
{
"epoch": 0.15,
"learning_rate": 4.971881826412218e-05,
"logits/chosen": -2.154330253601074,
"logits/rejected": -2.237229585647583,
"logps/chosen": -156.77369689941406,
"logps/rejected": -176.82176208496094,
"loss": 0.6794,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.36407431960105896,
"rewards/margins": 0.1079927533864975,
"rewards/rejected": -0.47206708788871765,
"step": 141
},
{
"epoch": 0.15,
"learning_rate": 4.9704962182142044e-05,
"logits/chosen": -2.1118252277374268,
"logits/rejected": -2.134640693664551,
"logps/chosen": -166.844482421875,
"logps/rejected": -167.47900390625,
"loss": 0.6575,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4557605981826782,
"rewards/margins": 0.10824623703956604,
"rewards/rejected": -0.5640068054199219,
"step": 142
},
{
"epoch": 0.15,
"learning_rate": 4.9690774886015244e-05,
"logits/chosen": -2.1018967628479004,
"logits/rejected": -2.1379857063293457,
"logps/chosen": -176.53829956054688,
"logps/rejected": -194.2880859375,
"loss": 0.6878,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.4790266156196594,
"rewards/margins": 0.10418644547462463,
"rewards/rejected": -0.5832130312919617,
"step": 143
},
{
"epoch": 0.15,
"learning_rate": 4.967625656594782e-05,
"logits/chosen": -2.0278987884521484,
"logits/rejected": -2.0434699058532715,
"logps/chosen": -145.874267578125,
"logps/rejected": -151.4019317626953,
"loss": 0.7109,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.38604238629341125,
"rewards/margins": 0.0024005472660064697,
"rewards/rejected": -0.3884429931640625,
"step": 144
},
{
"epoch": 0.15,
"learning_rate": 4.966140741658379e-05,
"logits/chosen": -2.128117084503174,
"logits/rejected": -2.169542074203491,
"logps/chosen": -166.9932098388672,
"logps/rejected": -166.7843475341797,
"loss": 0.699,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.42920929193496704,
"rewards/margins": 0.09982432425022125,
"rewards/rejected": -0.5290336608886719,
"step": 145
},
{
"epoch": 0.15,
"learning_rate": 4.9646227637002515e-05,
"logits/chosen": -2.2982516288757324,
"logits/rejected": -2.3041136264801025,
"logps/chosen": -174.27261352539062,
"logps/rejected": -181.32333374023438,
"loss": 0.7344,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5106673836708069,
"rewards/margins": 0.0017841942608356476,
"rewards/rejected": -0.5124515891075134,
"step": 146
},
{
"epoch": 0.15,
"learning_rate": 4.963071743071607e-05,
"logits/chosen": -2.1971347332000732,
"logits/rejected": -2.252727508544922,
"logps/chosen": -164.0286102294922,
"logps/rejected": -174.8863525390625,
"loss": 0.7943,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5613827705383301,
"rewards/margins": -0.05153023824095726,
"rewards/rejected": -0.5098525285720825,
"step": 147
},
{
"epoch": 0.15,
"learning_rate": 4.961487700566646e-05,
"logits/chosen": -2.056870222091675,
"logits/rejected": -2.05178165435791,
"logps/chosen": -143.26681518554688,
"logps/rejected": -183.6707000732422,
"loss": 0.744,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.3036291301250458,
"rewards/margins": 0.006056658923625946,
"rewards/rejected": -0.3096857964992523,
"step": 148
},
{
"epoch": 0.16,
"learning_rate": 4.9598706574222886e-05,
"logits/chosen": -2.159867286682129,
"logits/rejected": -2.2232213020324707,
"logps/chosen": -161.9599609375,
"logps/rejected": -184.70950317382812,
"loss": 0.7647,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.32318681478500366,
"rewards/margins": -0.07305724918842316,
"rewards/rejected": -0.2501295506954193,
"step": 149
},
{
"epoch": 0.16,
"learning_rate": 4.958220635317886e-05,
"logits/chosen": -2.0702919960021973,
"logits/rejected": -2.1789698600769043,
"logps/chosen": -159.52511596679688,
"logps/rejected": -187.4576416015625,
"loss": 0.6574,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.264212042093277,
"rewards/margins": 0.14712117612361908,
"rewards/rejected": -0.41133320331573486,
"step": 150
},
{
"epoch": 0.16,
"learning_rate": 4.956537656374933e-05,
"logits/chosen": -2.1290884017944336,
"logits/rejected": -2.1409378051757812,
"logps/chosen": -156.96994018554688,
"logps/rejected": -166.66525268554688,
"loss": 0.7229,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.4065425992012024,
"rewards/margins": 0.03063153848052025,
"rewards/rejected": -0.43717408180236816,
"step": 151
},
{
"epoch": 0.16,
"learning_rate": 4.9548217431567665e-05,
"logits/chosen": -2.1941640377044678,
"logits/rejected": -2.2345049381256104,
"logps/chosen": -141.96652221679688,
"logps/rejected": -145.7759552001953,
"loss": 0.6756,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.2183290421962738,
"rewards/margins": 0.11311781406402588,
"rewards/rejected": -0.3314468264579773,
"step": 152
},
{
"epoch": 0.16,
"learning_rate": 4.95307291866827e-05,
"logits/chosen": -2.1858081817626953,
"logits/rejected": -2.1450536251068115,
"logps/chosen": -155.87149047851562,
"logps/rejected": -157.30197143554688,
"loss": 0.6938,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.34080612659454346,
"rewards/margins": 0.06700116395950317,
"rewards/rejected": -0.40780726075172424,
"step": 153
},
{
"epoch": 0.16,
"learning_rate": 4.95129120635556e-05,
"logits/chosen": -2.240177869796753,
"logits/rejected": -2.171326160430908,
"logps/chosen": -165.3470458984375,
"logps/rejected": -152.51541137695312,
"loss": 0.7462,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.3122984766960144,
"rewards/margins": -0.0709066390991211,
"rewards/rejected": -0.24139180779457092,
"step": 154
},
{
"epoch": 0.16,
"learning_rate": 4.949476630105669e-05,
"logits/chosen": -2.2033369541168213,
"logits/rejected": -2.219047784805298,
"logps/chosen": -201.08102416992188,
"logps/rejected": -198.62356567382812,
"loss": 0.6226,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3462004065513611,
"rewards/margins": 0.1935741901397705,
"rewards/rejected": -0.5397745966911316,
"step": 155
},
{
"epoch": 0.16,
"learning_rate": 4.9476292142462374e-05,
"logits/chosen": -2.0294063091278076,
"logits/rejected": -2.0089852809906006,
"logps/chosen": -141.88330078125,
"logps/rejected": -145.25686645507812,
"loss": 0.6791,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.2922942638397217,
"rewards/margins": 0.08209052681922913,
"rewards/rejected": -0.3743847906589508,
"step": 156
},
{
"epoch": 0.16,
"learning_rate": 4.945748983545172e-05,
"logits/chosen": -2.1608543395996094,
"logits/rejected": -2.099316358566284,
"logps/chosen": -140.04598999023438,
"logps/rejected": -135.99771118164062,
"loss": 0.5853,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.14180442690849304,
"rewards/margins": 0.3191547393798828,
"rewards/rejected": -0.46095913648605347,
"step": 157
},
{
"epoch": 0.16,
"learning_rate": 4.943835963210324e-05,
"logits/chosen": -2.23140811920166,
"logits/rejected": -2.160383701324463,
"logps/chosen": -176.86737060546875,
"logps/rejected": -173.28509521484375,
"loss": 0.6432,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.16037102043628693,
"rewards/margins": 0.1384631097316742,
"rewards/rejected": -0.2988341450691223,
"step": 158
},
{
"epoch": 0.17,
"learning_rate": 4.941890178889149e-05,
"logits/chosen": -2.226886034011841,
"logits/rejected": -2.2156994342803955,
"logps/chosen": -151.04501342773438,
"logps/rejected": -152.1208953857422,
"loss": 0.7934,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.4307902157306671,
"rewards/margins": -0.15255290269851685,
"rewards/rejected": -0.2782372236251831,
"step": 159
},
{
"epoch": 0.17,
"learning_rate": 4.939911656668361e-05,
"logits/chosen": -2.1915454864501953,
"logits/rejected": -2.1480343341827393,
"logps/chosen": -127.00537109375,
"logps/rejected": -118.40946960449219,
"loss": 0.6693,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.05850663036108017,
"rewards/margins": 0.07123789936304092,
"rewards/rejected": -0.1297445297241211,
"step": 160
},
{
"epoch": 0.17,
"learning_rate": 4.937900423073585e-05,
"logits/chosen": -2.2268741130828857,
"logits/rejected": -2.223992347717285,
"logps/chosen": -180.8136749267578,
"logps/rejected": -194.1834259033203,
"loss": 0.7348,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.2488463819026947,
"rewards/margins": 0.01856984756886959,
"rewards/rejected": -0.26741623878479004,
"step": 161
},
{
"epoch": 0.17,
"learning_rate": 4.9358565050689985e-05,
"logits/chosen": -2.163513660430908,
"logits/rejected": -2.1871607303619385,
"logps/chosen": -196.00209045410156,
"logps/rejected": -197.71170043945312,
"loss": 0.6894,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3192088305950165,
"rewards/margins": 0.049343544989824295,
"rewards/rejected": -0.36855238676071167,
"step": 162
},
{
"epoch": 0.17,
"learning_rate": 4.933779930056975e-05,
"logits/chosen": -2.1793665885925293,
"logits/rejected": -2.140435218811035,
"logps/chosen": -141.68283081054688,
"logps/rejected": -157.8527069091797,
"loss": 0.8341,
"rewards/accuracies": 0.1875,
"rewards/chosen": -0.5028910636901855,
"rewards/margins": -0.241933673620224,
"rewards/rejected": -0.26095739006996155,
"step": 163
},
{
"epoch": 0.17,
"learning_rate": 4.93167072587771e-05,
"logits/chosen": -2.1162264347076416,
"logits/rejected": -2.159403085708618,
"logps/chosen": -167.28477478027344,
"logps/rejected": -159.3861083984375,
"loss": 0.6934,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.32176950573921204,
"rewards/margins": 0.0923306867480278,
"rewards/rejected": -0.41410019993782043,
"step": 164
},
{
"epoch": 0.17,
"learning_rate": 4.929528920808854e-05,
"logits/chosen": -2.1691784858703613,
"logits/rejected": -2.151355266571045,
"logps/chosen": -210.38043212890625,
"logps/rejected": -182.91961669921875,
"loss": 0.5909,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3064863383769989,
"rewards/margins": 0.28497347235679626,
"rewards/rejected": -0.5914597511291504,
"step": 165
},
{
"epoch": 0.17,
"learning_rate": 4.92735454356513e-05,
"logits/chosen": -2.105543613433838,
"logits/rejected": -2.123973846435547,
"logps/chosen": -136.9002685546875,
"logps/rejected": -134.88809204101562,
"loss": 0.7425,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.40990638732910156,
"rewards/margins": -0.044356442987918854,
"rewards/rejected": -0.3655499517917633,
"step": 166
},
{
"epoch": 0.17,
"learning_rate": 4.925147623297949e-05,
"logits/chosen": -2.324575901031494,
"logits/rejected": -2.318359613418579,
"logps/chosen": -196.5352020263672,
"logps/rejected": -168.95431518554688,
"loss": 0.8018,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.35998621582984924,
"rewards/margins": -0.16241417825222015,
"rewards/rejected": -0.1975720375776291,
"step": 167
},
{
"epoch": 0.18,
"learning_rate": 4.922908189595018e-05,
"logits/chosen": -2.0524559020996094,
"logits/rejected": -2.0096793174743652,
"logps/chosen": -154.45022583007812,
"logps/rejected": -153.45050048828125,
"loss": 0.6433,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.21955004334449768,
"rewards/margins": 0.17729638516902924,
"rewards/rejected": -0.3968464732170105,
"step": 168
},
{
"epoch": 0.18,
"learning_rate": 4.920636272479946e-05,
"logits/chosen": -2.305999755859375,
"logits/rejected": -2.299030065536499,
"logps/chosen": -164.8284149169922,
"logps/rejected": -151.80828857421875,
"loss": 0.6648,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.31743210554122925,
"rewards/margins": 0.11833076179027557,
"rewards/rejected": -0.43576285243034363,
"step": 169
},
{
"epoch": 0.18,
"learning_rate": 4.9183319024118415e-05,
"logits/chosen": -2.1119515895843506,
"logits/rejected": -2.1947803497314453,
"logps/chosen": -135.42007446289062,
"logps/rejected": -142.30645751953125,
"loss": 0.7242,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.29650259017944336,
"rewards/margins": -0.007799305021762848,
"rewards/rejected": -0.2887033224105835,
"step": 170
},
{
"epoch": 0.18,
"learning_rate": 4.915995110284901e-05,
"logits/chosen": -2.1119866371154785,
"logits/rejected": -2.133423328399658,
"logps/chosen": -180.34866333007812,
"logps/rejected": -194.75259399414062,
"loss": 0.712,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3628592789173126,
"rewards/margins": 0.04306137561798096,
"rewards/rejected": -0.4059206247329712,
"step": 171
},
{
"epoch": 0.18,
"learning_rate": 4.9136259274279955e-05,
"logits/chosen": -2.2553658485412598,
"logits/rejected": -2.261441707611084,
"logps/chosen": -146.23861694335938,
"logps/rejected": -143.55841064453125,
"loss": 0.7566,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.4861433506011963,
"rewards/margins": -0.029232196509838104,
"rewards/rejected": -0.4569111168384552,
"step": 172
},
{
"epoch": 0.18,
"learning_rate": 4.911224385604255e-05,
"logits/chosen": -2.3380446434020996,
"logits/rejected": -2.278407335281372,
"logps/chosen": -160.54949951171875,
"logps/rejected": -151.08018493652344,
"loss": 0.8986,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.5705781579017639,
"rewards/margins": -0.310020387172699,
"rewards/rejected": -0.26055777072906494,
"step": 173
},
{
"epoch": 0.18,
"learning_rate": 4.908790517010636e-05,
"logits/chosen": -2.298267364501953,
"logits/rejected": -2.317840814590454,
"logps/chosen": -148.01202392578125,
"logps/rejected": -157.5655517578125,
"loss": 0.5926,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.20174311101436615,
"rewards/margins": 0.3064672350883484,
"rewards/rejected": -0.5082104206085205,
"step": 174
},
{
"epoch": 0.18,
"learning_rate": 4.906324354277495e-05,
"logits/chosen": -2.31154727935791,
"logits/rejected": -2.3395073413848877,
"logps/chosen": -208.95376586914062,
"logps/rejected": -197.12318420410156,
"loss": 0.8049,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.4388049244880676,
"rewards/margins": -0.17647361755371094,
"rewards/rejected": -0.2623312771320343,
"step": 175
},
{
"epoch": 0.18,
"learning_rate": 4.903825930468149e-05,
"logits/chosen": -2.3014442920684814,
"logits/rejected": -2.2676706314086914,
"logps/chosen": -176.29176330566406,
"logps/rejected": -158.9712371826172,
"loss": 0.6158,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.17731614410877228,
"rewards/margins": 0.219602569937706,
"rewards/rejected": -0.3969186842441559,
"step": 176
},
{
"epoch": 0.18,
"learning_rate": 4.901295279078431e-05,
"logits/chosen": -2.3018527030944824,
"logits/rejected": -2.250325918197632,
"logps/chosen": -198.3526611328125,
"logps/rejected": -224.3977508544922,
"loss": 0.749,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.6051126718521118,
"rewards/margins": -0.05212073400616646,
"rewards/rejected": -0.5529919862747192,
"step": 177
},
{
"epoch": 0.19,
"learning_rate": 4.898732434036244e-05,
"logits/chosen": -2.299297571182251,
"logits/rejected": -2.3030190467834473,
"logps/chosen": -167.0242156982422,
"logps/rejected": -163.21133422851562,
"loss": 0.818,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.4616413414478302,
"rewards/margins": -0.1739044189453125,
"rewards/rejected": -0.2877369225025177,
"step": 178
},
{
"epoch": 0.19,
"learning_rate": 4.896137429701102e-05,
"logits/chosen": -2.186522960662842,
"logits/rejected": -2.127988338470459,
"logps/chosen": -173.8786163330078,
"logps/rejected": -175.3472137451172,
"loss": 0.6219,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.33478206396102905,
"rewards/margins": 0.27421796321868896,
"rewards/rejected": -0.609000027179718,
"step": 179
},
{
"epoch": 0.19,
"learning_rate": 4.893510300863676e-05,
"logits/chosen": -2.1868398189544678,
"logits/rejected": -2.2052829265594482,
"logps/chosen": -226.92471313476562,
"logps/rejected": -219.23912048339844,
"loss": 0.7715,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5434471964836121,
"rewards/margins": -0.10749813914299011,
"rewards/rejected": -0.43594905734062195,
"step": 180
},
{
"epoch": 0.19,
"learning_rate": 4.890851082745319e-05,
"logits/chosen": -2.2493791580200195,
"logits/rejected": -2.300419330596924,
"logps/chosen": -181.13748168945312,
"logps/rejected": -190.27645874023438,
"loss": 0.6568,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.10172881931066513,
"rewards/margins": 0.09888561069965363,
"rewards/rejected": -0.20061442255973816,
"step": 181
},
{
"epoch": 0.19,
"learning_rate": 4.8881598109976004e-05,
"logits/chosen": -2.3254058361053467,
"logits/rejected": -2.3100500106811523,
"logps/chosen": -208.1369171142578,
"logps/rejected": -213.24652099609375,
"loss": 0.7061,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5665594339370728,
"rewards/margins": 0.10720066726207733,
"rewards/rejected": -0.673760175704956,
"step": 182
},
{
"epoch": 0.19,
"learning_rate": 4.885436521701824e-05,
"logits/chosen": -2.3967294692993164,
"logits/rejected": -2.4122745990753174,
"logps/chosen": -123.20140838623047,
"logps/rejected": -133.7183837890625,
"loss": 0.6325,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3578134775161743,
"rewards/margins": 0.17870007455348969,
"rewards/rejected": -0.5365135669708252,
"step": 183
},
{
"epoch": 0.19,
"learning_rate": 4.8826812513685487e-05,
"logits/chosen": -2.327406167984009,
"logits/rejected": -2.3479795455932617,
"logps/chosen": -169.1548614501953,
"logps/rejected": -181.2438201904297,
"loss": 0.6755,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.5492762327194214,
"rewards/margins": 0.12284587323665619,
"rewards/rejected": -0.672122061252594,
"step": 184
},
{
"epoch": 0.19,
"learning_rate": 4.8798940369370944e-05,
"logits/chosen": -2.2139463424682617,
"logits/rejected": -2.182992935180664,
"logps/chosen": -169.96737670898438,
"logps/rejected": -162.341552734375,
"loss": 0.7834,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5437475442886353,
"rewards/margins": -0.11101134121417999,
"rewards/rejected": -0.43273624777793884,
"step": 185
},
{
"epoch": 0.19,
"learning_rate": 4.877074915775049e-05,
"logits/chosen": -2.370654582977295,
"logits/rejected": -2.2954137325286865,
"logps/chosen": -197.25172424316406,
"logps/rejected": -176.16543579101562,
"loss": 0.7289,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.43914932012557983,
"rewards/margins": -0.006785091012716293,
"rewards/rejected": -0.43236425518989563,
"step": 186
},
{
"epoch": 0.2,
"learning_rate": 4.8742239256777674e-05,
"logits/chosen": -2.179440975189209,
"logits/rejected": -2.1714303493499756,
"logps/chosen": -151.60626220703125,
"logps/rejected": -179.89920043945312,
"loss": 0.7239,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5735316872596741,
"rewards/margins": 0.02325405180454254,
"rewards/rejected": -0.5967857241630554,
"step": 187
},
{
"epoch": 0.2,
"learning_rate": 4.8713411048678635e-05,
"logits/chosen": -2.132361650466919,
"logits/rejected": -1.9567621946334839,
"logps/chosen": -185.78890991210938,
"logps/rejected": -146.61630249023438,
"loss": 0.7341,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.5060631632804871,
"rewards/margins": -0.03276711702346802,
"rewards/rejected": -0.47329598665237427,
"step": 188
},
{
"epoch": 0.2,
"learning_rate": 4.868426491994702e-05,
"logits/chosen": -2.0946998596191406,
"logits/rejected": -2.0924675464630127,
"logps/chosen": -182.0203857421875,
"logps/rejected": -189.61448669433594,
"loss": 0.6746,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5287783741950989,
"rewards/margins": 0.13924337923526764,
"rewards/rejected": -0.6680217981338501,
"step": 189
},
{
"epoch": 0.2,
"learning_rate": 4.865480126133872e-05,
"logits/chosen": -2.1674387454986572,
"logits/rejected": -2.1639039516448975,
"logps/chosen": -215.89862060546875,
"logps/rejected": -194.6399383544922,
"loss": 0.6866,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.4448810815811157,
"rewards/margins": 0.05918551981449127,
"rewards/rejected": -0.5040666460990906,
"step": 190
},
{
"epoch": 0.2,
"learning_rate": 4.862502046786671e-05,
"logits/chosen": -2.1711835861206055,
"logits/rejected": -2.3064002990722656,
"logps/chosen": -173.3459014892578,
"logps/rejected": -198.23008728027344,
"loss": 0.6805,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.4160701632499695,
"rewards/margins": 0.11089640110731125,
"rewards/rejected": -0.5269665122032166,
"step": 191
},
{
"epoch": 0.2,
"learning_rate": 4.859492293879574e-05,
"logits/chosen": -2.2058756351470947,
"logits/rejected": -2.178715944290161,
"logps/chosen": -227.00070190429688,
"logps/rejected": -238.19232177734375,
"loss": 0.6392,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3010501563549042,
"rewards/margins": 0.1993046998977661,
"rewards/rejected": -0.5003548264503479,
"step": 192
},
{
"epoch": 0.2,
"learning_rate": 4.856450907763693e-05,
"logits/chosen": -2.2074475288391113,
"logits/rejected": -2.1603050231933594,
"logps/chosen": -149.5859832763672,
"logps/rejected": -149.609130859375,
"loss": 0.777,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.4570000171661377,
"rewards/margins": -0.09236741065979004,
"rewards/rejected": -0.36463260650634766,
"step": 193
},
{
"epoch": 0.2,
"learning_rate": 4.853377929214243e-05,
"logits/chosen": -2.123965263366699,
"logits/rejected": -2.116884469985962,
"logps/chosen": -172.16937255859375,
"logps/rejected": -180.29434204101562,
"loss": 0.6691,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4884795546531677,
"rewards/margins": 0.12548628449440002,
"rewards/rejected": -0.6139658689498901,
"step": 194
},
{
"epoch": 0.2,
"learning_rate": 4.85027339942999e-05,
"logits/chosen": -2.302314281463623,
"logits/rejected": -2.272852659225464,
"logps/chosen": -220.17941284179688,
"logps/rejected": -226.3072509765625,
"loss": 0.6509,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4287835657596588,
"rewards/margins": 0.1545097827911377,
"rewards/rejected": -0.5832933783531189,
"step": 195
},
{
"epoch": 0.2,
"learning_rate": 4.8471373600326996e-05,
"logits/chosen": -2.139336347579956,
"logits/rejected": -2.087122678756714,
"logps/chosen": -140.20321655273438,
"logps/rejected": -131.48948669433594,
"loss": 0.7291,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.3434653580188751,
"rewards/margins": -0.03645901009440422,
"rewards/rejected": -0.3070063292980194,
"step": 196
},
{
"epoch": 0.21,
"learning_rate": 4.843969853066584e-05,
"logits/chosen": -2.292895793914795,
"logits/rejected": -2.316250801086426,
"logps/chosen": -147.1464385986328,
"logps/rejected": -161.3351593017578,
"loss": 0.6387,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.1997198909521103,
"rewards/margins": 0.2121596336364746,
"rewards/rejected": -0.4118794798851013,
"step": 197
},
{
"epoch": 0.21,
"learning_rate": 4.8407709209977305e-05,
"logits/chosen": -2.4767165184020996,
"logits/rejected": -2.521482467651367,
"logps/chosen": -206.472412109375,
"logps/rejected": -215.40516662597656,
"loss": 0.6462,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5403072237968445,
"rewards/margins": 0.17954078316688538,
"rewards/rejected": -0.7198480367660522,
"step": 198
},
{
"epoch": 0.21,
"learning_rate": 4.837540606713538e-05,
"logits/chosen": -2.2345879077911377,
"logits/rejected": -2.221876621246338,
"logps/chosen": -166.65631103515625,
"logps/rejected": -146.72183227539062,
"loss": 0.8822,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.7132791876792908,
"rewards/margins": -0.2818105220794678,
"rewards/rejected": -0.43146878480911255,
"step": 199
},
{
"epoch": 0.21,
"learning_rate": 4.834278953522138e-05,
"logits/chosen": -2.1550986766815186,
"logits/rejected": -2.2200095653533936,
"logps/chosen": -141.46798706054688,
"logps/rejected": -152.30160522460938,
"loss": 0.6853,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3020961284637451,
"rewards/margins": 0.08537312597036362,
"rewards/rejected": -0.3874692916870117,
"step": 200
},
{
"epoch": 0.21,
"learning_rate": 4.8309860051518204e-05,
"logits/chosen": -2.1666178703308105,
"logits/rejected": -2.195492744445801,
"logps/chosen": -154.3419647216797,
"logps/rejected": -154.75958251953125,
"loss": 0.7683,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5529704093933105,
"rewards/margins": 0.002926960587501526,
"rewards/rejected": -0.5558973550796509,
"step": 201
},
{
"epoch": 0.21,
"learning_rate": 4.8276618057504376e-05,
"logits/chosen": -2.2236335277557373,
"logits/rejected": -2.2724032402038574,
"logps/chosen": -143.28988647460938,
"logps/rejected": -150.9318084716797,
"loss": 0.7539,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.256462037563324,
"rewards/margins": -0.02850787341594696,
"rewards/rejected": -0.2279541939496994,
"step": 202
},
{
"epoch": 0.21,
"learning_rate": 4.824306399884822e-05,
"logits/chosen": -2.2802443504333496,
"logits/rejected": -2.2836642265319824,
"logps/chosen": -179.90365600585938,
"logps/rejected": -169.27171325683594,
"loss": 0.7907,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.4755932092666626,
"rewards/margins": -0.1532573252916336,
"rewards/rejected": -0.3223358988761902,
"step": 203
},
{
"epoch": 0.21,
"learning_rate": 4.8209198325401815e-05,
"logits/chosen": -2.3384103775024414,
"logits/rejected": -2.314667224884033,
"logps/chosen": -170.7639617919922,
"logps/rejected": -162.15753173828125,
"loss": 0.6634,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.26886916160583496,
"rewards/margins": 0.1269666701555252,
"rewards/rejected": -0.39583584666252136,
"step": 204
},
{
"epoch": 0.21,
"learning_rate": 4.817502149119502e-05,
"logits/chosen": -2.250046491622925,
"logits/rejected": -2.225694179534912,
"logps/chosen": -165.20504760742188,
"logps/rejected": -172.9903106689453,
"loss": 0.6962,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.45896613597869873,
"rewards/margins": 0.0778706818819046,
"rewards/rejected": -0.5368368029594421,
"step": 205
},
{
"epoch": 0.21,
"learning_rate": 4.8140533954429327e-05,
"logits/chosen": -2.313793420791626,
"logits/rejected": -2.2939281463623047,
"logps/chosen": -143.09251403808594,
"logps/rejected": -150.27903747558594,
"loss": 0.6284,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.2762828767299652,
"rewards/margins": 0.2016773521900177,
"rewards/rejected": -0.4779602289199829,
"step": 206
},
{
"epoch": 0.22,
"learning_rate": 4.810573617747178e-05,
"logits/chosen": -2.332127809524536,
"logits/rejected": -2.327253818511963,
"logps/chosen": -162.88739013671875,
"logps/rejected": -167.6454315185547,
"loss": 0.6114,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.22507114708423615,
"rewards/margins": 0.24488690495491028,
"rewards/rejected": -0.4699580669403076,
"step": 207
},
{
"epoch": 0.22,
"learning_rate": 4.8070628626848735e-05,
"logits/chosen": -2.12371563911438,
"logits/rejected": -2.1890509128570557,
"logps/chosen": -169.19342041015625,
"logps/rejected": -191.41851806640625,
"loss": 0.7002,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5895254611968994,
"rewards/margins": 0.07979288697242737,
"rewards/rejected": -0.6693182587623596,
"step": 208
},
{
"epoch": 0.22,
"learning_rate": 4.803521177323962e-05,
"logits/chosen": -2.2081453800201416,
"logits/rejected": -2.1713621616363525,
"logps/chosen": -163.9095458984375,
"logps/rejected": -169.9920654296875,
"loss": 0.6973,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3412324786186218,
"rewards/margins": 0.045901067554950714,
"rewards/rejected": -0.38713356852531433,
"step": 209
},
{
"epoch": 0.22,
"learning_rate": 4.799948609147061e-05,
"logits/chosen": -2.161041259765625,
"logits/rejected": -2.1152327060699463,
"logps/chosen": -165.09188842773438,
"logps/rejected": -156.6759796142578,
"loss": 0.7462,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.48258787393569946,
"rewards/margins": -0.04187957942485809,
"rewards/rejected": -0.44070830941200256,
"step": 210
},
{
"epoch": 0.22,
"learning_rate": 4.796345206050829e-05,
"logits/chosen": -2.106369733810425,
"logits/rejected": -2.2182164192199707,
"logps/chosen": -176.6033477783203,
"logps/rejected": -213.2372589111328,
"loss": 0.7045,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.4269718527793884,
"rewards/margins": 0.00813320279121399,
"rewards/rejected": -0.4351051151752472,
"step": 211
},
{
"epoch": 0.22,
"learning_rate": 4.792711016345321e-05,
"logits/chosen": -2.1926729679107666,
"logits/rejected": -2.137371063232422,
"logps/chosen": -157.62571716308594,
"logps/rejected": -141.09170532226562,
"loss": 0.6535,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3278404176235199,
"rewards/margins": 0.16385424137115479,
"rewards/rejected": -0.49169468879699707,
"step": 212
},
{
"epoch": 0.22,
"learning_rate": 4.7890460887533417e-05,
"logits/chosen": -2.1466121673583984,
"logits/rejected": -2.181826114654541,
"logps/chosen": -163.6356964111328,
"logps/rejected": -176.73495483398438,
"loss": 0.6503,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.25386762619018555,
"rewards/margins": 0.17454546689987183,
"rewards/rejected": -0.4284130930900574,
"step": 213
},
{
"epoch": 0.22,
"learning_rate": 4.785350472409792e-05,
"logits/chosen": -2.1585237979888916,
"logits/rejected": -2.2346014976501465,
"logps/chosen": -171.45030212402344,
"logps/rejected": -226.79852294921875,
"loss": 0.7963,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.43708741664886475,
"rewards/margins": -0.08538319170475006,
"rewards/rejected": -0.3517042398452759,
"step": 214
},
{
"epoch": 0.22,
"learning_rate": 4.7816242168610093e-05,
"logits/chosen": -2.247028350830078,
"logits/rejected": -2.2580173015594482,
"logps/chosen": -190.72055053710938,
"logps/rejected": -186.36619567871094,
"loss": 0.6804,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.447765588760376,
"rewards/margins": 0.06104414537549019,
"rewards/rejected": -0.5088096857070923,
"step": 215
},
{
"epoch": 0.23,
"learning_rate": 4.777867372064105e-05,
"logits/chosen": -2.222308874130249,
"logits/rejected": -2.2742927074432373,
"logps/chosen": -165.11985778808594,
"logps/rejected": -187.6106719970703,
"loss": 0.7121,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.37397119402885437,
"rewards/margins": 0.040555573999881744,
"rewards/rejected": -0.4145267605781555,
"step": 216
},
{
"epoch": 0.23,
"learning_rate": 4.774079988386296e-05,
"logits/chosen": -2.2380332946777344,
"logits/rejected": -2.3339126110076904,
"logps/chosen": -137.3435821533203,
"logps/rejected": -169.9375,
"loss": 0.7015,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.36366236209869385,
"rewards/margins": 0.009570196270942688,
"rewards/rejected": -0.37323257327079773,
"step": 217
},
{
"epoch": 0.23,
"learning_rate": 4.770262116604224e-05,
"logits/chosen": -2.2831175327301025,
"logits/rejected": -2.260009765625,
"logps/chosen": -207.33409118652344,
"logps/rejected": -217.00962829589844,
"loss": 0.6735,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.27594539523124695,
"rewards/margins": 0.09640569984912872,
"rewards/rejected": -0.3723510801792145,
"step": 218
},
{
"epoch": 0.23,
"learning_rate": 4.76641380790328e-05,
"logits/chosen": -2.2921030521392822,
"logits/rejected": -2.2923154830932617,
"logps/chosen": -168.2577667236328,
"logps/rejected": -166.49632263183594,
"loss": 0.6891,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2567214369773865,
"rewards/margins": 0.15365271270275116,
"rewards/rejected": -0.41037416458129883,
"step": 219
},
{
"epoch": 0.23,
"learning_rate": 4.762535113876917e-05,
"logits/chosen": -2.273233652114868,
"logits/rejected": -2.246183395385742,
"logps/chosen": -208.0199737548828,
"logps/rejected": -209.45909118652344,
"loss": 0.6409,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.12088227272033691,
"rewards/margins": 0.15805310010910034,
"rewards/rejected": -0.27893537282943726,
"step": 220
},
{
"epoch": 0.23,
"learning_rate": 4.758626086525956e-05,
"logits/chosen": -2.2132441997528076,
"logits/rejected": -2.22251296043396,
"logps/chosen": -167.60195922851562,
"logps/rejected": -189.9409942626953,
"loss": 0.6678,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3752858340740204,
"rewards/margins": 0.10380817204713821,
"rewards/rejected": -0.479093998670578,
"step": 221
},
{
"epoch": 0.23,
"learning_rate": 4.754686778257891e-05,
"logits/chosen": -2.282052755355835,
"logits/rejected": -2.2734806537628174,
"logps/chosen": -132.63140869140625,
"logps/rejected": -136.8794708251953,
"loss": 0.6003,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.009242314845323563,
"rewards/margins": 0.26125800609588623,
"rewards/rejected": -0.25201570987701416,
"step": 222
},
{
"epoch": 0.23,
"learning_rate": 4.750717241886185e-05,
"logits/chosen": -2.189680337905884,
"logits/rejected": -2.1535582542419434,
"logps/chosen": -129.69590759277344,
"logps/rejected": -127.54085540771484,
"loss": 0.7529,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5190645456314087,
"rewards/margins": -0.08417561650276184,
"rewards/rejected": -0.43488895893096924,
"step": 223
},
{
"epoch": 0.23,
"learning_rate": 4.7467175306295655e-05,
"logits/chosen": -2.2347896099090576,
"logits/rejected": -2.2738678455352783,
"logps/chosen": -158.85928344726562,
"logps/rejected": -158.6512908935547,
"loss": 0.683,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.4156660735607147,
"rewards/margins": 0.1208164319396019,
"rewards/rejected": -0.5364825129508972,
"step": 224
},
{
"epoch": 0.23,
"learning_rate": 4.7426876981113044e-05,
"logits/chosen": -2.1925535202026367,
"logits/rejected": -2.140517234802246,
"logps/chosen": -165.46412658691406,
"logps/rejected": -160.30532836914062,
"loss": 0.6721,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.22652333974838257,
"rewards/margins": 0.09546832740306854,
"rewards/rejected": -0.3219916820526123,
"step": 225
},
{
"epoch": 0.24,
"learning_rate": 4.738627798358506e-05,
"logits/chosen": -2.3328022956848145,
"logits/rejected": -2.3571670055389404,
"logps/chosen": -214.76580810546875,
"logps/rejected": -231.52804565429688,
"loss": 0.5743,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2423395961523056,
"rewards/margins": 0.3217008709907532,
"rewards/rejected": -0.5640404224395752,
"step": 226
},
{
"epoch": 0.24,
"learning_rate": 4.7345378858013776e-05,
"logits/chosen": -2.250012159347534,
"logits/rejected": -2.26370906829834,
"logps/chosen": -208.80230712890625,
"logps/rejected": -210.8206329345703,
"loss": 0.8003,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5306903719902039,
"rewards/margins": -0.14622673392295837,
"rewards/rejected": -0.3844636082649231,
"step": 227
},
{
"epoch": 0.24,
"learning_rate": 4.730418015272503e-05,
"logits/chosen": -2.3351643085479736,
"logits/rejected": -2.33880352973938,
"logps/chosen": -209.63296508789062,
"logps/rejected": -219.62771606445312,
"loss": 0.6265,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4614105522632599,
"rewards/margins": 0.28935885429382324,
"rewards/rejected": -0.7507694959640503,
"step": 228
},
{
"epoch": 0.24,
"learning_rate": 4.726268242006106e-05,
"logits/chosen": -2.051967144012451,
"logits/rejected": -2.0519886016845703,
"logps/chosen": -142.45199584960938,
"logps/rejected": -140.49044799804688,
"loss": 0.6649,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4592810571193695,
"rewards/margins": 0.15481841564178467,
"rewards/rejected": -0.6140995025634766,
"step": 229
},
{
"epoch": 0.24,
"learning_rate": 4.722088621637309e-05,
"logits/chosen": -2.2426981925964355,
"logits/rejected": -2.287612199783325,
"logps/chosen": -169.47291564941406,
"logps/rejected": -179.918212890625,
"loss": 0.7675,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.6015586853027344,
"rewards/margins": -0.048168424516916275,
"rewards/rejected": -0.5533902645111084,
"step": 230
},
{
"epoch": 0.24,
"learning_rate": 4.717879210201389e-05,
"logits/chosen": -2.192275047302246,
"logits/rejected": -2.308380365371704,
"logps/chosen": -156.87548828125,
"logps/rejected": -180.34512329101562,
"loss": 0.714,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.42379727959632874,
"rewards/margins": 0.08124817907810211,
"rewards/rejected": -0.505045473575592,
"step": 231
},
{
"epoch": 0.24,
"learning_rate": 4.713640064133025e-05,
"logits/chosen": -2.057934045791626,
"logits/rejected": -1.9755558967590332,
"logps/chosen": -156.72357177734375,
"logps/rejected": -158.1801300048828,
"loss": 0.7132,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5255974531173706,
"rewards/margins": 0.04890578240156174,
"rewards/rejected": -0.5745032429695129,
"step": 232
},
{
"epoch": 0.24,
"learning_rate": 4.7093712402655427e-05,
"logits/chosen": -2.055185079574585,
"logits/rejected": -2.0594377517700195,
"logps/chosen": -139.1905517578125,
"logps/rejected": -141.0957489013672,
"loss": 0.7112,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4041973948478699,
"rewards/margins": 0.0033319219946861267,
"rewards/rejected": -0.407529354095459,
"step": 233
},
{
"epoch": 0.24,
"learning_rate": 4.7050727958301506e-05,
"logits/chosen": -2.1757850646972656,
"logits/rejected": -2.1435933113098145,
"logps/chosen": -177.0955810546875,
"logps/rejected": -161.39651489257812,
"loss": 0.6564,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3865699768066406,
"rewards/margins": 0.11041317880153656,
"rewards/rejected": -0.4969831705093384,
"step": 234
},
{
"epoch": 0.25,
"learning_rate": 4.7007447884551745e-05,
"logits/chosen": -2.055013418197632,
"logits/rejected": -2.0445072650909424,
"logps/chosen": -154.93429565429688,
"logps/rejected": -166.30482482910156,
"loss": 0.7354,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.48415255546569824,
"rewards/margins": 0.04046877846121788,
"rewards/rejected": -0.5246213674545288,
"step": 235
},
{
"epoch": 0.25,
"learning_rate": 4.6963872761652835e-05,
"logits/chosen": -2.202390670776367,
"logits/rejected": -2.2232775688171387,
"logps/chosen": -216.69921875,
"logps/rejected": -206.72483825683594,
"loss": 0.79,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.47140538692474365,
"rewards/margins": 0.04289142042398453,
"rewards/rejected": -0.51429682970047,
"step": 236
},
{
"epoch": 0.25,
"learning_rate": 4.692000317380715e-05,
"logits/chosen": -2.2061116695404053,
"logits/rejected": -2.2866199016571045,
"logps/chosen": -174.46450805664062,
"logps/rejected": -185.262939453125,
"loss": 0.628,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4978125989437103,
"rewards/margins": 0.25329017639160156,
"rewards/rejected": -0.7511026859283447,
"step": 237
},
{
"epoch": 0.25,
"learning_rate": 4.687583970916487e-05,
"logits/chosen": -2.2171239852905273,
"logits/rejected": -2.2869226932525635,
"logps/chosen": -186.90225219726562,
"logps/rejected": -210.4280548095703,
"loss": 0.7644,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.4735070765018463,
"rewards/margins": -0.040224503725767136,
"rewards/rejected": -0.4332825839519501,
"step": 238
},
{
"epoch": 0.25,
"learning_rate": 4.683138295981611e-05,
"logits/chosen": -2.1964111328125,
"logits/rejected": -2.212043046951294,
"logps/chosen": -153.104248046875,
"logps/rejected": -165.96299743652344,
"loss": 0.6534,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.24953705072402954,
"rewards/margins": 0.14786407351493835,
"rewards/rejected": -0.3974011242389679,
"step": 239
},
{
"epoch": 0.25,
"learning_rate": 4.678663352178301e-05,
"logits/chosen": -1.9726707935333252,
"logits/rejected": -2.0120890140533447,
"logps/chosen": -148.98944091796875,
"logps/rejected": -149.28582763671875,
"loss": 0.6168,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.33494099974632263,
"rewards/margins": 0.3014180362224579,
"rewards/rejected": -0.6363590359687805,
"step": 240
},
{
"epoch": 0.25,
"learning_rate": 4.674159199501173e-05,
"logits/chosen": -2.143721103668213,
"logits/rejected": -2.1720468997955322,
"logps/chosen": -131.01319885253906,
"logps/rejected": -147.077392578125,
"loss": 0.7585,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.4145124554634094,
"rewards/margins": -0.08547190576791763,
"rewards/rejected": -0.3290405869483948,
"step": 241
},
{
"epoch": 0.25,
"learning_rate": 4.6696258983364385e-05,
"logits/chosen": -2.2236969470977783,
"logits/rejected": -2.2577428817749023,
"logps/chosen": -184.3029327392578,
"logps/rejected": -190.77215576171875,
"loss": 0.6862,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.23868240416049957,
"rewards/margins": 0.11198949813842773,
"rewards/rejected": -0.3506719172000885,
"step": 242
},
{
"epoch": 0.25,
"learning_rate": 4.665063509461097e-05,
"logits/chosen": -2.060711622238159,
"logits/rejected": -2.0380258560180664,
"logps/chosen": -174.40530395507812,
"logps/rejected": -171.62820434570312,
"loss": 0.6911,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.4223189949989319,
"rewards/margins": 0.14973345398902893,
"rewards/rejected": -0.5720524191856384,
"step": 243
},
{
"epoch": 0.25,
"learning_rate": 4.660472094042121e-05,
"logits/chosen": -2.1594133377075195,
"logits/rejected": -2.1700899600982666,
"logps/chosen": -207.55848693847656,
"logps/rejected": -200.78297424316406,
"loss": 0.7768,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5937251448631287,
"rewards/margins": -0.08584102243185043,
"rewards/rejected": -0.50788414478302,
"step": 244
},
{
"epoch": 0.26,
"learning_rate": 4.655851713635635e-05,
"logits/chosen": -2.305196762084961,
"logits/rejected": -2.2462093830108643,
"logps/chosen": -243.9378662109375,
"logps/rejected": -215.36997985839844,
"loss": 0.724,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5542324185371399,
"rewards/margins": 0.021042201668024063,
"rewards/rejected": -0.5752745866775513,
"step": 245
},
{
"epoch": 0.26,
"learning_rate": 4.651202430186092e-05,
"logits/chosen": -2.025132179260254,
"logits/rejected": -1.9792908430099487,
"logps/chosen": -195.5504608154297,
"logps/rejected": -195.3519744873047,
"loss": 0.7289,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.539334237575531,
"rewards/margins": 0.23235680162906647,
"rewards/rejected": -0.771691083908081,
"step": 246
},
{
"epoch": 0.26,
"learning_rate": 4.6465243060254415e-05,
"logits/chosen": -2.147789239883423,
"logits/rejected": -2.1263253688812256,
"logps/chosen": -189.7777099609375,
"logps/rejected": -175.15142822265625,
"loss": 0.7963,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.40796539187431335,
"rewards/margins": -0.13925030827522278,
"rewards/rejected": -0.2687150835990906,
"step": 247
},
{
"epoch": 0.26,
"learning_rate": 4.641817403872293e-05,
"logits/chosen": -2.0318384170532227,
"logits/rejected": -2.0659642219543457,
"logps/chosen": -165.9607391357422,
"logps/rejected": -182.31895446777344,
"loss": 0.8084,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.3802737891674042,
"rewards/margins": -0.0964832603931427,
"rewards/rejected": -0.28379055857658386,
"step": 248
},
{
"epoch": 0.26,
"learning_rate": 4.637081786831079e-05,
"logits/chosen": -2.0719449520111084,
"logits/rejected": -2.026743173599243,
"logps/chosen": -191.45486450195312,
"logps/rejected": -185.71011352539062,
"loss": 0.7078,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3257860839366913,
"rewards/margins": 0.07691369950771332,
"rewards/rejected": -0.4026997983455658,
"step": 249
},
{
"epoch": 0.26,
"learning_rate": 4.6323175183912024e-05,
"logits/chosen": -2.0912959575653076,
"logits/rejected": -2.156938314437866,
"logps/chosen": -167.86541748046875,
"logps/rejected": -198.28921508789062,
"loss": 0.4872,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.35692939162254333,
"rewards/margins": 0.6100252866744995,
"rewards/rejected": -0.9669547080993652,
"step": 250
},
{
"epoch": 0.26,
"learning_rate": 4.627524662426194e-05,
"logits/chosen": -1.8769241571426392,
"logits/rejected": -1.8571780920028687,
"logps/chosen": -175.3948974609375,
"logps/rejected": -183.54019165039062,
"loss": 0.8564,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.6141475439071655,
"rewards/margins": -0.18047203123569489,
"rewards/rejected": -0.43367546796798706,
"step": 251
},
{
"epoch": 0.26,
"learning_rate": 4.6227032831928484e-05,
"logits/chosen": -1.9304271936416626,
"logits/rejected": -1.8033334016799927,
"logps/chosen": -174.45651245117188,
"logps/rejected": -144.34657287597656,
"loss": 0.7318,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.625647246837616,
"rewards/margins": 0.09466619789600372,
"rewards/rejected": -0.7203134298324585,
"step": 252
},
{
"epoch": 0.26,
"learning_rate": 4.6178534453303666e-05,
"logits/chosen": -2.082902193069458,
"logits/rejected": -2.0136592388153076,
"logps/chosen": -201.52587890625,
"logps/rejected": -198.62039184570312,
"loss": 0.87,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5361589789390564,
"rewards/margins": -0.25926750898361206,
"rewards/rejected": -0.27689146995544434,
"step": 253
},
{
"epoch": 0.26,
"learning_rate": 4.6129752138594874e-05,
"logits/chosen": -1.957344889640808,
"logits/rejected": -1.982904314994812,
"logps/chosen": -187.89329528808594,
"logps/rejected": -186.68289184570312,
"loss": 0.7922,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5014457106590271,
"rewards/margins": -0.08516909182071686,
"rewards/rejected": -0.41627663373947144,
"step": 254
},
{
"epoch": 0.27,
"learning_rate": 4.608068654181617e-05,
"logits/chosen": -1.654222846031189,
"logits/rejected": -1.6798536777496338,
"logps/chosen": -184.433349609375,
"logps/rejected": -182.4044189453125,
"loss": 0.6938,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.25272616744041443,
"rewards/margins": 0.10025610774755478,
"rewards/rejected": -0.3529822528362274,
"step": 255
},
{
"epoch": 0.27,
"learning_rate": 4.6031338320779534e-05,
"logits/chosen": -2.0019171237945557,
"logits/rejected": -2.0632452964782715,
"logps/chosen": -162.12811279296875,
"logps/rejected": -177.98736572265625,
"loss": 0.688,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5015031099319458,
"rewards/margins": 0.15753880143165588,
"rewards/rejected": -0.6590418815612793,
"step": 256
},
{
"epoch": 0.27,
"learning_rate": 4.5981708137086e-05,
"logits/chosen": -2.0519323348999023,
"logits/rejected": -2.089592933654785,
"logps/chosen": -168.29608154296875,
"logps/rejected": -180.1384735107422,
"loss": 0.6519,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.46469518542289734,
"rewards/margins": 0.22538693249225616,
"rewards/rejected": -0.6900821924209595,
"step": 257
},
{
"epoch": 0.27,
"learning_rate": 4.5931796656116846e-05,
"logits/chosen": -1.9542289972305298,
"logits/rejected": -2.1067469120025635,
"logps/chosen": -141.18040466308594,
"logps/rejected": -154.12332153320312,
"loss": 0.7912,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.424506276845932,
"rewards/margins": -0.11340674757957458,
"rewards/rejected": -0.3110995292663574,
"step": 258
},
{
"epoch": 0.27,
"learning_rate": 4.588160454702462e-05,
"logits/chosen": -1.9130336046218872,
"logits/rejected": -1.8589096069335938,
"logps/chosen": -154.01983642578125,
"logps/rejected": -151.2600860595703,
"loss": 0.6448,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.17828267812728882,
"rewards/margins": 0.22233837842941284,
"rewards/rejected": -0.40062105655670166,
"step": 259
},
{
"epoch": 0.27,
"learning_rate": 4.5831132482724195e-05,
"logits/chosen": -1.969378113746643,
"logits/rejected": -1.9947978258132935,
"logps/chosen": -221.1347198486328,
"logps/rejected": -219.78065490722656,
"loss": 0.7675,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8122418522834778,
"rewards/margins": -0.012366384267807007,
"rewards/rejected": -0.7998754978179932,
"step": 260
},
{
"epoch": 0.27,
"learning_rate": 4.578038113988376e-05,
"logits/chosen": -1.9141626358032227,
"logits/rejected": -1.8990297317504883,
"logps/chosen": -181.55563354492188,
"logps/rejected": -158.1247100830078,
"loss": 0.8615,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.5867054462432861,
"rewards/margins": -0.2548179626464844,
"rewards/rejected": -0.33188754320144653,
"step": 261
},
{
"epoch": 0.27,
"learning_rate": 4.572935119891571e-05,
"logits/chosen": -1.9274933338165283,
"logits/rejected": -2.0607924461364746,
"logps/chosen": -200.04896545410156,
"logps/rejected": -209.02926635742188,
"loss": 0.6567,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5149486064910889,
"rewards/margins": 0.22366176545619965,
"rewards/rejected": -0.7386104464530945,
"step": 262
},
{
"epoch": 0.27,
"learning_rate": 4.5678043343967554e-05,
"logits/chosen": -2.108922243118286,
"logits/rejected": -2.041205406188965,
"logps/chosen": -174.11276245117188,
"logps/rejected": -146.20103454589844,
"loss": 0.9138,
"rewards/accuracies": 0.1875,
"rewards/chosen": -0.47598356008529663,
"rewards/margins": -0.3184044659137726,
"rewards/rejected": -0.15757909417152405,
"step": 263
},
{
"epoch": 0.28,
"learning_rate": 4.5626458262912745e-05,
"logits/chosen": -2.0428683757781982,
"logits/rejected": -1.9962735176086426,
"logps/chosen": -203.15782165527344,
"logps/rejected": -179.91824340820312,
"loss": 0.8575,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.2803889513015747,
"rewards/margins": -0.19668762385845184,
"rewards/rejected": -0.08370131254196167,
"step": 264
},
{
"epoch": 0.28,
"learning_rate": 4.557459664734141e-05,
"logits/chosen": -1.9452859163284302,
"logits/rejected": -2.0013086795806885,
"logps/chosen": -151.39443969726562,
"logps/rejected": -160.42532348632812,
"loss": 0.7297,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.366563618183136,
"rewards/margins": 0.06915048509836197,
"rewards/rejected": -0.43571415543556213,
"step": 265
},
{
"epoch": 0.28,
"learning_rate": 4.552245919255117e-05,
"logits/chosen": -1.9962891340255737,
"logits/rejected": -2.0349621772766113,
"logps/chosen": -178.134765625,
"logps/rejected": -167.83949279785156,
"loss": 0.7017,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.19582800567150116,
"rewards/margins": 0.07028350979089737,
"rewards/rejected": -0.2661115229129791,
"step": 266
},
{
"epoch": 0.28,
"learning_rate": 4.5470046597537735e-05,
"logits/chosen": -1.9965953826904297,
"logits/rejected": -2.0604090690612793,
"logps/chosen": -158.63546752929688,
"logps/rejected": -177.67669677734375,
"loss": 0.7415,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.3360700309276581,
"rewards/margins": 0.05659861862659454,
"rewards/rejected": -0.3926686644554138,
"step": 267
},
{
"epoch": 0.28,
"learning_rate": 4.541735956498554e-05,
"logits/chosen": -1.95860755443573,
"logits/rejected": -1.9922930002212524,
"logps/chosen": -130.50543212890625,
"logps/rejected": -137.30873107910156,
"loss": 0.6415,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.13774187862873077,
"rewards/margins": 0.156549870967865,
"rewards/rejected": -0.29429173469543457,
"step": 268
},
{
"epoch": 0.28,
"learning_rate": 4.5364398801258396e-05,
"logits/chosen": -2.0323469638824463,
"logits/rejected": -1.989902377128601,
"logps/chosen": -123.73210144042969,
"logps/rejected": -120.40567016601562,
"loss": 0.7417,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.23733878135681152,
"rewards/margins": -0.006110057234764099,
"rewards/rejected": -0.2312287539243698,
"step": 269
},
{
"epoch": 0.28,
"learning_rate": 4.5311165016389916e-05,
"logits/chosen": -2.2102789878845215,
"logits/rejected": -2.1945927143096924,
"logps/chosen": -178.87513732910156,
"logps/rejected": -185.6321258544922,
"loss": 0.58,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3368569016456604,
"rewards/margins": 0.3213872015476227,
"rewards/rejected": -0.6582440733909607,
"step": 270
},
{
"epoch": 0.28,
"learning_rate": 4.525765892407409e-05,
"logits/chosen": -2.014317750930786,
"logits/rejected": -1.9814597368240356,
"logps/chosen": -162.72450256347656,
"logps/rejected": -162.10733032226562,
"loss": 0.7063,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.30713194608688354,
"rewards/margins": 0.028126142919063568,
"rewards/rejected": -0.33525803685188293,
"step": 271
},
{
"epoch": 0.28,
"learning_rate": 4.5203881241655644e-05,
"logits/chosen": -2.2270286083221436,
"logits/rejected": -2.207059144973755,
"logps/chosen": -158.911376953125,
"logps/rejected": -163.7472381591797,
"loss": 0.8314,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.24247878789901733,
"rewards/margins": -0.18095803260803223,
"rewards/rejected": -0.0615207776427269,
"step": 272
},
{
"epoch": 0.28,
"learning_rate": 4.514983269012049e-05,
"logits/chosen": -2.163167715072632,
"logits/rejected": -2.183046817779541,
"logps/chosen": -174.82070922851562,
"logps/rejected": -164.1748504638672,
"loss": 0.7933,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.38691380620002747,
"rewards/margins": -0.16529785096645355,
"rewards/rejected": -0.22161594033241272,
"step": 273
},
{
"epoch": 0.29,
"learning_rate": 4.509551399408598e-05,
"logits/chosen": -2.253500461578369,
"logits/rejected": -2.260409355163574,
"logps/chosen": -190.3398895263672,
"logps/rejected": -201.19200134277344,
"loss": 0.7775,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.21146364510059357,
"rewards/margins": -0.06666092574596405,
"rewards/rejected": -0.14480271935462952,
"step": 274
},
{
"epoch": 0.29,
"learning_rate": 4.504092588179128e-05,
"logits/chosen": -2.2398221492767334,
"logits/rejected": -2.1929337978363037,
"logps/chosen": -231.88616943359375,
"logps/rejected": -226.3920135498047,
"loss": 0.6504,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.30005282163619995,
"rewards/margins": 0.13275346159934998,
"rewards/rejected": -0.43280625343322754,
"step": 275
},
{
"epoch": 0.29,
"learning_rate": 4.498606908508754e-05,
"logits/chosen": -2.251856565475464,
"logits/rejected": -2.2825677394866943,
"logps/chosen": -194.575927734375,
"logps/rejected": -208.67977905273438,
"loss": 0.7303,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.1339748501777649,
"rewards/margins": -0.01863221824169159,
"rewards/rejected": -0.1153426244854927,
"step": 276
},
{
"epoch": 0.29,
"learning_rate": 4.4930944339428085e-05,
"logits/chosen": -1.9029638767242432,
"logits/rejected": -2.052668571472168,
"logps/chosen": -192.86978149414062,
"logps/rejected": -214.9619140625,
"loss": 0.734,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.31461602449417114,
"rewards/margins": -0.04775575175881386,
"rewards/rejected": -0.26686030626296997,
"step": 277
},
{
"epoch": 0.29,
"learning_rate": 4.487555238385862e-05,
"logits/chosen": -2.242000102996826,
"logits/rejected": -2.1664445400238037,
"logps/chosen": -177.2509002685547,
"logps/rejected": -168.57376098632812,
"loss": 0.7252,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3784841001033783,
"rewards/margins": -0.008476179093122482,
"rewards/rejected": -0.3700079321861267,
"step": 278
},
{
"epoch": 0.29,
"learning_rate": 4.481989396100724e-05,
"logits/chosen": -2.1768834590911865,
"logits/rejected": -2.122082471847534,
"logps/chosen": -136.37344360351562,
"logps/rejected": -129.23260498046875,
"loss": 0.7226,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.1186956837773323,
"rewards/margins": 0.10684624314308167,
"rewards/rejected": -0.22554191946983337,
"step": 279
},
{
"epoch": 0.29,
"learning_rate": 4.476396981707453e-05,
"logits/chosen": -2.2543294429779053,
"logits/rejected": -2.2154581546783447,
"logps/chosen": -170.55038452148438,
"logps/rejected": -156.1742706298828,
"loss": 0.7387,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.22529585659503937,
"rewards/margins": -0.04772930592298508,
"rewards/rejected": -0.1775665581226349,
"step": 280
},
{
"epoch": 0.29,
"learning_rate": 4.470778070182353e-05,
"logits/chosen": -2.2135331630706787,
"logits/rejected": -2.170997142791748,
"logps/chosen": -140.70477294921875,
"logps/rejected": -140.6665496826172,
"loss": 0.7037,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09401345998048782,
"rewards/margins": 0.028659436851739883,
"rewards/rejected": -0.122672900557518,
"step": 281
},
{
"epoch": 0.29,
"learning_rate": 4.465132736856969e-05,
"logits/chosen": -2.2878525257110596,
"logits/rejected": -2.2237839698791504,
"logps/chosen": -164.2831573486328,
"logps/rejected": -160.92367553710938,
"loss": 0.7044,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.34608572721481323,
"rewards/margins": 0.046185556799173355,
"rewards/rejected": -0.3922712802886963,
"step": 282
},
{
"epoch": 0.3,
"learning_rate": 4.459461057417078e-05,
"logits/chosen": -2.185762882232666,
"logits/rejected": -2.099052667617798,
"logps/chosen": -151.68963623046875,
"logps/rejected": -147.94314575195312,
"loss": 0.6582,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.20870129764080048,
"rewards/margins": 0.16736984252929688,
"rewards/rejected": -0.37607109546661377,
"step": 283
},
{
"epoch": 0.3,
"learning_rate": 4.453763107901675e-05,
"logits/chosen": -2.1122889518737793,
"logits/rejected": -2.010392189025879,
"logps/chosen": -187.94906616210938,
"logps/rejected": -185.841796875,
"loss": 0.6521,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08321194350719452,
"rewards/margins": 0.12147242575883865,
"rewards/rejected": -0.20468439161777496,
"step": 284
},
{
"epoch": 0.3,
"learning_rate": 4.4480389647019505e-05,
"logits/chosen": -2.1214723587036133,
"logits/rejected": -2.033308506011963,
"logps/chosen": -152.14337158203125,
"logps/rejected": -149.34632873535156,
"loss": 0.7408,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3923906683921814,
"rewards/margins": 0.008589165285229683,
"rewards/rejected": -0.40097981691360474,
"step": 285
},
{
"epoch": 0.3,
"learning_rate": 4.442288704560268e-05,
"logits/chosen": -2.1580543518066406,
"logits/rejected": -2.1600093841552734,
"logps/chosen": -202.5330810546875,
"logps/rejected": -194.6158905029297,
"loss": 0.8982,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.4735994040966034,
"rewards/margins": -0.2955199182033539,
"rewards/rejected": -0.1780795007944107,
"step": 286
},
{
"epoch": 0.3,
"learning_rate": 4.436512404569136e-05,
"logits/chosen": -2.1974916458129883,
"logits/rejected": -2.259157657623291,
"logps/chosen": -147.84683227539062,
"logps/rejected": -164.73829650878906,
"loss": 0.6104,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2959311902523041,
"rewards/margins": 0.22050346434116364,
"rewards/rejected": -0.5164346694946289,
"step": 287
},
{
"epoch": 0.3,
"learning_rate": 4.430710142170176e-05,
"logits/chosen": -2.341240644454956,
"logits/rejected": -2.3171639442443848,
"logps/chosen": -151.489990234375,
"logps/rejected": -136.1929931640625,
"loss": 0.7421,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.12041179090738297,
"rewards/margins": -0.06451301276683807,
"rewards/rejected": -0.0558987595140934,
"step": 288
},
{
"epoch": 0.3,
"learning_rate": 4.424881995153076e-05,
"logits/chosen": -2.076103448867798,
"logits/rejected": -2.188572883605957,
"logps/chosen": -155.2179718017578,
"logps/rejected": -181.94903564453125,
"loss": 0.7223,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.1940208077430725,
"rewards/margins": 0.03361191600561142,
"rewards/rejected": -0.22763270139694214,
"step": 289
},
{
"epoch": 0.3,
"learning_rate": 4.419028041654559e-05,
"logits/chosen": -2.1491026878356934,
"logits/rejected": -2.1136281490325928,
"logps/chosen": -151.4420928955078,
"logps/rejected": -141.8388214111328,
"loss": 0.6722,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2384054809808731,
"rewards/margins": 0.15212693810462952,
"rewards/rejected": -0.39053237438201904,
"step": 290
},
{
"epoch": 0.3,
"learning_rate": 4.4131483601573285e-05,
"logits/chosen": -1.9776232242584229,
"logits/rejected": -2.004852771759033,
"logps/chosen": -170.9802703857422,
"logps/rejected": -162.25498962402344,
"loss": 0.7193,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.18611471354961395,
"rewards/margins": 0.00812564603984356,
"rewards/rejected": -0.19424037635326385,
"step": 291
},
{
"epoch": 0.3,
"learning_rate": 4.4072430294890174e-05,
"logits/chosen": -2.0479369163513184,
"logits/rejected": -2.0386621952056885,
"logps/chosen": -178.59158325195312,
"logps/rejected": -174.89373779296875,
"loss": 0.6113,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.28108635544776917,
"rewards/margins": 0.20999087393283844,
"rewards/rejected": -0.4910773038864136,
"step": 292
},
{
"epoch": 0.31,
"learning_rate": 4.4013121288211307e-05,
"logits/chosen": -2.2802951335906982,
"logits/rejected": -2.192854881286621,
"logps/chosen": -153.86907958984375,
"logps/rejected": -142.56341552734375,
"loss": 0.7816,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.07042498886585236,
"rewards/margins": -0.13551297783851624,
"rewards/rejected": 0.06508798897266388,
"step": 293
},
{
"epoch": 0.31,
"learning_rate": 4.3953557376679856e-05,
"logits/chosen": -2.2507810592651367,
"logits/rejected": -2.2218830585479736,
"logps/chosen": -125.35671997070312,
"logps/rejected": -128.93533325195312,
"loss": 0.7147,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.10209127515554428,
"rewards/margins": 0.0391690619289875,
"rewards/rejected": -0.14126034080982208,
"step": 294
},
{
"epoch": 0.31,
"learning_rate": 4.389373935885646e-05,
"logits/chosen": -2.169445514678955,
"logits/rejected": -2.194335460662842,
"logps/chosen": -157.78758239746094,
"logps/rejected": -165.60736083984375,
"loss": 0.7124,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.24438923597335815,
"rewards/margins": 0.05163384974002838,
"rewards/rejected": -0.29602310061454773,
"step": 295
},
{
"epoch": 0.31,
"learning_rate": 4.383366803670849e-05,
"logits/chosen": -2.2508602142333984,
"logits/rejected": -2.30161714553833,
"logps/chosen": -167.88427734375,
"logps/rejected": -184.48272705078125,
"loss": 0.7121,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.15736985206604004,
"rewards/margins": 0.126334547996521,
"rewards/rejected": -0.28370437026023865,
"step": 296
},
{
"epoch": 0.31,
"learning_rate": 4.377334421559932e-05,
"logits/chosen": -2.314563035964966,
"logits/rejected": -2.2978732585906982,
"logps/chosen": -179.24159240722656,
"logps/rejected": -188.38165283203125,
"loss": 0.7184,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.012131119146943092,
"rewards/margins": -0.01239142008125782,
"rewards/rejected": 0.02452254109084606,
"step": 297
},
{
"epoch": 0.31,
"learning_rate": 4.371276870427753e-05,
"logits/chosen": -2.066857099533081,
"logits/rejected": -2.174121856689453,
"logps/chosen": -170.82960510253906,
"logps/rejected": -189.02621459960938,
"loss": 0.7574,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.28441551327705383,
"rewards/margins": -0.09379325062036514,
"rewards/rejected": -0.1906222403049469,
"step": 298
},
{
"epoch": 0.31,
"learning_rate": 4.365194231486604e-05,
"logits/chosen": -2.147336006164551,
"logits/rejected": -2.132305383682251,
"logps/chosen": -158.25559997558594,
"logps/rejected": -152.197509765625,
"loss": 0.6692,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.104363813996315,
"rewards/margins": 0.12111049890518188,
"rewards/rejected": -0.2254743129014969,
"step": 299
},
{
"epoch": 0.31,
"learning_rate": 4.359086586285127e-05,
"logits/chosen": -2.247628688812256,
"logits/rejected": -2.286552667617798,
"logps/chosen": -133.13673400878906,
"logps/rejected": -183.82647705078125,
"loss": 0.6152,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.17196473479270935,
"rewards/margins": 0.2282213419675827,
"rewards/rejected": -0.40018609166145325,
"step": 300
},
{
"epoch": 0.31,
"learning_rate": 4.3529540167072126e-05,
"logits/chosen": -1.8818175792694092,
"logits/rejected": -1.8767746686935425,
"logps/chosen": -134.77548217773438,
"logps/rejected": -151.11073303222656,
"loss": 0.6999,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2484281212091446,
"rewards/margins": 0.12181131541728973,
"rewards/rejected": -0.3702394366264343,
"step": 301
},
{
"epoch": 0.32,
"learning_rate": 4.346796604970912e-05,
"logits/chosen": -2.107909679412842,
"logits/rejected": -2.138780355453491,
"logps/chosen": -168.53460693359375,
"logps/rejected": -174.63592529296875,
"loss": 0.7954,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.30226460099220276,
"rewards/margins": -0.09562454372644424,
"rewards/rejected": -0.20664002001285553,
"step": 302
},
{
"epoch": 0.32,
"learning_rate": 4.340614433627328e-05,
"logits/chosen": -2.1604933738708496,
"logits/rejected": -2.2617201805114746,
"logps/chosen": -155.14198303222656,
"logps/rejected": -169.87091064453125,
"loss": 0.6444,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.07352495938539505,
"rewards/margins": 0.1430576741695404,
"rewards/rejected": -0.21658262610435486,
"step": 303
},
{
"epoch": 0.32,
"learning_rate": 4.3344075855595104e-05,
"logits/chosen": -2.1969313621520996,
"logits/rejected": -2.2095913887023926,
"logps/chosen": -165.4632568359375,
"logps/rejected": -158.85653686523438,
"loss": 0.7844,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.2591025233268738,
"rewards/margins": -0.1096937507390976,
"rewards/rejected": -0.14940877258777618,
"step": 304
},
{
"epoch": 0.32,
"learning_rate": 4.328176143981343e-05,
"logits/chosen": -2.146892786026001,
"logits/rejected": -2.1590354442596436,
"logps/chosen": -165.599365234375,
"logps/rejected": -155.86920166015625,
"loss": 0.5594,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.04538270831108093,
"rewards/margins": 0.3189627528190613,
"rewards/rejected": -0.27358004450798035,
"step": 305
},
{
"epoch": 0.32,
"learning_rate": 4.321920192436433e-05,
"logits/chosen": -2.226012706756592,
"logits/rejected": -2.2147953510284424,
"logps/chosen": -149.3193817138672,
"logps/rejected": -182.29501342773438,
"loss": 0.7058,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.34394580125808716,
"rewards/margins": 0.0967680886387825,
"rewards/rejected": -0.44071388244628906,
"step": 306
},
{
"epoch": 0.32,
"learning_rate": 4.315639814796983e-05,
"logits/chosen": -2.0329627990722656,
"logits/rejected": -2.1080126762390137,
"logps/chosen": -145.89712524414062,
"logps/rejected": -164.14891052246094,
"loss": 0.6894,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.22714479267597198,
"rewards/margins": 0.17113275825977325,
"rewards/rejected": -0.39827755093574524,
"step": 307
},
{
"epoch": 0.32,
"learning_rate": 4.309335095262676e-05,
"logits/chosen": -2.131873607635498,
"logits/rejected": -2.3015997409820557,
"logps/chosen": -146.17129516601562,
"logps/rejected": -182.18138122558594,
"loss": 0.6993,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3666872978210449,
"rewards/margins": 0.09421360492706299,
"rewards/rejected": -0.4609009325504303,
"step": 308
},
{
"epoch": 0.32,
"learning_rate": 4.303006118359537e-05,
"logits/chosen": -2.2067878246307373,
"logits/rejected": -2.162324905395508,
"logps/chosen": -169.20140075683594,
"logps/rejected": -163.42198181152344,
"loss": 0.7353,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.5356483459472656,
"rewards/margins": -0.05132238194346428,
"rewards/rejected": -0.48432594537734985,
"step": 309
},
{
"epoch": 0.32,
"learning_rate": 4.296652968938807e-05,
"logits/chosen": -2.0966219902038574,
"logits/rejected": -2.096193313598633,
"logps/chosen": -181.05987548828125,
"logps/rejected": -199.04830932617188,
"loss": 0.8487,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.5439967513084412,
"rewards/margins": -0.17244365811347961,
"rewards/rejected": -0.3715530335903168,
"step": 310
},
{
"epoch": 0.32,
"learning_rate": 4.2902757321758016e-05,
"logits/chosen": -2.0997745990753174,
"logits/rejected": -2.11462140083313,
"logps/chosen": -154.70413208007812,
"logps/rejected": -166.63629150390625,
"loss": 0.6186,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.27216559648513794,
"rewards/margins": 0.23034000396728516,
"rewards/rejected": -0.5025056004524231,
"step": 311
},
{
"epoch": 0.33,
"learning_rate": 4.283874493568772e-05,
"logits/chosen": -2.1701714992523193,
"logits/rejected": -2.2467424869537354,
"logps/chosen": -172.70042419433594,
"logps/rejected": -210.9363250732422,
"loss": 0.7279,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.49327972531318665,
"rewards/margins": 0.008369775488972664,
"rewards/rejected": -0.5016494989395142,
"step": 312
},
{
"epoch": 0.33,
"learning_rate": 4.2774493389377545e-05,
"logits/chosen": -2.2590439319610596,
"logits/rejected": -2.214010000228882,
"logps/chosen": -149.47921752929688,
"logps/rejected": -162.21078491210938,
"loss": 0.7782,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.27489376068115234,
"rewards/margins": -0.09422563016414642,
"rewards/rejected": -0.18066814541816711,
"step": 313
},
{
"epoch": 0.33,
"learning_rate": 4.271000354423426e-05,
"logits/chosen": -2.179133892059326,
"logits/rejected": -2.3145484924316406,
"logps/chosen": -161.55218505859375,
"logps/rejected": -180.9322509765625,
"loss": 0.766,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.41766488552093506,
"rewards/margins": -0.06459204852581024,
"rewards/rejected": -0.353072851896286,
"step": 314
},
{
"epoch": 0.33,
"learning_rate": 4.2645276264859394e-05,
"logits/chosen": -2.14973521232605,
"logits/rejected": -2.122270107269287,
"logps/chosen": -169.07066345214844,
"logps/rejected": -148.82968139648438,
"loss": 0.7404,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.4353243112564087,
"rewards/margins": -0.06003642827272415,
"rewards/rejected": -0.3752879202365875,
"step": 315
},
{
"epoch": 0.33,
"learning_rate": 4.258031241903778e-05,
"logits/chosen": -2.2368862628936768,
"logits/rejected": -2.231748104095459,
"logps/chosen": -228.32550048828125,
"logps/rejected": -248.53265380859375,
"loss": 0.7322,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.48313820362091064,
"rewards/margins": -0.03280310332775116,
"rewards/rejected": -0.4503350555896759,
"step": 316
},
{
"epoch": 0.33,
"learning_rate": 4.251511287772579e-05,
"logits/chosen": -2.172724485397339,
"logits/rejected": -2.166696786880493,
"logps/chosen": -166.26548767089844,
"logps/rejected": -189.61898803710938,
"loss": 0.7494,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4010167717933655,
"rewards/margins": -0.011104248464107513,
"rewards/rejected": -0.38991254568099976,
"step": 317
},
{
"epoch": 0.33,
"learning_rate": 4.2449678515039747e-05,
"logits/chosen": -2.168539047241211,
"logits/rejected": -2.230973243713379,
"logps/chosen": -150.7926025390625,
"logps/rejected": -141.18051147460938,
"loss": 0.801,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.29772108793258667,
"rewards/margins": -0.10154817998409271,
"rewards/rejected": -0.19617292284965515,
"step": 318
},
{
"epoch": 0.33,
"learning_rate": 4.238401020824416e-05,
"logits/chosen": -2.1671128273010254,
"logits/rejected": -2.1290652751922607,
"logps/chosen": -163.53701782226562,
"logps/rejected": -176.98638916015625,
"loss": 0.6244,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.26465463638305664,
"rewards/margins": 0.2812088131904602,
"rewards/rejected": -0.5458635091781616,
"step": 319
},
{
"epoch": 0.33,
"learning_rate": 4.231810883773999e-05,
"logits/chosen": -2.0769715309143066,
"logits/rejected": -2.2087574005126953,
"logps/chosen": -143.20196533203125,
"logps/rejected": -185.92027282714844,
"loss": 0.7082,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.41921350359916687,
"rewards/margins": 0.10322752594947815,
"rewards/rejected": -0.522441029548645,
"step": 320
},
{
"epoch": 0.33,
"learning_rate": 4.2251975287052804e-05,
"logits/chosen": -2.1802303791046143,
"logits/rejected": -2.2122409343719482,
"logps/chosen": -156.01007080078125,
"logps/rejected": -183.6471405029297,
"loss": 0.6816,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2279636561870575,
"rewards/margins": 0.09283652901649475,
"rewards/rejected": -0.32080018520355225,
"step": 321
},
{
"epoch": 0.34,
"learning_rate": 4.218561044282099e-05,
"logits/chosen": -2.113987684249878,
"logits/rejected": -2.1755659580230713,
"logps/chosen": -183.98928833007812,
"logps/rejected": -201.93418884277344,
"loss": 0.6784,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.39034149050712585,
"rewards/margins": 0.08643309772014618,
"rewards/rejected": -0.47677451372146606,
"step": 322
},
{
"epoch": 0.34,
"learning_rate": 4.211901519478382e-05,
"logits/chosen": -2.139608144760132,
"logits/rejected": -2.3428738117218018,
"logps/chosen": -165.4562530517578,
"logps/rejected": -215.16778564453125,
"loss": 0.631,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6147371530532837,
"rewards/margins": 0.23880484700202942,
"rewards/rejected": -0.8535419702529907,
"step": 323
},
{
"epoch": 0.34,
"learning_rate": 4.2052190435769554e-05,
"logits/chosen": -2.1603267192840576,
"logits/rejected": -2.0630412101745605,
"logps/chosen": -173.13119506835938,
"logps/rejected": -156.80117797851562,
"loss": 0.6334,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2138206511735916,
"rewards/margins": 0.18611598014831543,
"rewards/rejected": -0.39993664622306824,
"step": 324
},
{
"epoch": 0.34,
"learning_rate": 4.198513706168345e-05,
"logits/chosen": -2.132692813873291,
"logits/rejected": -2.117668390274048,
"logps/chosen": -163.94891357421875,
"logps/rejected": -177.30056762695312,
"loss": 0.6352,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3021223843097687,
"rewards/margins": 0.18045057356357574,
"rewards/rejected": -0.4825729727745056,
"step": 325
},
{
"epoch": 0.34,
"learning_rate": 4.191785597149577e-05,
"logits/chosen": -2.129894495010376,
"logits/rejected": -2.126570224761963,
"logps/chosen": -233.8136749267578,
"logps/rejected": -209.49566650390625,
"loss": 0.7343,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5364924073219299,
"rewards/margins": -0.013996928930282593,
"rewards/rejected": -0.5224955677986145,
"step": 326
},
{
"epoch": 0.34,
"learning_rate": 4.1850348067229696e-05,
"logits/chosen": -2.096973419189453,
"logits/rejected": -2.1738548278808594,
"logps/chosen": -152.7852020263672,
"logps/rejected": -173.36817932128906,
"loss": 0.7105,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.07808439433574677,
"rewards/margins": 0.017765391618013382,
"rewards/rejected": -0.09584978222846985,
"step": 327
},
{
"epoch": 0.34,
"learning_rate": 4.178261425394926e-05,
"logits/chosen": -2.026822566986084,
"logits/rejected": -2.074733257293701,
"logps/chosen": -171.08468627929688,
"logps/rejected": -204.09425354003906,
"loss": 0.8344,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.6632390022277832,
"rewards/margins": -0.23066554963588715,
"rewards/rejected": -0.4325733780860901,
"step": 328
},
{
"epoch": 0.34,
"learning_rate": 4.171465543974723e-05,
"logits/chosen": -2.205124855041504,
"logits/rejected": -2.198807716369629,
"logps/chosen": -153.81307983398438,
"logps/rejected": -165.59127807617188,
"loss": 0.6848,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3507269620895386,
"rewards/margins": 0.07101988792419434,
"rewards/rejected": -0.4217468202114105,
"step": 329
},
{
"epoch": 0.34,
"learning_rate": 4.1646472535732895e-05,
"logits/chosen": -2.2543835639953613,
"logits/rejected": -2.169010877609253,
"logps/chosen": -193.3108673095703,
"logps/rejected": -163.5917510986328,
"loss": 0.7477,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.4203927218914032,
"rewards/margins": -0.05665392428636551,
"rewards/rejected": -0.3637387752532959,
"step": 330
},
{
"epoch": 0.35,
"learning_rate": 4.157806645601988e-05,
"logits/chosen": -1.9615943431854248,
"logits/rejected": -2.026665210723877,
"logps/chosen": -188.08934020996094,
"logps/rejected": -210.8447265625,
"loss": 0.6278,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.26600322127342224,
"rewards/margins": 0.22521573305130005,
"rewards/rejected": -0.4912189245223999,
"step": 331
},
{
"epoch": 0.35,
"learning_rate": 4.1509438117713866e-05,
"logits/chosen": -2.1103501319885254,
"logits/rejected": -2.092162609100342,
"logps/chosen": -152.57652282714844,
"logps/rejected": -155.68255615234375,
"loss": 0.7481,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.19243402779102325,
"rewards/margins": -0.01794758439064026,
"rewards/rejected": -0.1744864583015442,
"step": 332
},
{
"epoch": 0.35,
"learning_rate": 4.144058844090032e-05,
"logits/chosen": -2.059112310409546,
"logits/rejected": -2.1364006996154785,
"logps/chosen": -134.24253845214844,
"logps/rejected": -140.66732788085938,
"loss": 0.7082,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.19404403865337372,
"rewards/margins": 0.034823037683963776,
"rewards/rejected": -0.2288670837879181,
"step": 333
},
{
"epoch": 0.35,
"learning_rate": 4.137151834863213e-05,
"logits/chosen": -2.178894519805908,
"logits/rejected": -2.218296766281128,
"logps/chosen": -167.58921813964844,
"logps/rejected": -184.11642456054688,
"loss": 0.6905,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.18130388855934143,
"rewards/margins": 0.06935537606477737,
"rewards/rejected": -0.2506592571735382,
"step": 334
},
{
"epoch": 0.35,
"learning_rate": 4.130222876691726e-05,
"logits/chosen": -1.9210792779922485,
"logits/rejected": -1.9056644439697266,
"logps/chosen": -248.0112762451172,
"logps/rejected": -249.83152770996094,
"loss": 0.6878,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5254245400428772,
"rewards/margins": 0.07501597702503204,
"rewards/rejected": -0.600440502166748,
"step": 335
},
{
"epoch": 0.35,
"learning_rate": 4.123272062470633e-05,
"logits/chosen": -2.2695250511169434,
"logits/rejected": -2.3075647354125977,
"logps/chosen": -175.56838989257812,
"logps/rejected": -185.3209686279297,
"loss": 0.6858,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5953464508056641,
"rewards/margins": 0.08486279845237732,
"rewards/rejected": -0.680209219455719,
"step": 336
},
{
"epoch": 0.35,
"learning_rate": 4.116299485388014e-05,
"logits/chosen": -2.1485931873321533,
"logits/rejected": -2.143951177597046,
"logps/chosen": -147.05918884277344,
"logps/rejected": -146.83811950683594,
"loss": 0.8233,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.5940690040588379,
"rewards/margins": -0.17442089319229126,
"rewards/rejected": -0.4196482002735138,
"step": 337
},
{
"epoch": 0.35,
"learning_rate": 4.109305238923718e-05,
"logits/chosen": -2.151376247406006,
"logits/rejected": -2.2524118423461914,
"logps/chosen": -258.1955261230469,
"logps/rejected": -254.03448486328125,
"loss": 0.6856,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5329893231391907,
"rewards/margins": 0.19465385377407074,
"rewards/rejected": -0.7276431322097778,
"step": 338
},
{
"epoch": 0.35,
"learning_rate": 4.102289416848114e-05,
"logits/chosen": -2.141131639480591,
"logits/rejected": -2.094794511795044,
"logps/chosen": -143.72801208496094,
"logps/rejected": -137.58067321777344,
"loss": 0.7808,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.384622186422348,
"rewards/margins": -0.13031712174415588,
"rewards/rejected": -0.25430506467819214,
"step": 339
},
{
"epoch": 0.35,
"learning_rate": 4.095252113220827e-05,
"logits/chosen": -2.16725492477417,
"logits/rejected": -2.1304190158843994,
"logps/chosen": -168.14285278320312,
"logps/rejected": -173.74656677246094,
"loss": 0.7767,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.37200167775154114,
"rewards/margins": -0.032151952385902405,
"rewards/rejected": -0.3398497402667999,
"step": 340
},
{
"epoch": 0.36,
"learning_rate": 4.088193422389484e-05,
"logits/chosen": -2.1071646213531494,
"logits/rejected": -2.1935393810272217,
"logps/chosen": -165.9573516845703,
"logps/rejected": -193.26974487304688,
"loss": 0.5765,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.30919840931892395,
"rewards/margins": 0.3557150065898895,
"rewards/rejected": -0.6649134159088135,
"step": 341
},
{
"epoch": 0.36,
"learning_rate": 4.0811134389884433e-05,
"logits/chosen": -1.9773459434509277,
"logits/rejected": -2.059852361679077,
"logps/chosen": -149.0285186767578,
"logps/rejected": -159.69705200195312,
"loss": 0.641,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1402963101863861,
"rewards/margins": 0.19628655910491943,
"rewards/rejected": -0.33658286929130554,
"step": 342
},
{
"epoch": 0.36,
"learning_rate": 4.0740122579375286e-05,
"logits/chosen": -2.0288474559783936,
"logits/rejected": -2.244412422180176,
"logps/chosen": -158.99160766601562,
"logps/rejected": -198.47886657714844,
"loss": 0.6393,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3524281680583954,
"rewards/margins": 0.20226937532424927,
"rewards/rejected": -0.5546976327896118,
"step": 343
},
{
"epoch": 0.36,
"learning_rate": 4.066889974440757e-05,
"logits/chosen": -1.9884339570999146,
"logits/rejected": -2.0600476264953613,
"logps/chosen": -149.9541473388672,
"logps/rejected": -168.91061401367188,
"loss": 0.7141,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3000350594520569,
"rewards/margins": 0.055553682148456573,
"rewards/rejected": -0.35558873414993286,
"step": 344
},
{
"epoch": 0.36,
"learning_rate": 4.0597466839850595e-05,
"logits/chosen": -2.229095935821533,
"logits/rejected": -2.208395481109619,
"logps/chosen": -180.67138671875,
"logps/rejected": -191.8995361328125,
"loss": 0.8027,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.5766149163246155,
"rewards/margins": -0.08274443447589874,
"rewards/rejected": -0.4938705563545227,
"step": 345
},
{
"epoch": 0.36,
"learning_rate": 4.0525824823390045e-05,
"logits/chosen": -1.9827308654785156,
"logits/rejected": -2.038292646408081,
"logps/chosen": -137.3387451171875,
"logps/rejected": -156.16510009765625,
"loss": 0.6739,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.24382978677749634,
"rewards/margins": 0.09684039652347565,
"rewards/rejected": -0.3406701982021332,
"step": 346
},
{
"epoch": 0.36,
"learning_rate": 4.045397465551513e-05,
"logits/chosen": -2.0480711460113525,
"logits/rejected": -2.0361733436584473,
"logps/chosen": -173.17909240722656,
"logps/rejected": -157.0221405029297,
"loss": 0.7591,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5380522012710571,
"rewards/margins": -0.046980153769254684,
"rewards/rejected": -0.4910720884799957,
"step": 347
},
{
"epoch": 0.36,
"learning_rate": 4.038191729950569e-05,
"logits/chosen": -2.229896068572998,
"logits/rejected": -2.211841106414795,
"logps/chosen": -167.17428588867188,
"logps/rejected": -167.23196411132812,
"loss": 0.8467,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.6671632528305054,
"rewards/margins": -0.22287489473819733,
"rewards/rejected": -0.44428837299346924,
"step": 348
},
{
"epoch": 0.36,
"learning_rate": 4.030965372141927e-05,
"logits/chosen": -2.0725326538085938,
"logits/rejected": -2.0685665607452393,
"logps/chosen": -151.9163360595703,
"logps/rejected": -158.74620056152344,
"loss": 0.6537,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.27225998044013977,
"rewards/margins": 0.1315668821334839,
"rewards/rejected": -0.40382686257362366,
"step": 349
},
{
"epoch": 0.37,
"learning_rate": 4.0237184890078245e-05,
"logits/chosen": -2.1178064346313477,
"logits/rejected": -2.1523594856262207,
"logps/chosen": -157.16265869140625,
"logps/rejected": -176.5404052734375,
"loss": 0.622,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.31149694323539734,
"rewards/margins": 0.21072791516780853,
"rewards/rejected": -0.5222248435020447,
"step": 350
},
{
"epoch": 0.37,
"learning_rate": 4.0164511777056725e-05,
"logits/chosen": -2.2286159992218018,
"logits/rejected": -2.2007691860198975,
"logps/chosen": -186.49661254882812,
"logps/rejected": -188.19149780273438,
"loss": 0.7013,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5486454963684082,
"rewards/margins": 0.11480455100536346,
"rewards/rejected": -0.6634500622749329,
"step": 351
},
{
"epoch": 0.37,
"learning_rate": 4.009163535666761e-05,
"logits/chosen": -2.182291030883789,
"logits/rejected": -2.1893458366394043,
"logps/chosen": -148.7130126953125,
"logps/rejected": -159.863037109375,
"loss": 0.6501,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.23504705727100372,
"rewards/margins": 0.17834332585334778,
"rewards/rejected": -0.4133903682231903,
"step": 352
},
{
"epoch": 0.37,
"learning_rate": 4.001855660594948e-05,
"logits/chosen": -2.0689799785614014,
"logits/rejected": -2.133513927459717,
"logps/chosen": -193.48846435546875,
"logps/rejected": -231.55738830566406,
"loss": 0.6689,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5353763699531555,
"rewards/margins": 0.13139232993125916,
"rewards/rejected": -0.6667687296867371,
"step": 353
},
{
"epoch": 0.37,
"learning_rate": 3.994527650465352e-05,
"logits/chosen": -2.2244315147399902,
"logits/rejected": -2.206336259841919,
"logps/chosen": -153.9037628173828,
"logps/rejected": -153.5791778564453,
"loss": 0.6506,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.10444588959217072,
"rewards/margins": 0.24528437852859497,
"rewards/rejected": -0.3497302234172821,
"step": 354
},
{
"epoch": 0.37,
"learning_rate": 3.98717960352304e-05,
"logits/chosen": -2.0277304649353027,
"logits/rejected": -1.9793894290924072,
"logps/chosen": -153.72535705566406,
"logps/rejected": -152.46961975097656,
"loss": 0.7438,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5082700252532959,
"rewards/margins": 0.030984222888946533,
"rewards/rejected": -0.5392543077468872,
"step": 355
},
{
"epoch": 0.37,
"learning_rate": 3.979811618281706e-05,
"logits/chosen": -2.0062384605407715,
"logits/rejected": -2.057262420654297,
"logps/chosen": -136.8114013671875,
"logps/rejected": -134.69456481933594,
"loss": 0.7313,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.2975345849990845,
"rewards/margins": 0.03512765094637871,
"rewards/rejected": -0.3326622247695923,
"step": 356
},
{
"epoch": 0.37,
"learning_rate": 3.972423793522352e-05,
"logits/chosen": -2.0485219955444336,
"logits/rejected": -2.085298776626587,
"logps/chosen": -193.29806518554688,
"logps/rejected": -208.20973205566406,
"loss": 0.818,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.6571996212005615,
"rewards/margins": -0.1324111372232437,
"rewards/rejected": -0.524788498878479,
"step": 357
},
{
"epoch": 0.37,
"learning_rate": 3.9650162282919655e-05,
"logits/chosen": -1.9818403720855713,
"logits/rejected": -2.0531868934631348,
"logps/chosen": -158.05345153808594,
"logps/rejected": -157.64669799804688,
"loss": 0.7875,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.27151066064834595,
"rewards/margins": -0.05360978841781616,
"rewards/rejected": -0.21790087223052979,
"step": 358
},
{
"epoch": 0.37,
"learning_rate": 3.957589021902191e-05,
"logits/chosen": -2.1913740634918213,
"logits/rejected": -2.147808790206909,
"logps/chosen": -158.68458557128906,
"logps/rejected": -168.61692810058594,
"loss": 0.8811,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.6251986622810364,
"rewards/margins": -0.24323511123657227,
"rewards/rejected": -0.3819635510444641,
"step": 359
},
{
"epoch": 0.38,
"learning_rate": 3.9501422739279956e-05,
"logits/chosen": -1.9855284690856934,
"logits/rejected": -2.0047171115875244,
"logps/chosen": -158.85598754882812,
"logps/rejected": -186.7561492919922,
"loss": 0.8237,
"rewards/accuracies": 0.1875,
"rewards/chosen": -0.4131236970424652,
"rewards/margins": -0.2267259657382965,
"rewards/rejected": -0.1863977611064911,
"step": 360
},
{
"epoch": 0.38,
"learning_rate": 3.942676084206338e-05,
"logits/chosen": -2.1845693588256836,
"logits/rejected": -2.2711105346679688,
"logps/chosen": -153.8560791015625,
"logps/rejected": -188.9151611328125,
"loss": 0.661,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2779008448123932,
"rewards/margins": 0.1375725418329239,
"rewards/rejected": -0.4154733717441559,
"step": 361
},
{
"epoch": 0.38,
"learning_rate": 3.9351905528348285e-05,
"logits/chosen": -2.063652515411377,
"logits/rejected": -2.1154680252075195,
"logps/chosen": -154.21665954589844,
"logps/rejected": -166.70904541015625,
"loss": 0.6842,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.302326500415802,
"rewards/margins": 0.07578597962856293,
"rewards/rejected": -0.3781124949455261,
"step": 362
},
{
"epoch": 0.38,
"learning_rate": 3.927685780170385e-05,
"logits/chosen": -2.115208625793457,
"logits/rejected": -2.042466878890991,
"logps/chosen": -133.27244567871094,
"logps/rejected": -123.8929214477539,
"loss": 0.667,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.09392361342906952,
"rewards/margins": 0.1019444689154625,
"rewards/rejected": -0.1958681046962738,
"step": 363
},
{
"epoch": 0.38,
"learning_rate": 3.920161866827889e-05,
"logits/chosen": -2.167541980743408,
"logits/rejected": -2.15377140045166,
"logps/chosen": -152.6026611328125,
"logps/rejected": -145.47889709472656,
"loss": 0.723,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.22231236100196838,
"rewards/margins": 0.015478478744626045,
"rewards/rejected": -0.23779082298278809,
"step": 364
},
{
"epoch": 0.38,
"learning_rate": 3.9126189136788416e-05,
"logits/chosen": -2.1280405521392822,
"logits/rejected": -1.9908369779586792,
"logps/chosen": -146.22325134277344,
"logps/rejected": -131.01043701171875,
"loss": 0.6667,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.32886457443237305,
"rewards/margins": 0.1325063705444336,
"rewards/rejected": -0.46137094497680664,
"step": 365
},
{
"epoch": 0.38,
"learning_rate": 3.90505702185e-05,
"logits/chosen": -2.0596060752868652,
"logits/rejected": -1.9969902038574219,
"logps/chosen": -164.17156982421875,
"logps/rejected": -137.92181396484375,
"loss": 0.8594,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.4894620478153229,
"rewards/margins": -0.21743880212306976,
"rewards/rejected": -0.2720232605934143,
"step": 366
},
{
"epoch": 0.38,
"learning_rate": 3.897476292722034e-05,
"logits/chosen": -1.9921385049819946,
"logits/rejected": -2.1082358360290527,
"logps/chosen": -140.42034912109375,
"logps/rejected": -164.16915893554688,
"loss": 0.6899,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.26265978813171387,
"rewards/margins": 0.043181706219911575,
"rewards/rejected": -0.30584150552749634,
"step": 367
},
{
"epoch": 0.38,
"learning_rate": 3.889876827928156e-05,
"logits/chosen": -2.0175795555114746,
"logits/rejected": -2.040492296218872,
"logps/chosen": -152.78158569335938,
"logps/rejected": -151.21533203125,
"loss": 0.595,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.16340675950050354,
"rewards/margins": 0.2702358067035675,
"rewards/rejected": -0.43364256620407104,
"step": 368
},
{
"epoch": 0.38,
"learning_rate": 3.882258729352768e-05,
"logits/chosen": -2.0957415103912354,
"logits/rejected": -2.0559308528900146,
"logps/chosen": -185.1763153076172,
"logps/rejected": -192.24658203125,
"loss": 0.6894,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2837347686290741,
"rewards/margins": 0.07096924632787704,
"rewards/rejected": -0.35470402240753174,
"step": 369
},
{
"epoch": 0.39,
"learning_rate": 3.874622099130087e-05,
"logits/chosen": -1.9157230854034424,
"logits/rejected": -1.9512850046157837,
"logps/chosen": -149.11521911621094,
"logps/rejected": -165.88624572753906,
"loss": 0.6754,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.335940957069397,
"rewards/margins": 0.11438290774822235,
"rewards/rejected": -0.45032384991645813,
"step": 370
},
{
"epoch": 0.39,
"learning_rate": 3.866967039642784e-05,
"logits/chosen": -2.0574257373809814,
"logits/rejected": -2.203120470046997,
"logps/chosen": -158.74758911132812,
"logps/rejected": -173.32127380371094,
"loss": 0.6842,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.40988606214523315,
"rewards/margins": 0.061044152826070786,
"rewards/rejected": -0.4709302484989166,
"step": 371
},
{
"epoch": 0.39,
"learning_rate": 3.859293653520604e-05,
"logits/chosen": -2.053711175918579,
"logits/rejected": -1.980366587638855,
"logps/chosen": -214.1997528076172,
"logps/rejected": -201.98345947265625,
"loss": 0.6636,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2103947103023529,
"rewards/margins": 0.11527465283870697,
"rewards/rejected": -0.3256693482398987,
"step": 372
},
{
"epoch": 0.39,
"learning_rate": 3.851602043638994e-05,
"logits/chosen": -2.012058973312378,
"logits/rejected": -1.9634625911712646,
"logps/chosen": -167.78753662109375,
"logps/rejected": -190.8964385986328,
"loss": 0.6111,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.26867952942848206,
"rewards/margins": 0.29532575607299805,
"rewards/rejected": -0.5640051960945129,
"step": 373
},
{
"epoch": 0.39,
"learning_rate": 3.843892313117724e-05,
"logits/chosen": -2.0894453525543213,
"logits/rejected": -2.1030113697052,
"logps/chosen": -155.4733123779297,
"logps/rejected": -177.70977783203125,
"loss": 0.784,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.658606767654419,
"rewards/margins": -0.06122620403766632,
"rewards/rejected": -0.597380518913269,
"step": 374
},
{
"epoch": 0.39,
"learning_rate": 3.8361645653195026e-05,
"logits/chosen": -2.1214489936828613,
"logits/rejected": -2.1964516639709473,
"logps/chosen": -163.82373046875,
"logps/rejected": -185.65740966796875,
"loss": 0.7088,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.20250454545021057,
"rewards/margins": 0.09702645242214203,
"rewards/rejected": -0.2995309829711914,
"step": 375
},
{
"epoch": 0.39,
"learning_rate": 3.8284189038485936e-05,
"logits/chosen": -2.225022554397583,
"logits/rejected": -2.2104990482330322,
"logps/chosen": -160.45584106445312,
"logps/rejected": -158.27365112304688,
"loss": 0.6533,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3125608265399933,
"rewards/margins": 0.14464329183101654,
"rewards/rejected": -0.45720410346984863,
"step": 376
},
{
"epoch": 0.39,
"learning_rate": 3.8206554325494225e-05,
"logits/chosen": -2.246929168701172,
"logits/rejected": -2.206899881362915,
"logps/chosen": -177.77415466308594,
"logps/rejected": -168.1491241455078,
"loss": 0.6993,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4578195810317993,
"rewards/margins": 0.06638437509536743,
"rewards/rejected": -0.5242039561271667,
"step": 377
},
{
"epoch": 0.39,
"learning_rate": 3.812874255505191e-05,
"logits/chosen": -2.2009379863739014,
"logits/rejected": -2.1823983192443848,
"logps/chosen": -149.4908905029297,
"logps/rejected": -165.87646484375,
"loss": 0.822,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.47621116042137146,
"rewards/margins": -0.1242537796497345,
"rewards/rejected": -0.35195738077163696,
"step": 378
},
{
"epoch": 0.4,
"learning_rate": 3.805075477036476e-05,
"logits/chosen": -2.1507351398468018,
"logits/rejected": -2.098275661468506,
"logps/chosen": -155.75393676757812,
"logps/rejected": -150.32437133789062,
"loss": 0.6496,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.23333942890167236,
"rewards/margins": 0.19434207677841187,
"rewards/rejected": -0.42768150568008423,
"step": 379
},
{
"epoch": 0.4,
"learning_rate": 3.797259201699833e-05,
"logits/chosen": -2.231349468231201,
"logits/rejected": -2.2406768798828125,
"logps/chosen": -165.03302001953125,
"logps/rejected": -160.68557739257812,
"loss": 0.6536,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2146168351173401,
"rewards/margins": 0.12036348879337311,
"rewards/rejected": -0.334980309009552,
"step": 380
},
{
"epoch": 0.4,
"learning_rate": 3.789425534286394e-05,
"logits/chosen": -2.3824687004089355,
"logits/rejected": -2.3478920459747314,
"logps/chosen": -267.89453125,
"logps/rejected": -268.1957702636719,
"loss": 0.8122,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.3535911440849304,
"rewards/margins": -0.1552121490240097,
"rewards/rejected": -0.19837898015975952,
"step": 381
},
{
"epoch": 0.4,
"learning_rate": 3.781574579820464e-05,
"logits/chosen": -2.171052932739258,
"logits/rejected": -2.204153299331665,
"logps/chosen": -226.82186889648438,
"logps/rejected": -241.44273376464844,
"loss": 0.6833,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.4559452533721924,
"rewards/margins": 0.0971999317407608,
"rewards/rejected": -0.553145170211792,
"step": 382
},
{
"epoch": 0.4,
"learning_rate": 3.773706443558111e-05,
"logits/chosen": -2.1312382221221924,
"logits/rejected": -2.159982442855835,
"logps/chosen": -169.75729370117188,
"logps/rejected": -180.92498779296875,
"loss": 0.769,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.4790065586566925,
"rewards/margins": -0.034655213356018066,
"rewards/rejected": -0.44435134530067444,
"step": 383
},
{
"epoch": 0.4,
"learning_rate": 3.765821230985758e-05,
"logits/chosen": -2.1556124687194824,
"logits/rejected": -2.1452436447143555,
"logps/chosen": -205.10719299316406,
"logps/rejected": -178.97406005859375,
"loss": 0.6892,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.4622449278831482,
"rewards/margins": 0.09195668250322342,
"rewards/rejected": -0.554201602935791,
"step": 384
},
{
"epoch": 0.4,
"learning_rate": 3.75791904781876e-05,
"logits/chosen": -2.2256433963775635,
"logits/rejected": -2.1840739250183105,
"logps/chosen": -173.39869689941406,
"logps/rejected": -180.0209503173828,
"loss": 0.7169,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.41330060362815857,
"rewards/margins": 0.014306016266345978,
"rewards/rejected": -0.42760664224624634,
"step": 385
},
{
"epoch": 0.4,
"learning_rate": 3.7500000000000003e-05,
"logits/chosen": -2.2018544673919678,
"logits/rejected": -2.175457000732422,
"logps/chosen": -141.57928466796875,
"logps/rejected": -142.81686401367188,
"loss": 0.6963,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.1722419261932373,
"rewards/margins": 0.04862082004547119,
"rewards/rejected": -0.2208627462387085,
"step": 386
},
{
"epoch": 0.4,
"learning_rate": 3.74206419369846e-05,
"logits/chosen": -2.224078893661499,
"logits/rejected": -2.2861950397491455,
"logps/chosen": -193.62677001953125,
"logps/rejected": -196.46714782714844,
"loss": 0.8549,
"rewards/accuracies": 0.125,
"rewards/chosen": -0.5895551443099976,
"rewards/margins": -0.2451256364583969,
"rewards/rejected": -0.34442949295043945,
"step": 387
},
{
"epoch": 0.4,
"learning_rate": 3.7341117353077966e-05,
"logits/chosen": -2.3696727752685547,
"logits/rejected": -2.3207218647003174,
"logps/chosen": -237.3257598876953,
"logps/rejected": -208.5120391845703,
"loss": 0.6565,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.47394251823425293,
"rewards/margins": 0.12043605744838715,
"rewards/rejected": -0.5943784713745117,
"step": 388
},
{
"epoch": 0.41,
"learning_rate": 3.726142731444921e-05,
"logits/chosen": -2.1822972297668457,
"logits/rejected": -2.2831835746765137,
"logps/chosen": -150.12652587890625,
"logps/rejected": -144.74551391601562,
"loss": 0.7884,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.33526331186294556,
"rewards/margins": -0.13741034269332886,
"rewards/rejected": -0.1978529691696167,
"step": 389
},
{
"epoch": 0.41,
"learning_rate": 3.718157288948563e-05,
"logits/chosen": -2.2395238876342773,
"logits/rejected": -2.2703099250793457,
"logps/chosen": -177.6690216064453,
"logps/rejected": -185.6038818359375,
"loss": 0.5714,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.45524442195892334,
"rewards/margins": 0.34756022691726685,
"rewards/rejected": -0.8028046488761902,
"step": 390
},
{
"epoch": 0.41,
"learning_rate": 3.710155514877844e-05,
"logits/chosen": -2.2453153133392334,
"logits/rejected": -2.2400312423706055,
"logps/chosen": -161.18931579589844,
"logps/rejected": -155.0644073486328,
"loss": 0.9213,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.633940577507019,
"rewards/margins": -0.3272451162338257,
"rewards/rejected": -0.30669546127319336,
"step": 391
},
{
"epoch": 0.41,
"learning_rate": 3.702137516510838e-05,
"logits/chosen": -2.1709861755371094,
"logits/rejected": -2.1527490615844727,
"logps/chosen": -149.6127166748047,
"logps/rejected": -135.25987243652344,
"loss": 0.6775,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.31984999775886536,
"rewards/margins": 0.08133503049612045,
"rewards/rejected": -0.4011850357055664,
"step": 392
},
{
"epoch": 0.41,
"learning_rate": 3.694103401343136e-05,
"logits/chosen": -2.3013548851013184,
"logits/rejected": -2.2986433506011963,
"logps/chosen": -165.6312255859375,
"logps/rejected": -174.7577362060547,
"loss": 0.7364,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5793277025222778,
"rewards/margins": -0.015010036528110504,
"rewards/rejected": -0.5643177032470703,
"step": 393
},
{
"epoch": 0.41,
"learning_rate": 3.686053277086401e-05,
"logits/chosen": -2.1550047397613525,
"logits/rejected": -2.246464252471924,
"logps/chosen": -147.44175720214844,
"logps/rejected": -154.8894805908203,
"loss": 0.7561,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.3496069014072418,
"rewards/margins": -0.07592416554689407,
"rewards/rejected": -0.27368271350860596,
"step": 394
},
{
"epoch": 0.41,
"learning_rate": 3.6779872516669295e-05,
"logits/chosen": -2.1460325717926025,
"logits/rejected": -2.154590129852295,
"logps/chosen": -151.6021728515625,
"logps/rejected": -168.52456665039062,
"loss": 0.5368,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.2864820659160614,
"rewards/margins": 0.4405989944934845,
"rewards/rejected": -0.7270810008049011,
"step": 395
},
{
"epoch": 0.41,
"learning_rate": 3.669905433224199e-05,
"logits/chosen": -2.315129041671753,
"logits/rejected": -2.4050261974334717,
"logps/chosen": -146.9856414794922,
"logps/rejected": -169.13397216796875,
"loss": 0.7616,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.2588258385658264,
"rewards/margins": -0.059488385915756226,
"rewards/rejected": -0.199337437748909,
"step": 396
},
{
"epoch": 0.41,
"learning_rate": 3.6618079301094216e-05,
"logits/chosen": -2.233609199523926,
"logits/rejected": -2.2259573936462402,
"logps/chosen": -179.19204711914062,
"logps/rejected": -178.01219177246094,
"loss": 0.5825,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2573452591896057,
"rewards/margins": 0.3134271800518036,
"rewards/rejected": -0.5707724094390869,
"step": 397
},
{
"epoch": 0.42,
"learning_rate": 3.653694850884091e-05,
"logits/chosen": -2.2443690299987793,
"logits/rejected": -2.3436453342437744,
"logps/chosen": -141.96392822265625,
"logps/rejected": -165.8162384033203,
"loss": 0.6064,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1481434404850006,
"rewards/margins": 0.25018060207366943,
"rewards/rejected": -0.39832407236099243,
"step": 398
},
{
"epoch": 0.42,
"learning_rate": 3.645566304318526e-05,
"logits/chosen": -2.251343250274658,
"logits/rejected": -2.2624480724334717,
"logps/chosen": -199.05337524414062,
"logps/rejected": -199.2899627685547,
"loss": 0.6354,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.46376729011535645,
"rewards/margins": 0.15301668643951416,
"rewards/rejected": -0.6167839765548706,
"step": 399
},
{
"epoch": 0.42,
"learning_rate": 3.637422399390413e-05,
"logits/chosen": -2.309321165084839,
"logits/rejected": -2.249835252761841,
"logps/chosen": -187.30145263671875,
"logps/rejected": -175.0135498046875,
"loss": 0.8022,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6318432688713074,
"rewards/margins": -0.13741618394851685,
"rewards/rejected": -0.49442705512046814,
"step": 400
},
{
"epoch": 0.42,
"learning_rate": 3.6292632452833436e-05,
"logits/chosen": -2.149308681488037,
"logits/rejected": -2.1867353916168213,
"logps/chosen": -157.0370635986328,
"logps/rejected": -179.92384338378906,
"loss": 0.6469,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.2832821309566498,
"rewards/margins": 0.17430852353572845,
"rewards/rejected": -0.45759066939353943,
"step": 401
},
{
"epoch": 0.42,
"learning_rate": 3.621088951385353e-05,
"logits/chosen": -2.430102825164795,
"logits/rejected": -2.4103968143463135,
"logps/chosen": -174.2300262451172,
"logps/rejected": -194.47872924804688,
"loss": 0.6782,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6608787178993225,
"rewards/margins": 0.14938460290431976,
"rewards/rejected": -0.8102633953094482,
"step": 402
},
{
"epoch": 0.42,
"learning_rate": 3.612899627287452e-05,
"logits/chosen": -2.381316661834717,
"logits/rejected": -2.4797677993774414,
"logps/chosen": -183.57020568847656,
"logps/rejected": -211.94908142089844,
"loss": 0.7676,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6477088332176208,
"rewards/margins": 0.04121372848749161,
"rewards/rejected": -0.6889225840568542,
"step": 403
},
{
"epoch": 0.42,
"learning_rate": 3.604695382782159e-05,
"logits/chosen": -2.2722840309143066,
"logits/rejected": -2.2684082984924316,
"logps/chosen": -157.826171875,
"logps/rejected": -155.97283935546875,
"loss": 0.7743,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.18509769439697266,
"rewards/margins": -0.07210510969161987,
"rewards/rejected": -0.11299259960651398,
"step": 404
},
{
"epoch": 0.42,
"learning_rate": 3.596476327862024e-05,
"logits/chosen": -2.128013849258423,
"logits/rejected": -2.2023983001708984,
"logps/chosen": -194.61569213867188,
"logps/rejected": -211.86927795410156,
"loss": 0.6399,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.36270129680633545,
"rewards/margins": 0.2554909288883209,
"rewards/rejected": -0.618192195892334,
"step": 405
},
{
"epoch": 0.42,
"learning_rate": 3.588242572718162e-05,
"logits/chosen": -2.377016305923462,
"logits/rejected": -2.2582497596740723,
"logps/chosen": -161.89572143554688,
"logps/rejected": -164.3067169189453,
"loss": 0.744,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.38204970955848694,
"rewards/margins": -0.01100611686706543,
"rewards/rejected": -0.3710435926914215,
"step": 406
},
{
"epoch": 0.42,
"learning_rate": 3.579994227738767e-05,
"logits/chosen": -2.208984851837158,
"logits/rejected": -2.288970708847046,
"logps/chosen": -191.62936401367188,
"logps/rejected": -223.2491455078125,
"loss": 0.6478,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.34289631247520447,
"rewards/margins": 0.17745056748390198,
"rewards/rejected": -0.5203468799591064,
"step": 407
},
{
"epoch": 0.43,
"learning_rate": 3.5717314035076355e-05,
"logits/chosen": -2.2903645038604736,
"logits/rejected": -2.224257469177246,
"logps/chosen": -173.94061279296875,
"logps/rejected": -183.39857482910156,
"loss": 0.9867,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8537546396255493,
"rewards/margins": -0.33834022283554077,
"rewards/rejected": -0.5154143571853638,
"step": 408
},
{
"epoch": 0.43,
"learning_rate": 3.5634542108026876e-05,
"logits/chosen": -2.2586324214935303,
"logits/rejected": -2.333674192428589,
"logps/chosen": -128.76527404785156,
"logps/rejected": -138.5326690673828,
"loss": 0.6894,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3050040900707245,
"rewards/margins": 0.024737656116485596,
"rewards/rejected": -0.32974177598953247,
"step": 409
},
{
"epoch": 0.43,
"learning_rate": 3.5551627605944745e-05,
"logits/chosen": -2.2455837726593018,
"logits/rejected": -2.2226006984710693,
"logps/chosen": -165.0658416748047,
"logps/rejected": -158.5568084716797,
"loss": 0.7748,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.47917866706848145,
"rewards/margins": -0.09431587904691696,
"rewards/rejected": -0.3848627507686615,
"step": 410
},
{
"epoch": 0.43,
"learning_rate": 3.5468571640446994e-05,
"logits/chosen": -2.220954179763794,
"logits/rejected": -2.2065913677215576,
"logps/chosen": -155.57606506347656,
"logps/rejected": -196.17453002929688,
"loss": 0.681,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.43619242310523987,
"rewards/margins": 0.16942578554153442,
"rewards/rejected": -0.6056181788444519,
"step": 411
},
{
"epoch": 0.43,
"learning_rate": 3.5385375325047166e-05,
"logits/chosen": -2.28615665435791,
"logits/rejected": -2.3207404613494873,
"logps/chosen": -139.8089599609375,
"logps/rejected": -146.37413024902344,
"loss": 0.5752,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.2894131541252136,
"rewards/margins": 0.32500603795051575,
"rewards/rejected": -0.6144192218780518,
"step": 412
},
{
"epoch": 0.43,
"learning_rate": 3.5302039775140486e-05,
"logits/chosen": -2.223402500152588,
"logits/rejected": -2.2294511795043945,
"logps/chosen": -192.4325714111328,
"logps/rejected": -195.81764221191406,
"loss": 0.6225,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.422715961933136,
"rewards/margins": 0.23224471509456635,
"rewards/rejected": -0.654960572719574,
"step": 413
},
{
"epoch": 0.43,
"learning_rate": 3.521856610798887e-05,
"logits/chosen": -2.1355066299438477,
"logits/rejected": -2.2011489868164062,
"logps/chosen": -186.72837829589844,
"logps/rejected": -198.95672607421875,
"loss": 0.7121,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8084388971328735,
"rewards/margins": 0.06887459009885788,
"rewards/rejected": -0.8773134350776672,
"step": 414
},
{
"epoch": 0.43,
"learning_rate": 3.513495544270592e-05,
"logits/chosen": -2.2741241455078125,
"logits/rejected": -2.2826316356658936,
"logps/chosen": -167.8680877685547,
"logps/rejected": -167.94290161132812,
"loss": 0.7632,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.5057553648948669,
"rewards/margins": -0.06321151554584503,
"rewards/rejected": -0.44254380464553833,
"step": 415
},
{
"epoch": 0.43,
"learning_rate": 3.505120890024195e-05,
"logits/chosen": -2.2100603580474854,
"logits/rejected": -2.1715095043182373,
"logps/chosen": -178.6398162841797,
"logps/rejected": -199.203857421875,
"loss": 0.7779,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6270021796226501,
"rewards/margins": -0.08634312450885773,
"rewards/rejected": -0.5406590700149536,
"step": 416
},
{
"epoch": 0.43,
"learning_rate": 3.496732760336895e-05,
"logits/chosen": -2.3388140201568604,
"logits/rejected": -2.3569979667663574,
"logps/chosen": -183.85336303710938,
"logps/rejected": -176.68597412109375,
"loss": 0.6665,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.38947370648384094,
"rewards/margins": 0.12428087741136551,
"rewards/rejected": -0.5137546062469482,
"step": 417
},
{
"epoch": 0.44,
"learning_rate": 3.4883312676665536e-05,
"logits/chosen": -2.0799946784973145,
"logits/rejected": -2.127676248550415,
"logps/chosen": -157.06455993652344,
"logps/rejected": -196.19081115722656,
"loss": 0.6533,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.38800424337387085,
"rewards/margins": 0.2187524437904358,
"rewards/rejected": -0.6067566275596619,
"step": 418
},
{
"epoch": 0.44,
"learning_rate": 3.479916524650188e-05,
"logits/chosen": -2.2445905208587646,
"logits/rejected": -2.2511062622070312,
"logps/chosen": -191.49032592773438,
"logps/rejected": -211.65402221679688,
"loss": 0.6537,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4725024104118347,
"rewards/margins": 0.14553888142108917,
"rewards/rejected": -0.6180413365364075,
"step": 419
},
{
"epoch": 0.44,
"learning_rate": 3.4714886441024574e-05,
"logits/chosen": -2.3066306114196777,
"logits/rejected": -2.295111894607544,
"logps/chosen": -166.61192321777344,
"logps/rejected": -183.85430908203125,
"loss": 0.7243,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6095054149627686,
"rewards/margins": 0.045834727585315704,
"rewards/rejected": -0.6553401350975037,
"step": 420
},
{
"epoch": 0.44,
"learning_rate": 3.4630477390141556e-05,
"logits/chosen": -2.0795845985412598,
"logits/rejected": -2.0988335609436035,
"logps/chosen": -176.13385009765625,
"logps/rejected": -162.16116333007812,
"loss": 0.8325,
"rewards/accuracies": 0.1875,
"rewards/chosen": -0.6280168890953064,
"rewards/margins": -0.21699213981628418,
"rewards/rejected": -0.41102465987205505,
"step": 421
},
{
"epoch": 0.44,
"learning_rate": 3.4545939225506934e-05,
"logits/chosen": -2.2829484939575195,
"logits/rejected": -2.369950294494629,
"logps/chosen": -120.27825164794922,
"logps/rejected": -135.86590576171875,
"loss": 0.6277,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3205239772796631,
"rewards/margins": 0.2128859907388687,
"rewards/rejected": -0.5334099531173706,
"step": 422
},
{
"epoch": 0.44,
"learning_rate": 3.4461273080505793e-05,
"logits/chosen": -2.227790117263794,
"logits/rejected": -2.3280563354492188,
"logps/chosen": -166.44442749023438,
"logps/rejected": -190.11199951171875,
"loss": 0.658,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6263283491134644,
"rewards/margins": 0.1760719120502472,
"rewards/rejected": -0.8024002909660339,
"step": 423
},
{
"epoch": 0.44,
"learning_rate": 3.437648009023905e-05,
"logits/chosen": -2.2951772212982178,
"logits/rejected": -2.2783102989196777,
"logps/chosen": -168.9396209716797,
"logps/rejected": -161.99826049804688,
"loss": 0.7575,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.4612702429294586,
"rewards/margins": -0.09408943355083466,
"rewards/rejected": -0.36718082427978516,
"step": 424
},
{
"epoch": 0.44,
"learning_rate": 3.4291561391508185e-05,
"logits/chosen": -2.233304023742676,
"logits/rejected": -2.1193814277648926,
"logps/chosen": -191.5747833251953,
"logps/rejected": -198.6702423095703,
"loss": 0.6742,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6324459910392761,
"rewards/margins": 0.13748225569725037,
"rewards/rejected": -0.7699282169342041,
"step": 425
},
{
"epoch": 0.44,
"learning_rate": 3.420651812280006e-05,
"logits/chosen": -2.0054640769958496,
"logits/rejected": -2.0682601928710938,
"logps/chosen": -179.95880126953125,
"logps/rejected": -178.59805297851562,
"loss": 0.7556,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5854605436325073,
"rewards/margins": -0.06146989390254021,
"rewards/rejected": -0.5239906311035156,
"step": 426
},
{
"epoch": 0.45,
"learning_rate": 3.4121351424271594e-05,
"logits/chosen": -2.220736026763916,
"logits/rejected": -2.229501247406006,
"logps/chosen": -161.79852294921875,
"logps/rejected": -150.96261596679688,
"loss": 0.634,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5760630369186401,
"rewards/margins": 0.19034941494464874,
"rewards/rejected": -0.7664124965667725,
"step": 427
},
{
"epoch": 0.45,
"learning_rate": 3.4036062437734484e-05,
"logits/chosen": -2.084941864013672,
"logits/rejected": -2.1283769607543945,
"logps/chosen": -138.01251220703125,
"logps/rejected": -141.75653076171875,
"loss": 0.6951,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5684069991111755,
"rewards/margins": 0.06860056519508362,
"rewards/rejected": -0.6370075345039368,
"step": 428
},
{
"epoch": 0.45,
"learning_rate": 3.395065230663996e-05,
"logits/chosen": -2.356782913208008,
"logits/rejected": -2.3323380947113037,
"logps/chosen": -164.42636108398438,
"logps/rejected": -157.51885986328125,
"loss": 0.8111,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.5730559825897217,
"rewards/margins": -0.158206045627594,
"rewards/rejected": -0.4148499667644501,
"step": 429
},
{
"epoch": 0.45,
"learning_rate": 3.386512217606339e-05,
"logits/chosen": -2.306445837020874,
"logits/rejected": -2.305457353591919,
"logps/chosen": -177.36483764648438,
"logps/rejected": -180.41497802734375,
"loss": 0.7929,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6710464358329773,
"rewards/margins": -0.10232071578502655,
"rewards/rejected": -0.5687257647514343,
"step": 430
},
{
"epoch": 0.45,
"learning_rate": 3.3779473192688954e-05,
"logits/chosen": -2.2322001457214355,
"logits/rejected": -2.2678284645080566,
"logps/chosen": -180.5767822265625,
"logps/rejected": -215.00439453125,
"loss": 0.6272,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7634757161140442,
"rewards/margins": 0.3175090253353119,
"rewards/rejected": -1.0809847116470337,
"step": 431
},
{
"epoch": 0.45,
"learning_rate": 3.369370650479425e-05,
"logits/chosen": -2.3506946563720703,
"logits/rejected": -2.272690534591675,
"logps/chosen": -191.31764221191406,
"logps/rejected": -167.90931701660156,
"loss": 0.6944,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6492268443107605,
"rewards/margins": 0.056166499853134155,
"rewards/rejected": -0.7053933143615723,
"step": 432
},
{
"epoch": 0.45,
"learning_rate": 3.360782326223493e-05,
"logits/chosen": -2.21726131439209,
"logits/rejected": -2.1750893592834473,
"logps/chosen": -130.21981811523438,
"logps/rejected": -122.38761138916016,
"loss": 0.7093,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6112926006317139,
"rewards/margins": 0.05839107558131218,
"rewards/rejected": -0.6696836948394775,
"step": 433
},
{
"epoch": 0.45,
"learning_rate": 3.3521824616429285e-05,
"logits/chosen": -2.276099681854248,
"logits/rejected": -2.3207895755767822,
"logps/chosen": -152.9349822998047,
"logps/rejected": -176.16763305664062,
"loss": 0.6335,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6175845861434937,
"rewards/margins": 0.19425898790359497,
"rewards/rejected": -0.8118435740470886,
"step": 434
},
{
"epoch": 0.45,
"learning_rate": 3.3435711720342764e-05,
"logits/chosen": -2.3540244102478027,
"logits/rejected": -2.4207704067230225,
"logps/chosen": -162.0496063232422,
"logps/rejected": -180.61257934570312,
"loss": 0.6201,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7302818298339844,
"rewards/margins": 0.24378572404384613,
"rewards/rejected": -0.9740675687789917,
"step": 435
},
{
"epoch": 0.45,
"learning_rate": 3.3349485728472535e-05,
"logits/chosen": -2.2981767654418945,
"logits/rejected": -2.403442144393921,
"logps/chosen": -169.84153747558594,
"logps/rejected": -196.2303466796875,
"loss": 0.6202,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.631123960018158,
"rewards/margins": 0.2665478587150574,
"rewards/rejected": -0.8976718187332153,
"step": 436
},
{
"epoch": 0.46,
"learning_rate": 3.326314779683207e-05,
"logits/chosen": -2.45729923248291,
"logits/rejected": -2.3062028884887695,
"logps/chosen": -180.6869354248047,
"logps/rejected": -158.7431640625,
"loss": 0.8641,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.7027081251144409,
"rewards/margins": -0.22353459894657135,
"rewards/rejected": -0.47917354106903076,
"step": 437
},
{
"epoch": 0.46,
"learning_rate": 3.3176699082935545e-05,
"logits/chosen": -2.310640811920166,
"logits/rejected": -2.3036937713623047,
"logps/chosen": -186.8025665283203,
"logps/rejected": -198.4528045654297,
"loss": 0.6925,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8213132619857788,
"rewards/margins": 0.07846779376268387,
"rewards/rejected": -0.8997809886932373,
"step": 438
},
{
"epoch": 0.46,
"learning_rate": 3.3090140745782396e-05,
"logits/chosen": -2.3146181106567383,
"logits/rejected": -2.318394660949707,
"logps/chosen": -207.11068725585938,
"logps/rejected": -205.5038604736328,
"loss": 0.6093,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6563000679016113,
"rewards/margins": 0.33885622024536133,
"rewards/rejected": -0.9951564073562622,
"step": 439
},
{
"epoch": 0.46,
"learning_rate": 3.300347394584172e-05,
"logits/chosen": -2.4188132286071777,
"logits/rejected": -2.4725587368011475,
"logps/chosen": -156.28175354003906,
"logps/rejected": -178.71530151367188,
"loss": 0.7442,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6938459277153015,
"rewards/margins": 0.030388107523322105,
"rewards/rejected": -0.7242341041564941,
"step": 440
},
{
"epoch": 0.46,
"learning_rate": 3.2916699845036816e-05,
"logits/chosen": -2.326338768005371,
"logits/rejected": -2.3983230590820312,
"logps/chosen": -116.61759948730469,
"logps/rejected": -127.69525909423828,
"loss": 0.6664,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6837319135665894,
"rewards/margins": 0.17259922623634338,
"rewards/rejected": -0.8563311696052551,
"step": 441
},
{
"epoch": 0.46,
"learning_rate": 3.282981960672948e-05,
"logits/chosen": -2.239466905593872,
"logits/rejected": -2.3111233711242676,
"logps/chosen": -159.78004455566406,
"logps/rejected": -177.1030731201172,
"loss": 0.6711,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7229784727096558,
"rewards/margins": 0.17383922636508942,
"rewards/rejected": -0.896817684173584,
"step": 442
},
{
"epoch": 0.46,
"learning_rate": 3.2742834395704486e-05,
"logits/chosen": -2.203927516937256,
"logits/rejected": -2.1760244369506836,
"logps/chosen": -133.5518341064453,
"logps/rejected": -157.52099609375,
"loss": 0.7027,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.71323561668396,
"rewards/margins": 0.07185419648885727,
"rewards/rejected": -0.7850897908210754,
"step": 443
},
{
"epoch": 0.46,
"learning_rate": 3.265574537815398e-05,
"logits/chosen": -2.1297600269317627,
"logits/rejected": -2.0856151580810547,
"logps/chosen": -149.8739776611328,
"logps/rejected": -155.45904541015625,
"loss": 0.7246,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7274818420410156,
"rewards/margins": 0.02891545556485653,
"rewards/rejected": -0.7563972473144531,
"step": 444
},
{
"epoch": 0.46,
"learning_rate": 3.25685537216618e-05,
"logits/chosen": -2.446577787399292,
"logits/rejected": -2.4465479850769043,
"logps/chosen": -205.2270050048828,
"logps/rejected": -194.4155731201172,
"loss": 0.7937,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9242798686027527,
"rewards/margins": -0.04263466224074364,
"rewards/rejected": -0.8816452622413635,
"step": 445
},
{
"epoch": 0.47,
"learning_rate": 3.248126059518785e-05,
"logits/chosen": -2.11894154548645,
"logits/rejected": -2.1701767444610596,
"logps/chosen": -146.13601684570312,
"logps/rejected": -163.9047088623047,
"loss": 0.6358,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5888944864273071,
"rewards/margins": 0.164439395070076,
"rewards/rejected": -0.7533338665962219,
"step": 446
},
{
"epoch": 0.47,
"learning_rate": 3.2393867169052385e-05,
"logits/chosen": -2.250922203063965,
"logits/rejected": -2.2642922401428223,
"logps/chosen": -222.06298828125,
"logps/rejected": -232.71881103515625,
"loss": 0.8317,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9078823924064636,
"rewards/margins": -0.15829764306545258,
"rewards/rejected": -0.7495847344398499,
"step": 447
},
{
"epoch": 0.47,
"learning_rate": 3.230637461492043e-05,
"logits/chosen": -2.254838228225708,
"logits/rejected": -2.2903223037719727,
"logps/chosen": -182.53579711914062,
"logps/rejected": -193.14736938476562,
"loss": 0.6348,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.626493513584137,
"rewards/margins": 0.2684532105922699,
"rewards/rejected": -0.8949467539787292,
"step": 448
},
{
"epoch": 0.47,
"learning_rate": 3.221878410578593e-05,
"logits/chosen": -2.258246898651123,
"logits/rejected": -2.1956984996795654,
"logps/chosen": -210.88497924804688,
"logps/rejected": -204.88185119628906,
"loss": 0.8482,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.5795395374298096,
"rewards/margins": -0.15963196754455566,
"rewards/rejected": -0.4199075698852539,
"step": 449
},
{
"epoch": 0.47,
"learning_rate": 3.213109681595612e-05,
"logits/chosen": -2.3528032302856445,
"logits/rejected": -2.296219825744629,
"logps/chosen": -207.69216918945312,
"logps/rejected": -225.57296752929688,
"loss": 0.7991,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9065631031990051,
"rewards/margins": -0.03170624002814293,
"rewards/rejected": -0.8748568892478943,
"step": 450
},
{
"epoch": 0.47,
"learning_rate": 3.2043313921035743e-05,
"logits/chosen": -2.1952693462371826,
"logits/rejected": -2.2114603519439697,
"logps/chosen": -204.89718627929688,
"logps/rejected": -201.62734985351562,
"loss": 0.9834,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.2711020708084106,
"rewards/margins": -0.4141416847705841,
"rewards/rejected": -0.8569603562355042,
"step": 451
},
{
"epoch": 0.47,
"learning_rate": 3.195543659791132e-05,
"logits/chosen": -2.1842381954193115,
"logits/rejected": -2.2123162746429443,
"logps/chosen": -169.44754028320312,
"logps/rejected": -167.3434295654297,
"loss": 0.64,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6639611124992371,
"rewards/margins": 0.15555183589458466,
"rewards/rejected": -0.8195129632949829,
"step": 452
},
{
"epoch": 0.47,
"learning_rate": 3.186746602473533e-05,
"logits/chosen": -2.1493401527404785,
"logits/rejected": -2.1739141941070557,
"logps/chosen": -149.2794189453125,
"logps/rejected": -154.29624938964844,
"loss": 0.8507,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6466760039329529,
"rewards/margins": -0.20036213099956512,
"rewards/rejected": -0.44631391763687134,
"step": 453
},
{
"epoch": 0.47,
"learning_rate": 3.177940338091043e-05,
"logits/chosen": -2.2300286293029785,
"logits/rejected": -2.3122761249542236,
"logps/chosen": -191.32516479492188,
"logps/rejected": -194.84938049316406,
"loss": 0.7401,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7524707317352295,
"rewards/margins": 0.010163038969039917,
"rewards/rejected": -0.7626338005065918,
"step": 454
},
{
"epoch": 0.47,
"learning_rate": 3.169124984707367e-05,
"logits/chosen": -2.2368390560150146,
"logits/rejected": -2.29437255859375,
"logps/chosen": -163.08969116210938,
"logps/rejected": -171.7424774169922,
"loss": 0.8942,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.9385874271392822,
"rewards/margins": -0.3221665024757385,
"rewards/rejected": -0.6164208650588989,
"step": 455
},
{
"epoch": 0.48,
"learning_rate": 3.160300660508064e-05,
"logits/chosen": -2.2047293186187744,
"logits/rejected": -2.1667182445526123,
"logps/chosen": -156.24505615234375,
"logps/rejected": -150.25613403320312,
"loss": 0.7661,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5315223932266235,
"rewards/margins": -0.031079813838005066,
"rewards/rejected": -0.500442624092102,
"step": 456
},
{
"epoch": 0.48,
"learning_rate": 3.151467483798961e-05,
"logits/chosen": -2.2086293697357178,
"logits/rejected": -2.196566581726074,
"logps/chosen": -163.7198944091797,
"logps/rejected": -154.07669067382812,
"loss": 0.7642,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7637531757354736,
"rewards/margins": -0.03873248025774956,
"rewards/rejected": -0.7250206470489502,
"step": 457
},
{
"epoch": 0.48,
"learning_rate": 3.14262557300457e-05,
"logits/chosen": -2.1346004009246826,
"logits/rejected": -2.2624478340148926,
"logps/chosen": -157.80322265625,
"logps/rejected": -193.9817657470703,
"loss": 0.6025,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5126041173934937,
"rewards/margins": 0.3903144598007202,
"rewards/rejected": -0.9029185175895691,
"step": 458
},
{
"epoch": 0.48,
"learning_rate": 3.1337750466665e-05,
"logits/chosen": -2.120087146759033,
"logits/rejected": -2.164226770401001,
"logps/chosen": -189.44192504882812,
"logps/rejected": -220.5596466064453,
"loss": 0.7477,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8327434659004211,
"rewards/margins": -0.02192605845630169,
"rewards/rejected": -0.8108173608779907,
"step": 459
},
{
"epoch": 0.48,
"learning_rate": 3.124916023441865e-05,
"logits/chosen": -2.2006072998046875,
"logits/rejected": -2.1659958362579346,
"logps/chosen": -182.32632446289062,
"logps/rejected": -194.20724487304688,
"loss": 0.8611,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.8323721885681152,
"rewards/margins": -0.26599258184432983,
"rewards/rejected": -0.5663796067237854,
"step": 460
},
{
"epoch": 0.48,
"learning_rate": 3.116048622101694e-05,
"logits/chosen": -2.143481969833374,
"logits/rejected": -2.1845016479492188,
"logps/chosen": -165.87046813964844,
"logps/rejected": -171.90936279296875,
"loss": 0.7098,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.8616752028465271,
"rewards/margins": 0.11464538425207138,
"rewards/rejected": -0.9763206243515015,
"step": 461
},
{
"epoch": 0.48,
"learning_rate": 3.107172961529343e-05,
"logits/chosen": -2.1274116039276123,
"logits/rejected": -2.162541389465332,
"logps/chosen": -158.4412841796875,
"logps/rejected": -173.54653930664062,
"loss": 0.7462,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7143791317939758,
"rewards/margins": 0.004445172846317291,
"rewards/rejected": -0.7188242673873901,
"step": 462
},
{
"epoch": 0.48,
"learning_rate": 3.098289160718895e-05,
"logits/chosen": -2.1465463638305664,
"logits/rejected": -2.1098814010620117,
"logps/chosen": -130.60450744628906,
"logps/rejected": -149.80252075195312,
"loss": 0.6369,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4852756857872009,
"rewards/margins": 0.2071731984615326,
"rewards/rejected": -0.6924489140510559,
"step": 463
},
{
"epoch": 0.48,
"learning_rate": 3.0893973387735687e-05,
"logits/chosen": -2.323080539703369,
"logits/rejected": -2.2071361541748047,
"logps/chosen": -180.71392822265625,
"logps/rejected": -165.3758544921875,
"loss": 0.9189,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.8797387480735779,
"rewards/margins": -0.3374296724796295,
"rewards/rejected": -0.5423091053962708,
"step": 464
},
{
"epoch": 0.49,
"learning_rate": 3.0804976149041195e-05,
"logits/chosen": -2.3689966201782227,
"logits/rejected": -2.432495355606079,
"logps/chosen": -183.48805236816406,
"logps/rejected": -180.59786987304688,
"loss": 0.6557,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7070946097373962,
"rewards/margins": 0.12962420284748077,
"rewards/rejected": -0.836718738079071,
"step": 465
},
{
"epoch": 0.49,
"learning_rate": 3.071590108427244e-05,
"logits/chosen": -2.2095448970794678,
"logits/rejected": -2.22792387008667,
"logps/chosen": -194.24359130859375,
"logps/rejected": -181.46434020996094,
"loss": 0.5084,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.46445751190185547,
"rewards/margins": 0.5187379121780396,
"rewards/rejected": -0.9831954836845398,
"step": 466
},
{
"epoch": 0.49,
"learning_rate": 3.062674938763976e-05,
"logits/chosen": -2.224792718887329,
"logits/rejected": -2.276299476623535,
"logps/chosen": -151.77529907226562,
"logps/rejected": -171.2163543701172,
"loss": 0.589,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.49393290281295776,
"rewards/margins": 0.3881426751613617,
"rewards/rejected": -0.8820755481719971,
"step": 467
},
{
"epoch": 0.49,
"learning_rate": 3.0537522254380905e-05,
"logits/chosen": -2.327399730682373,
"logits/rejected": -2.2717721462249756,
"logps/chosen": -178.17420959472656,
"logps/rejected": -183.68106079101562,
"loss": 0.7317,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3572332262992859,
"rewards/margins": 0.004655532538890839,
"rewards/rejected": -0.3618887662887573,
"step": 468
},
{
"epoch": 0.49,
"learning_rate": 3.044822088074496e-05,
"logits/chosen": -2.150599479675293,
"logits/rejected": -2.1766562461853027,
"logps/chosen": -150.70323181152344,
"logps/rejected": -175.47964477539062,
"loss": 0.6856,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5796679258346558,
"rewards/margins": 0.14709994196891785,
"rewards/rejected": -0.726767897605896,
"step": 469
},
{
"epoch": 0.49,
"learning_rate": 3.0358846463976372e-05,
"logits/chosen": -2.2366018295288086,
"logits/rejected": -2.338874578475952,
"logps/chosen": -192.69740295410156,
"logps/rejected": -190.33204650878906,
"loss": 0.6836,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.48144611716270447,
"rewards/margins": 0.07051944732666016,
"rewards/rejected": -0.551965594291687,
"step": 470
},
{
"epoch": 0.49,
"learning_rate": 3.026940020229882e-05,
"logits/chosen": -2.133188247680664,
"logits/rejected": -2.177133798599243,
"logps/chosen": -150.59495544433594,
"logps/rejected": -149.1016845703125,
"loss": 0.818,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.7209250926971436,
"rewards/margins": -0.17671580612659454,
"rewards/rejected": -0.5442093014717102,
"step": 471
},
{
"epoch": 0.49,
"learning_rate": 3.017988329489923e-05,
"logits/chosen": -2.2492454051971436,
"logits/rejected": -2.2075250148773193,
"logps/chosen": -218.95291137695312,
"logps/rejected": -213.46139526367188,
"loss": 0.7839,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6977720856666565,
"rewards/margins": -0.08655133843421936,
"rewards/rejected": -0.6112207770347595,
"step": 472
},
{
"epoch": 0.49,
"learning_rate": 3.0090296941911633e-05,
"logits/chosen": -2.1852970123291016,
"logits/rejected": -2.1652181148529053,
"logps/chosen": -196.5089874267578,
"logps/rejected": -201.77569580078125,
"loss": 0.7244,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5932771563529968,
"rewards/margins": -0.0026064813137054443,
"rewards/rejected": -0.5906707644462585,
"step": 473
},
{
"epoch": 0.49,
"learning_rate": 3.0000642344401113e-05,
"logits/chosen": -2.115180015563965,
"logits/rejected": -2.0559911727905273,
"logps/chosen": -157.2303924560547,
"logps/rejected": -145.6020050048828,
"loss": 0.7223,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.48168429732322693,
"rewards/margins": 0.11891864240169525,
"rewards/rejected": -0.600602924823761,
"step": 474
},
{
"epoch": 0.5,
"learning_rate": 2.9910920704347696e-05,
"logits/chosen": -2.387964963912964,
"logits/rejected": -2.433955669403076,
"logps/chosen": -245.86285400390625,
"logps/rejected": -259.4566955566406,
"loss": 0.7292,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6999150514602661,
"rewards/margins": 0.08391554653644562,
"rewards/rejected": -0.7838307023048401,
"step": 475
},
{
"epoch": 0.5,
"learning_rate": 2.9821133224630226e-05,
"logits/chosen": -2.1827383041381836,
"logits/rejected": -2.2108314037323,
"logps/chosen": -172.43350219726562,
"logps/rejected": -167.54298400878906,
"loss": 0.6998,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.40625911951065063,
"rewards/margins": 0.13796135783195496,
"rewards/rejected": -0.5442204475402832,
"step": 476
},
{
"epoch": 0.5,
"learning_rate": 2.9731281109010256e-05,
"logits/chosen": -2.393608331680298,
"logits/rejected": -2.4628074169158936,
"logps/chosen": -155.9365234375,
"logps/rejected": -150.9811248779297,
"loss": 0.6863,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.4619404375553131,
"rewards/margins": 0.08013444393873215,
"rewards/rejected": -0.5420749187469482,
"step": 477
},
{
"epoch": 0.5,
"learning_rate": 2.9641365562115887e-05,
"logits/chosen": -2.1305439472198486,
"logits/rejected": -2.158849000930786,
"logps/chosen": -157.4604034423828,
"logps/rejected": -158.77505493164062,
"loss": 0.6796,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5900415182113647,
"rewards/margins": 0.08278737962245941,
"rewards/rejected": -0.6728289127349854,
"step": 478
},
{
"epoch": 0.5,
"learning_rate": 2.9551387789425638e-05,
"logits/chosen": -2.111013412475586,
"logits/rejected": -2.1469898223876953,
"logps/chosen": -177.7059326171875,
"logps/rejected": -199.083251953125,
"loss": 0.6744,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6065003871917725,
"rewards/margins": 0.17838376760482788,
"rewards/rejected": -0.7848842144012451,
"step": 479
},
{
"epoch": 0.5,
"learning_rate": 2.9461348997252265e-05,
"logits/chosen": -2.2793450355529785,
"logits/rejected": -2.2518503665924072,
"logps/chosen": -167.08595275878906,
"logps/rejected": -162.74386596679688,
"loss": 0.6446,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5793865323066711,
"rewards/margins": 0.19307895004749298,
"rewards/rejected": -0.7724654674530029,
"step": 480
},
{
"epoch": 0.5,
"learning_rate": 2.9371250392726614e-05,
"logits/chosen": -2.156540632247925,
"logits/rejected": -2.1982791423797607,
"logps/chosen": -232.06939697265625,
"logps/rejected": -225.7415771484375,
"loss": 0.6585,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6940824389457703,
"rewards/margins": 0.11553283035755157,
"rewards/rejected": -0.8096152544021606,
"step": 481
},
{
"epoch": 0.5,
"learning_rate": 2.9281093183781403e-05,
"logits/chosen": -2.0882251262664795,
"logits/rejected": -2.2663707733154297,
"logps/chosen": -130.05990600585938,
"logps/rejected": -188.05630493164062,
"loss": 0.6944,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4038165509700775,
"rewards/margins": 0.06018731743097305,
"rewards/rejected": -0.46400386095046997,
"step": 482
},
{
"epoch": 0.5,
"learning_rate": 2.919087857913508e-05,
"logits/chosen": -2.3520162105560303,
"logits/rejected": -2.321183443069458,
"logps/chosen": -182.3740997314453,
"logps/rejected": -178.23336791992188,
"loss": 0.6351,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5323800444602966,
"rewards/margins": 0.18767206370830536,
"rewards/rejected": -0.7200521230697632,
"step": 483
},
{
"epoch": 0.5,
"learning_rate": 2.9100607788275545e-05,
"logits/chosen": -2.1776552200317383,
"logits/rejected": -2.2282662391662598,
"logps/chosen": -163.6830596923828,
"logps/rejected": -172.34671020507812,
"loss": 0.7805,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.6728564500808716,
"rewards/margins": -0.11494327336549759,
"rewards/rejected": -0.5579131245613098,
"step": 484
},
{
"epoch": 0.51,
"learning_rate": 2.9010282021444008e-05,
"logits/chosen": -2.239274501800537,
"logits/rejected": -2.17651104927063,
"logps/chosen": -174.9864044189453,
"logps/rejected": -169.44493103027344,
"loss": 0.8076,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5585433840751648,
"rewards/margins": -0.09774555265903473,
"rewards/rejected": -0.4607977867126465,
"step": 485
},
{
"epoch": 0.51,
"learning_rate": 2.891990248961871e-05,
"logits/chosen": -2.1217386722564697,
"logits/rejected": -2.1039137840270996,
"logps/chosen": -159.67498779296875,
"logps/rejected": -174.40069580078125,
"loss": 0.6205,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.24506211280822754,
"rewards/margins": 0.40262073278427124,
"rewards/rejected": -0.647682785987854,
"step": 486
},
{
"epoch": 0.51,
"learning_rate": 2.8829470404498697e-05,
"logits/chosen": -2.1323282718658447,
"logits/rejected": -2.1301045417785645,
"logps/chosen": -129.35870361328125,
"logps/rejected": -170.45484924316406,
"loss": 0.6848,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4312661588191986,
"rewards/margins": 0.12291737645864487,
"rewards/rejected": -0.5541835427284241,
"step": 487
},
{
"epoch": 0.51,
"learning_rate": 2.8738986978487625e-05,
"logits/chosen": -2.2189228534698486,
"logits/rejected": -2.1614956855773926,
"logps/chosen": -193.06204223632812,
"logps/rejected": -182.66293334960938,
"loss": 0.7176,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6811609864234924,
"rewards/margins": 0.08441457152366638,
"rewards/rejected": -0.7655755877494812,
"step": 488
},
{
"epoch": 0.51,
"learning_rate": 2.8648453424677434e-05,
"logits/chosen": -2.2789225578308105,
"logits/rejected": -2.3813822269439697,
"logps/chosen": -168.468017578125,
"logps/rejected": -183.34982299804688,
"loss": 0.655,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5851632952690125,
"rewards/margins": 0.17568376660346985,
"rewards/rejected": -0.7608469724655151,
"step": 489
},
{
"epoch": 0.51,
"learning_rate": 2.8557870956832132e-05,
"logits/chosen": -2.264902114868164,
"logits/rejected": -2.2514560222625732,
"logps/chosen": -174.7198486328125,
"logps/rejected": -179.86676025390625,
"loss": 0.6624,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6332632303237915,
"rewards/margins": 0.08560072630643845,
"rewards/rejected": -0.7188639640808105,
"step": 490
},
{
"epoch": 0.51,
"learning_rate": 2.846724078937149e-05,
"logits/chosen": -2.1317250728607178,
"logits/rejected": -2.1464059352874756,
"logps/chosen": -174.97686767578125,
"logps/rejected": -182.57919311523438,
"loss": 0.6618,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6465609073638916,
"rewards/margins": 0.17847394943237305,
"rewards/rejected": -0.8250348567962646,
"step": 491
},
{
"epoch": 0.51,
"learning_rate": 2.8376564137354795e-05,
"logits/chosen": -2.1236746311187744,
"logits/rejected": -2.148552894592285,
"logps/chosen": -156.05751037597656,
"logps/rejected": -151.4677734375,
"loss": 0.773,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5343315601348877,
"rewards/margins": -0.059497520327568054,
"rewards/rejected": -0.47483405470848083,
"step": 492
},
{
"epoch": 0.51,
"learning_rate": 2.8285842216464543e-05,
"logits/chosen": -2.2011935710906982,
"logits/rejected": -2.3106913566589355,
"logps/chosen": -183.11766052246094,
"logps/rejected": -201.7404022216797,
"loss": 0.5969,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5661877989768982,
"rewards/margins": 0.39767351746559143,
"rewards/rejected": -0.9638612866401672,
"step": 493
},
{
"epoch": 0.52,
"learning_rate": 2.8195076242990122e-05,
"logits/chosen": -2.245713472366333,
"logits/rejected": -2.243020534515381,
"logps/chosen": -159.26397705078125,
"logps/rejected": -174.874267578125,
"loss": 0.8165,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6563709378242493,
"rewards/margins": -0.1701582670211792,
"rewards/rejected": -0.48621270060539246,
"step": 494
},
{
"epoch": 0.52,
"learning_rate": 2.8104267433811533e-05,
"logits/chosen": -2.1842641830444336,
"logits/rejected": -2.1591455936431885,
"logps/chosen": -121.25286102294922,
"logps/rejected": -115.32366180419922,
"loss": 0.6663,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4449831545352936,
"rewards/margins": 0.13848333060741425,
"rewards/rejected": -0.5834664702415466,
"step": 495
},
{
"epoch": 0.52,
"learning_rate": 2.8013417006383076e-05,
"logits/chosen": -2.1810221672058105,
"logits/rejected": -2.2239696979522705,
"logps/chosen": -151.152099609375,
"logps/rejected": -176.65977478027344,
"loss": 0.601,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4074276387691498,
"rewards/margins": 0.25585755705833435,
"rewards/rejected": -0.6632851958274841,
"step": 496
},
{
"epoch": 0.52,
"learning_rate": 2.7922526178717017e-05,
"logits/chosen": -2.1347427368164062,
"logits/rejected": -2.1655385494232178,
"logps/chosen": -159.8424072265625,
"logps/rejected": -178.92288208007812,
"loss": 0.6182,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.48965874314308167,
"rewards/margins": 0.2526865601539612,
"rewards/rejected": -0.7423452734947205,
"step": 497
},
{
"epoch": 0.52,
"learning_rate": 2.783159616936723e-05,
"logits/chosen": -2.141169309616089,
"logits/rejected": -2.141371726989746,
"logps/chosen": -158.35968017578125,
"logps/rejected": -176.37464904785156,
"loss": 0.642,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5580905079841614,
"rewards/margins": 0.24599358439445496,
"rewards/rejected": -0.8040841221809387,
"step": 498
},
{
"epoch": 0.52,
"learning_rate": 2.774062819741293e-05,
"logits/chosen": -2.250638246536255,
"logits/rejected": -2.1852548122406006,
"logps/chosen": -165.15774536132812,
"logps/rejected": -180.04124450683594,
"loss": 0.6831,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.4352782368659973,
"rewards/margins": 0.0841899961233139,
"rewards/rejected": -0.5194682478904724,
"step": 499
},
{
"epoch": 0.52,
"learning_rate": 2.764962348244228e-05,
"logits/chosen": -2.187967538833618,
"logits/rejected": -2.1378700733184814,
"logps/chosen": -179.86184692382812,
"logps/rejected": -174.86183166503906,
"loss": 0.7768,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8409562110900879,
"rewards/margins": -0.06990113109350204,
"rewards/rejected": -0.7710551619529724,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 958,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}