phi-2-gpo-v5-i1 / trainer_state.json
lole25's picture
Model save
9b945ab verified
raw
history blame contribute delete
No virus
32.2 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996190476190476,
"eval_steps": 500,
"global_step": 656,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 7.575757575757576e-08,
"logits/chosen": 0.040165986865758896,
"logits/rejected": 0.1715753823518753,
"logps/chosen": -294.844482421875,
"logps/rejected": -361.2099914550781,
"loss": 0.3581,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"learning_rate": 7.575757575757576e-07,
"logits/chosen": 0.08021458983421326,
"logits/rejected": 0.320384681224823,
"logps/chosen": -393.2468566894531,
"logps/rejected": -318.70526123046875,
"loss": 0.3399,
"rewards/accuracies": 0.375,
"rewards/chosen": -5.44932481716387e-05,
"rewards/margins": -7.010095578152686e-05,
"rewards/rejected": 1.560769487696234e-05,
"step": 10
},
{
"epoch": 0.03,
"learning_rate": 1.5151515151515152e-06,
"logits/chosen": 0.07484304904937744,
"logits/rejected": 0.23277099430561066,
"logps/chosen": -332.4987487792969,
"logps/rejected": -281.78729248046875,
"loss": 0.3453,
"rewards/accuracies": 0.3687500059604645,
"rewards/chosen": 1.3394804909694358e-06,
"rewards/margins": -0.0002027603331953287,
"rewards/rejected": 0.00020409980788826942,
"step": 20
},
{
"epoch": 0.05,
"learning_rate": 2.2727272727272728e-06,
"logits/chosen": 0.07151266187429428,
"logits/rejected": 0.25990238785743713,
"logps/chosen": -358.3196716308594,
"logps/rejected": -292.2561950683594,
"loss": 0.3517,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.0001912448788061738,
"rewards/margins": 0.00028789255884476006,
"rewards/rejected": -9.664769459050149e-05,
"step": 30
},
{
"epoch": 0.06,
"learning_rate": 3.0303030303030305e-06,
"logits/chosen": 0.12050364166498184,
"logits/rejected": 0.23899266123771667,
"logps/chosen": -346.99310302734375,
"logps/rejected": -286.31658935546875,
"loss": 0.3416,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.0006401558639481664,
"rewards/margins": 0.0007750070071779191,
"rewards/rejected": -0.00013485117233358324,
"step": 40
},
{
"epoch": 0.08,
"learning_rate": 3.7878787878787882e-06,
"logits/chosen": 0.07284825295209885,
"logits/rejected": 0.3108685612678528,
"logps/chosen": -343.5631408691406,
"logps/rejected": -287.16876220703125,
"loss": 0.3439,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.0013991177547723055,
"rewards/margins": 0.0013889471301808953,
"rewards/rejected": 1.0170697350986302e-05,
"step": 50
},
{
"epoch": 0.09,
"learning_rate": 4.5454545454545455e-06,
"logits/chosen": 0.07572797685861588,
"logits/rejected": 0.2742985785007477,
"logps/chosen": -371.0773620605469,
"logps/rejected": -303.0706481933594,
"loss": 0.3306,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.0029931231401860714,
"rewards/margins": 0.004650537855923176,
"rewards/rejected": -0.0016574144829064608,
"step": 60
},
{
"epoch": 0.11,
"learning_rate": 4.999432965739786e-06,
"logits/chosen": 0.06861326098442078,
"logits/rejected": 0.25036171078681946,
"logps/chosen": -321.8224792480469,
"logps/rejected": -301.9892272949219,
"loss": 0.3315,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.005247811786830425,
"rewards/margins": 0.0075549171306192875,
"rewards/rejected": -0.0023071051109582186,
"step": 70
},
{
"epoch": 0.12,
"learning_rate": 4.9930567839810125e-06,
"logits/chosen": 0.0997760072350502,
"logits/rejected": 0.26540613174438477,
"logps/chosen": -360.35430908203125,
"logps/rejected": -307.5221862792969,
"loss": 0.3184,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.006557592656463385,
"rewards/margins": 0.01593288779258728,
"rewards/rejected": -0.009375295601785183,
"step": 80
},
{
"epoch": 0.14,
"learning_rate": 4.979613761906212e-06,
"logits/chosen": 0.11980845779180527,
"logits/rejected": 0.2542170584201813,
"logps/chosen": -325.69989013671875,
"logps/rejected": -303.8279724121094,
"loss": 0.3053,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.0012294445186853409,
"rewards/margins": 0.02537659928202629,
"rewards/rejected": -0.02414715476334095,
"step": 90
},
{
"epoch": 0.15,
"learning_rate": 4.959142005221991e-06,
"logits/chosen": 0.15030920505523682,
"logits/rejected": 0.24269947409629822,
"logps/chosen": -357.73846435546875,
"logps/rejected": -372.84625244140625,
"loss": 0.2998,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.03152700141072273,
"rewards/margins": 0.0535690113902092,
"rewards/rejected": -0.08509601652622223,
"step": 100
},
{
"epoch": 0.17,
"learning_rate": 4.931699543346854e-06,
"logits/chosen": 0.10781173408031464,
"logits/rejected": 0.28030428290367126,
"logps/chosen": -428.8057556152344,
"logps/rejected": -428.8355407714844,
"loss": 0.2724,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.09748424589633942,
"rewards/margins": 0.0674622505903244,
"rewards/rejected": -0.16494649648666382,
"step": 110
},
{
"epoch": 0.18,
"learning_rate": 4.897364164920515e-06,
"logits/chosen": 0.1421918272972107,
"logits/rejected": 0.2562108039855957,
"logps/chosen": -584.6065673828125,
"logps/rejected": -657.9326782226562,
"loss": 0.2475,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.21280452609062195,
"rewards/margins": 0.1321365237236023,
"rewards/rejected": -0.34494107961654663,
"step": 120
},
{
"epoch": 0.2,
"learning_rate": 4.8562331973035396e-06,
"logits/chosen": 0.1893201768398285,
"logits/rejected": 0.2765055298805237,
"logps/chosen": -574.9005737304688,
"logps/rejected": -700.53125,
"loss": 0.2567,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.23022952675819397,
"rewards/margins": 0.17663846909999847,
"rewards/rejected": -0.40686798095703125,
"step": 130
},
{
"epoch": 0.21,
"learning_rate": 4.808423230692374e-06,
"logits/chosen": 0.15862275660037994,
"logits/rejected": 0.3002353310585022,
"logps/chosen": -557.1515502929688,
"logps/rejected": -636.1343994140625,
"loss": 0.2578,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.2007971704006195,
"rewards/margins": 0.1420392543077469,
"rewards/rejected": -0.3428364396095276,
"step": 140
},
{
"epoch": 0.23,
"learning_rate": 4.754069787631761e-06,
"logits/chosen": 0.20283110439777374,
"logits/rejected": 0.2838110327720642,
"logps/chosen": -539.7517700195312,
"logps/rejected": -632.3270263671875,
"loss": 0.2807,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.22490541636943817,
"rewards/margins": 0.13436779379844666,
"rewards/rejected": -0.359273225069046,
"step": 150
},
{
"epoch": 0.24,
"learning_rate": 4.693326938861367e-06,
"logits/chosen": 0.16693079471588135,
"logits/rejected": 0.3414613604545593,
"logps/chosen": -519.4967651367188,
"logps/rejected": -644.0454711914062,
"loss": 0.2302,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1742098033428192,
"rewards/margins": 0.19720463454723358,
"rewards/rejected": -0.3714144825935364,
"step": 160
},
{
"epoch": 0.26,
"learning_rate": 4.626366866585528e-06,
"logits/chosen": 0.15613974630832672,
"logits/rejected": 0.27570822834968567,
"logps/chosen": -551.5475463867188,
"logps/rejected": -641.4393310546875,
"loss": 0.2342,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.18619480729103088,
"rewards/margins": 0.17629162967205048,
"rewards/rejected": -0.36248645186424255,
"step": 170
},
{
"epoch": 0.27,
"learning_rate": 4.553379376404085e-06,
"logits/chosen": 0.15913400053977966,
"logits/rejected": 0.28868401050567627,
"logps/chosen": -575.120361328125,
"logps/rejected": -672.7054443359375,
"loss": 0.2586,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.23076090216636658,
"rewards/margins": 0.1531594842672348,
"rewards/rejected": -0.3839203715324402,
"step": 180
},
{
"epoch": 0.29,
"learning_rate": 4.474571359287791e-06,
"logits/chosen": 0.2370852530002594,
"logits/rejected": 0.313865065574646,
"logps/chosen": -578.9676513671875,
"logps/rejected": -646.5762329101562,
"loss": 0.2759,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.22910158336162567,
"rewards/margins": 0.14492908120155334,
"rewards/rejected": -0.3740306496620178,
"step": 190
},
{
"epoch": 0.3,
"learning_rate": 4.3901662051233755e-06,
"logits/chosen": 0.156154602766037,
"logits/rejected": 0.28848981857299805,
"logps/chosen": -529.0174560546875,
"logps/rejected": -681.0127563476562,
"loss": 0.2411,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.21960540115833282,
"rewards/margins": 0.1676800698041916,
"rewards/rejected": -0.3872854709625244,
"step": 200
},
{
"epoch": 0.32,
"learning_rate": 4.30040316949064e-06,
"logits/chosen": 0.13694807887077332,
"logits/rejected": 0.3102852702140808,
"logps/chosen": -582.1616821289062,
"logps/rejected": -681.3240966796875,
"loss": 0.2166,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.24012689292430878,
"rewards/margins": 0.18257644772529602,
"rewards/rejected": -0.422703355550766,
"step": 210
},
{
"epoch": 0.34,
"learning_rate": 4.205536695466524e-06,
"logits/chosen": 0.15840545296669006,
"logits/rejected": 0.30105945467948914,
"logps/chosen": -578.1398315429688,
"logps/rejected": -766.7618408203125,
"loss": 0.2259,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.2511864900588989,
"rewards/margins": 0.22828085720539093,
"rewards/rejected": -0.47946733236312866,
"step": 220
},
{
"epoch": 0.35,
"learning_rate": 4.105835692378557e-06,
"logits/chosen": 0.14648036658763885,
"logits/rejected": 0.21232767403125763,
"logps/chosen": -583.7035522460938,
"logps/rejected": -690.9400634765625,
"loss": 0.2757,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.2628551125526428,
"rewards/margins": 0.15015873312950134,
"rewards/rejected": -0.41301384568214417,
"step": 230
},
{
"epoch": 0.37,
"learning_rate": 4.001582773552153e-06,
"logits/chosen": 0.13691949844360352,
"logits/rejected": 0.22931316494941711,
"logps/chosen": -621.2189331054688,
"logps/rejected": -718.5030517578125,
"loss": 0.2518,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.24823768436908722,
"rewards/margins": 0.16897296905517578,
"rewards/rejected": -0.4172106683254242,
"step": 240
},
{
"epoch": 0.38,
"learning_rate": 3.893073455212438e-06,
"logits/chosen": 0.15518508851528168,
"logits/rejected": 0.27118122577667236,
"logps/chosen": -649.8485107421875,
"logps/rejected": -743.3021240234375,
"loss": 0.2446,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.27570152282714844,
"rewards/margins": 0.17269738018512726,
"rewards/rejected": -0.4483988881111145,
"step": 250
},
{
"epoch": 0.4,
"learning_rate": 3.7806153188114027e-06,
"logits/chosen": 0.14696967601776123,
"logits/rejected": 0.26466238498687744,
"logps/chosen": -585.2598876953125,
"logps/rejected": -646.9161376953125,
"loss": 0.2236,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.22980618476867676,
"rewards/margins": 0.16329681873321533,
"rewards/rejected": -0.3931030035018921,
"step": 260
},
{
"epoch": 0.41,
"learning_rate": 3.6645271391548542e-06,
"logits/chosen": 0.1697523295879364,
"logits/rejected": 0.29494693875312805,
"logps/chosen": -613.8264770507812,
"logps/rejected": -749.4378051757812,
"loss": 0.228,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.2746888995170593,
"rewards/margins": 0.2020600140094757,
"rewards/rejected": -0.47674888372421265,
"step": 270
},
{
"epoch": 0.43,
"learning_rate": 3.5451379808006014e-06,
"logits/chosen": 0.13133810460567474,
"logits/rejected": 0.2865068316459656,
"logps/chosen": -590.5033569335938,
"logps/rejected": -722.9710083007812,
"loss": 0.2252,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2536476254463196,
"rewards/margins": 0.177708238363266,
"rewards/rejected": -0.43135586380958557,
"step": 280
},
{
"epoch": 0.44,
"learning_rate": 3.4227862652892106e-06,
"logits/chosen": 0.16941356658935547,
"logits/rejected": 0.30433687567710876,
"logps/chosen": -602.0320434570312,
"logps/rejected": -731.9866333007812,
"loss": 0.2518,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.262613981962204,
"rewards/margins": 0.17090369760990143,
"rewards/rejected": -0.4335176944732666,
"step": 290
},
{
"epoch": 0.46,
"learning_rate": 3.2978188118513814e-06,
"logits/chosen": 0.1636120229959488,
"logits/rejected": 0.2873205840587616,
"logps/chosen": -595.1837768554688,
"logps/rejected": -764.756103515625,
"loss": 0.218,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.25328361988067627,
"rewards/margins": 0.21258826553821564,
"rewards/rejected": -0.4658718705177307,
"step": 300
},
{
"epoch": 0.47,
"learning_rate": 3.1705898543111576e-06,
"logits/chosen": 0.128164142370224,
"logits/rejected": 0.29025566577911377,
"logps/chosen": -615.720947265625,
"logps/rejected": -729.6194458007812,
"loss": 0.2587,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2794512212276459,
"rewards/margins": 0.1740822196006775,
"rewards/rejected": -0.45353350043296814,
"step": 310
},
{
"epoch": 0.49,
"learning_rate": 3.041460036971664e-06,
"logits/chosen": 0.15139839053153992,
"logits/rejected": 0.3216419517993927,
"logps/chosen": -633.3115234375,
"logps/rejected": -791.7572631835938,
"loss": 0.2391,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3004799485206604,
"rewards/margins": 0.18833482265472412,
"rewards/rejected": -0.4888147711753845,
"step": 320
},
{
"epoch": 0.5,
"learning_rate": 2.910795392329649e-06,
"logits/chosen": 0.21717897057533264,
"logits/rejected": 0.3525051474571228,
"logps/chosen": -616.4713134765625,
"logps/rejected": -754.4976806640625,
"loss": 0.2592,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.2672343850135803,
"rewards/margins": 0.2101636379957199,
"rewards/rejected": -0.47739800810813904,
"step": 330
},
{
"epoch": 0.52,
"learning_rate": 2.7789663035166035e-06,
"logits/chosen": 0.2043914794921875,
"logits/rejected": 0.34597498178482056,
"logps/chosen": -658.1500244140625,
"logps/rejected": -802.0507202148438,
"loss": 0.2329,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.2817768454551697,
"rewards/margins": 0.2174309492111206,
"rewards/rejected": -0.4992077946662903,
"step": 340
},
{
"epoch": 0.53,
"learning_rate": 2.6463464544075344e-06,
"logits/chosen": 0.16284213960170746,
"logits/rejected": 0.28806525468826294,
"logps/chosen": -627.6129760742188,
"logps/rejected": -686.5743408203125,
"loss": 0.2372,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2634487748146057,
"rewards/margins": 0.15015827119350433,
"rewards/rejected": -0.41360706090927124,
"step": 350
},
{
"epoch": 0.55,
"learning_rate": 2.513311770373421e-06,
"logits/chosen": 0.14228633046150208,
"logits/rejected": 0.26139333844184875,
"logps/chosen": -625.71875,
"logps/rejected": -751.2933349609375,
"loss": 0.2357,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.26470065116882324,
"rewards/margins": 0.1807432472705841,
"rewards/rejected": -0.44544392824172974,
"step": 360
},
{
"epoch": 0.56,
"learning_rate": 2.380239352679908e-06,
"logits/chosen": 0.14897385239601135,
"logits/rejected": 0.2588959336280823,
"logps/chosen": -641.2374267578125,
"logps/rejected": -802.7953491210938,
"loss": 0.2166,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.30122682452201843,
"rewards/margins": 0.1971724033355713,
"rewards/rejected": -0.49839919805526733,
"step": 370
},
{
"epoch": 0.58,
"learning_rate": 2.247506409552795e-06,
"logits/chosen": 0.14793309569358826,
"logits/rejected": 0.3080625534057617,
"logps/chosen": -668.0740966796875,
"logps/rejected": -816.8853759765625,
"loss": 0.2162,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.3042776584625244,
"rewards/margins": 0.20680758357048035,
"rewards/rejected": -0.5110852122306824,
"step": 380
},
{
"epoch": 0.59,
"learning_rate": 2.1154891869403436e-06,
"logits/chosen": 0.1291504055261612,
"logits/rejected": 0.3194652497768402,
"logps/chosen": -662.1779174804688,
"logps/rejected": -797.3854370117188,
"loss": 0.2264,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.33526724576950073,
"rewards/margins": 0.19740521907806396,
"rewards/rejected": -0.5326724648475647,
"step": 390
},
{
"epoch": 0.61,
"learning_rate": 1.9845619020032552e-06,
"logits/chosen": 0.16021743416786194,
"logits/rejected": 0.3056327700614929,
"logps/chosen": -644.52734375,
"logps/rejected": -788.876953125,
"loss": 0.2151,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.3085401952266693,
"rewards/margins": 0.211081862449646,
"rewards/rejected": -0.5196221470832825,
"step": 400
},
{
"epoch": 0.62,
"learning_rate": 1.8550956823554708e-06,
"logits/chosen": 0.15647678077220917,
"logits/rejected": 0.3057996332645416,
"logps/chosen": -702.3175048828125,
"logps/rejected": -853.0869140625,
"loss": 0.2227,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3388286232948303,
"rewards/margins": 0.2211228311061859,
"rewards/rejected": -0.5599514842033386,
"step": 410
},
{
"epoch": 0.64,
"learning_rate": 1.7274575140626318e-06,
"logits/chosen": 0.14826330542564392,
"logits/rejected": 0.32510313391685486,
"logps/chosen": -669.1925659179688,
"logps/rejected": -814.4791870117188,
"loss": 0.2161,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.308666467666626,
"rewards/margins": 0.18964755535125732,
"rewards/rejected": -0.4983140528202057,
"step": 420
},
{
"epoch": 0.66,
"learning_rate": 1.6020092013802002e-06,
"logits/chosen": 0.2039145529270172,
"logits/rejected": 0.27107498049736023,
"logps/chosen": -734.4779052734375,
"logps/rejected": -801.9241333007812,
"loss": 0.2567,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.3436730206012726,
"rewards/margins": 0.15554025769233704,
"rewards/rejected": -0.499213308095932,
"step": 430
},
{
"epoch": 0.67,
"learning_rate": 1.4791063411799938e-06,
"logits/chosen": 0.1372929960489273,
"logits/rejected": 0.2705134451389313,
"logps/chosen": -655.2030029296875,
"logps/rejected": -761.6964111328125,
"loss": 0.2425,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2905011475086212,
"rewards/margins": 0.1854308694601059,
"rewards/rejected": -0.4759320318698883,
"step": 440
},
{
"epoch": 0.69,
"learning_rate": 1.3590973149722103e-06,
"logits/chosen": 0.16004619002342224,
"logits/rejected": 0.3311876654624939,
"logps/chosen": -623.554443359375,
"logps/rejected": -741.9662475585938,
"loss": 0.2205,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.2771863043308258,
"rewards/margins": 0.18584686517715454,
"rewards/rejected": -0.46303310990333557,
"step": 450
},
{
"epoch": 0.7,
"learning_rate": 1.2423223013801946e-06,
"logits/chosen": 0.15525056421756744,
"logits/rejected": 0.226671501994133,
"logps/chosen": -620.3182373046875,
"logps/rejected": -753.6258544921875,
"loss": 0.2323,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.28617820143699646,
"rewards/margins": 0.17914626002311707,
"rewards/rejected": -0.4653244912624359,
"step": 460
},
{
"epoch": 0.72,
"learning_rate": 1.1291123118671665e-06,
"logits/chosen": 0.1403166949748993,
"logits/rejected": 0.21838609874248505,
"logps/chosen": -640.9824829101562,
"logps/rejected": -792.9696044921875,
"loss": 0.23,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.281258225440979,
"rewards/margins": 0.21520480513572693,
"rewards/rejected": -0.4964630603790283,
"step": 470
},
{
"epoch": 0.73,
"learning_rate": 1.019788252448267e-06,
"logits/chosen": 0.12808464467525482,
"logits/rejected": 0.2763553857803345,
"logps/chosen": -644.160888671875,
"logps/rejected": -773.2025146484375,
"loss": 0.2376,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.31234392523765564,
"rewards/margins": 0.19615283608436584,
"rewards/rejected": -0.5084967017173767,
"step": 480
},
{
"epoch": 0.75,
"learning_rate": 9.146600140475945e-07,
"logits/chosen": 0.1601841151714325,
"logits/rejected": 0.22859685122966766,
"logps/chosen": -651.9871826171875,
"logps/rejected": -752.2882080078125,
"loss": 0.2404,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3149036467075348,
"rewards/margins": 0.15418918430805206,
"rewards/rejected": -0.46909284591674805,
"step": 490
},
{
"epoch": 0.76,
"learning_rate": 8.140255940787059e-07,
"logits/chosen": 0.1696256846189499,
"logits/rejected": 0.2493252456188202,
"logps/chosen": -615.3719482421875,
"logps/rejected": -736.1841430664062,
"loss": 0.2331,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.2796221375465393,
"rewards/margins": 0.1891065537929535,
"rewards/rejected": -0.4687287211418152,
"step": 500
},
{
"epoch": 0.78,
"learning_rate": 7.181702517385789e-07,
"logits/chosen": 0.13488708436489105,
"logits/rejected": 0.24344106018543243,
"logps/chosen": -632.91064453125,
"logps/rejected": -779.7465209960938,
"loss": 0.2493,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.30229493975639343,
"rewards/margins": 0.19277289509773254,
"rewards/rejected": -0.49506789445877075,
"step": 510
},
{
"epoch": 0.79,
"learning_rate": 6.273656994094232e-07,
"logits/chosen": 0.078878253698349,
"logits/rejected": 0.32078155875205994,
"logps/chosen": -629.8251342773438,
"logps/rejected": -792.5658569335938,
"loss": 0.2593,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.29794004559516907,
"rewards/margins": 0.21232935786247253,
"rewards/rejected": -0.5102694034576416,
"step": 520
},
{
"epoch": 0.81,
"learning_rate": 5.418693324604082e-07,
"logits/chosen": 0.17152568697929382,
"logits/rejected": 0.2723831534385681,
"logps/chosen": -686.8225708007812,
"logps/rejected": -765.6764526367188,
"loss": 0.2106,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.2921481132507324,
"rewards/margins": 0.19477225840091705,
"rewards/rejected": -0.48692041635513306,
"step": 530
},
{
"epoch": 0.82,
"learning_rate": 4.619234996325314e-07,
"logits/chosen": 0.16223089396953583,
"logits/rejected": 0.22344419360160828,
"logps/chosen": -644.6939086914062,
"logps/rejected": -796.0565185546875,
"loss": 0.2251,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.29831987619400024,
"rewards/margins": 0.2159034013748169,
"rewards/rejected": -0.5142232179641724,
"step": 540
},
{
"epoch": 0.84,
"learning_rate": 3.877548160747768e-07,
"logits/chosen": 0.14092543721199036,
"logits/rejected": 0.26907533407211304,
"logps/chosen": -665.1543579101562,
"logps/rejected": -785.69091796875,
"loss": 0.2293,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.3221438229084015,
"rewards/margins": 0.16273698210716248,
"rewards/rejected": -0.4848807752132416,
"step": 550
},
{
"epoch": 0.85,
"learning_rate": 3.195735209788528e-07,
"logits/chosen": 0.12745890021324158,
"logits/rejected": 0.16941148042678833,
"logps/chosen": -604.4722290039062,
"logps/rejected": -726.4368286132812,
"loss": 0.228,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.2934458255767822,
"rewards/margins": 0.16752712428569794,
"rewards/rejected": -0.46097296476364136,
"step": 560
},
{
"epoch": 0.87,
"learning_rate": 2.5757288163336806e-07,
"logits/chosen": 0.14064130187034607,
"logits/rejected": 0.2834423780441284,
"logps/chosen": -670.5109252929688,
"logps/rejected": -754.9119873046875,
"loss": 0.2279,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.29777050018310547,
"rewards/margins": 0.1834956705570221,
"rewards/rejected": -0.4812661111354828,
"step": 570
},
{
"epoch": 0.88,
"learning_rate": 2.019286455866981e-07,
"logits/chosen": 0.16343867778778076,
"logits/rejected": 0.23200741410255432,
"logps/chosen": -653.3126220703125,
"logps/rejected": -788.0684204101562,
"loss": 0.2129,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.30145275592803955,
"rewards/margins": 0.18838538229465485,
"rewards/rejected": -0.4898381233215332,
"step": 580
},
{
"epoch": 0.9,
"learning_rate": 1.5279854247146703e-07,
"logits/chosen": 0.0768275260925293,
"logits/rejected": 0.261466920375824,
"logps/chosen": -653.1698608398438,
"logps/rejected": -781.791259765625,
"loss": 0.2128,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.28429535031318665,
"rewards/margins": 0.22277021408081055,
"rewards/rejected": -0.5070655941963196,
"step": 590
},
{
"epoch": 0.91,
"learning_rate": 1.1032183690276754e-07,
"logits/chosen": 0.160492941737175,
"logits/rejected": 0.28358930349349976,
"logps/chosen": -716.923583984375,
"logps/rejected": -817.1744995117188,
"loss": 0.2252,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.3216014504432678,
"rewards/margins": 0.18884047865867615,
"rewards/rejected": -0.5104418992996216,
"step": 600
},
{
"epoch": 0.93,
"learning_rate": 7.46189337174788e-08,
"logits/chosen": 0.138215571641922,
"logits/rejected": 0.28554660081863403,
"logps/chosen": -634.0874633789062,
"logps/rejected": -762.0772094726562,
"loss": 0.204,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2693154215812683,
"rewards/margins": 0.21418070793151855,
"rewards/rejected": -0.48349618911743164,
"step": 610
},
{
"epoch": 0.94,
"learning_rate": 4.579103667367385e-08,
"logits/chosen": 0.13104286789894104,
"logits/rejected": 0.280353844165802,
"logps/chosen": -666.0810546875,
"logps/rejected": -752.5096435546875,
"loss": 0.229,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.3018819987773895,
"rewards/margins": 0.1678927093744278,
"rewards/rejected": -0.4697747230529785,
"step": 620
},
{
"epoch": 0.96,
"learning_rate": 2.3919861577572924e-08,
"logits/chosen": 0.15655803680419922,
"logits/rejected": 0.32044172286987305,
"logps/chosen": -707.17138671875,
"logps/rejected": -806.7648315429688,
"loss": 0.229,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.30425772070884705,
"rewards/margins": 0.21442052721977234,
"rewards/rejected": -0.5186783075332642,
"step": 630
},
{
"epoch": 0.98,
"learning_rate": 9.067404651211808e-09,
"logits/chosen": 0.146881565451622,
"logits/rejected": 0.22085854411125183,
"logps/chosen": -632.6345825195312,
"logps/rejected": -776.0171508789062,
"loss": 0.2122,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.2965379059314728,
"rewards/margins": 0.19161197543144226,
"rewards/rejected": -0.48814982175827026,
"step": 640
},
{
"epoch": 0.99,
"learning_rate": 1.2757667974155896e-09,
"logits/chosen": 0.18307064473628998,
"logits/rejected": 0.2208767831325531,
"logps/chosen": -612.895751953125,
"logps/rejected": -768.5109252929688,
"loss": 0.2581,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.2931486666202545,
"rewards/margins": 0.18815748393535614,
"rewards/rejected": -0.4813062250614166,
"step": 650
},
{
"epoch": 1.0,
"step": 656,
"total_flos": 0.0,
"train_loss": 0.20043009792159244,
"train_runtime": 6753.4712,
"train_samples_per_second": 3.11,
"train_steps_per_second": 0.097
}
],
"logging_steps": 10,
"max_steps": 656,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}