phi-2-gpo-v6-i1 / trainer_state.json
lole25's picture
Model save
1d92b30 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.996400719856029,
"eval_steps": 500,
"global_step": 832,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 5.952380952380953e-08,
"logits/chosen": 0.11703574657440186,
"logits/rejected": 0.3661181330680847,
"logps/chosen": -218.64993286132812,
"logps/rejected": -191.34808349609375,
"loss": 0.3408,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"learning_rate": 5.952380952380953e-07,
"logits/chosen": 0.10404814779758453,
"logits/rejected": 0.23778128623962402,
"logps/chosen": -401.4896240234375,
"logps/rejected": -345.9862976074219,
"loss": 0.3642,
"rewards/accuracies": 0.4791666567325592,
"rewards/chosen": 0.0004916194593533874,
"rewards/margins": 0.0005594216636382043,
"rewards/rejected": -6.780229159630835e-05,
"step": 10
},
{
"epoch": 0.05,
"learning_rate": 1.1904761904761906e-06,
"logits/chosen": 0.13218173384666443,
"logits/rejected": 0.20688870549201965,
"logps/chosen": -336.506591796875,
"logps/rejected": -319.3189392089844,
"loss": 0.3689,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": 0.00020826223772019148,
"rewards/margins": -0.000311180017888546,
"rewards/rejected": 0.0005194421974010766,
"step": 20
},
{
"epoch": 0.07,
"learning_rate": 1.7857142857142859e-06,
"logits/chosen": 0.11459924280643463,
"logits/rejected": 0.1922653764486313,
"logps/chosen": -342.02569580078125,
"logps/rejected": -324.1275939941406,
"loss": 0.3786,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.0006439354037865996,
"rewards/margins": 0.0004738263669423759,
"rewards/rejected": -0.0011177618289366364,
"step": 30
},
{
"epoch": 0.1,
"learning_rate": 2.380952380952381e-06,
"logits/chosen": 0.13577614724636078,
"logits/rejected": 0.17847472429275513,
"logps/chosen": -298.6214294433594,
"logps/rejected": -289.40850830078125,
"loss": 0.3689,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.0008146329782903194,
"rewards/margins": 0.0024004268925637007,
"rewards/rejected": -0.001585794030688703,
"step": 40
},
{
"epoch": 0.12,
"learning_rate": 2.9761904761904763e-06,
"logits/chosen": 0.10261678695678711,
"logits/rejected": 0.20306341350078583,
"logps/chosen": -351.93572998046875,
"logps/rejected": -362.153564453125,
"loss": 0.3692,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0009010158246383071,
"rewards/margins": 0.004100508522242308,
"rewards/rejected": -0.003199493046849966,
"step": 50
},
{
"epoch": 0.14,
"learning_rate": 3.5714285714285718e-06,
"logits/chosen": 0.13770776987075806,
"logits/rejected": 0.2188442498445511,
"logps/chosen": -349.51690673828125,
"logps/rejected": -351.1549377441406,
"loss": 0.3655,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.003258631331846118,
"rewards/margins": 0.007584023289382458,
"rewards/rejected": -0.004325392190366983,
"step": 60
},
{
"epoch": 0.17,
"learning_rate": 4.166666666666667e-06,
"logits/chosen": 0.1271902620792389,
"logits/rejected": 0.23070549964904785,
"logps/chosen": -378.33843994140625,
"logps/rejected": -350.60662841796875,
"loss": 0.3586,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.006277147680521011,
"rewards/margins": 0.015207210555672646,
"rewards/rejected": -0.00893006194382906,
"step": 70
},
{
"epoch": 0.19,
"learning_rate": 4.761904761904762e-06,
"logits/chosen": 0.08625562489032745,
"logits/rejected": 0.12316304445266724,
"logps/chosen": -307.9439697265625,
"logps/rejected": -335.3281555175781,
"loss": 0.3489,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.013669237494468689,
"rewards/margins": 0.02226843498647213,
"rewards/rejected": -0.008599198423326015,
"step": 80
},
{
"epoch": 0.22,
"learning_rate": 4.9992062457191005e-06,
"logits/chosen": 0.137899249792099,
"logits/rejected": 0.2165641039609909,
"logps/chosen": -355.6449890136719,
"logps/rejected": -338.1387634277344,
"loss": 0.3229,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.013719858601689339,
"rewards/margins": 0.042457275092601776,
"rewards/rejected": -0.028737416490912437,
"step": 90
},
{
"epoch": 0.24,
"learning_rate": 4.994357350311441e-06,
"logits/chosen": 0.14011432230472565,
"logits/rejected": 0.21795734763145447,
"logps/chosen": -360.2173156738281,
"logps/rejected": -358.1722717285156,
"loss": 0.3043,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.001885895850136876,
"rewards/margins": 0.06035756319761276,
"rewards/rejected": -0.06224345415830612,
"step": 100
},
{
"epoch": 0.26,
"learning_rate": 4.98510907587894e-06,
"logits/chosen": 0.13077042996883392,
"logits/rejected": 0.21840377151966095,
"logps/chosen": -356.6605224609375,
"logps/rejected": -348.19476318359375,
"loss": 0.3169,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.008259604685008526,
"rewards/margins": 0.08336080610752106,
"rewards/rejected": -0.09162042289972305,
"step": 110
},
{
"epoch": 0.29,
"learning_rate": 4.97147773390341e-06,
"logits/chosen": 0.14791826903820038,
"logits/rejected": 0.1786331683397293,
"logps/chosen": -320.29608154296875,
"logps/rejected": -337.16864013671875,
"loss": 0.2861,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.008191597647964954,
"rewards/margins": 0.09656454622745514,
"rewards/rejected": -0.08837294578552246,
"step": 120
},
{
"epoch": 0.31,
"learning_rate": 4.953487366425163e-06,
"logits/chosen": 0.12249626964330673,
"logits/rejected": 0.16907112300395966,
"logps/chosen": -342.0648498535156,
"logps/rejected": -363.51031494140625,
"loss": 0.3175,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.0017940097022801638,
"rewards/margins": 0.07947574555873871,
"rewards/rejected": -0.07768173515796661,
"step": 130
},
{
"epoch": 0.34,
"learning_rate": 4.931169703639282e-06,
"logits/chosen": 0.0919104740023613,
"logits/rejected": 0.18652714788913727,
"logps/chosen": -337.65374755859375,
"logps/rejected": -364.11199951171875,
"loss": 0.2828,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.03137553483247757,
"rewards/margins": 0.12489553540945053,
"rewards/rejected": -0.09352000057697296,
"step": 140
},
{
"epoch": 0.36,
"learning_rate": 4.904564107932048e-06,
"logits/chosen": 0.13001379370689392,
"logits/rejected": 0.20237913727760315,
"logps/chosen": -351.857421875,
"logps/rejected": -336.6232604980469,
"loss": 0.2899,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.0018621661001816392,
"rewards/margins": 0.10416339337825775,
"rewards/rejected": -0.10602555423974991,
"step": 150
},
{
"epoch": 0.38,
"learning_rate": 4.873717504456219e-06,
"logits/chosen": 0.06932858377695084,
"logits/rejected": 0.15127311646938324,
"logps/chosen": -345.0473937988281,
"logps/rejected": -363.4601745605469,
"loss": 0.2889,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.01222093403339386,
"rewards/margins": 0.11404307186603546,
"rewards/rejected": -0.12626400589942932,
"step": 160
},
{
"epoch": 0.41,
"learning_rate": 4.838684298367616e-06,
"logits/chosen": 0.16357803344726562,
"logits/rejected": 0.23174886405467987,
"logps/chosen": -357.15289306640625,
"logps/rejected": -358.61065673828125,
"loss": 0.2884,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.017432499676942825,
"rewards/margins": 0.11707814782857895,
"rewards/rejected": -0.09964564442634583,
"step": 170
},
{
"epoch": 0.43,
"learning_rate": 4.7995262788689865e-06,
"logits/chosen": 0.16258656978607178,
"logits/rejected": 0.2536885738372803,
"logps/chosen": -337.7535705566406,
"logps/rejected": -346.13470458984375,
"loss": 0.2789,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.02853301540017128,
"rewards/margins": 0.1016291156411171,
"rewards/rejected": -0.07309609651565552,
"step": 180
},
{
"epoch": 0.46,
"learning_rate": 4.756312510230377e-06,
"logits/chosen": 0.14243337512016296,
"logits/rejected": 0.24410876631736755,
"logps/chosen": -376.64599609375,
"logps/rejected": -363.4615478515625,
"loss": 0.2828,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.03516390174627304,
"rewards/margins": 0.12421919405460358,
"rewards/rejected": -0.08905528485774994,
"step": 190
},
{
"epoch": 0.48,
"learning_rate": 4.709119209978242e-06,
"logits/chosen": 0.17320121824741364,
"logits/rejected": 0.2264091521501541,
"logps/chosen": -362.0121765136719,
"logps/rejected": -352.7041931152344,
"loss": 0.283,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.039128489792346954,
"rewards/margins": 0.11708054691553116,
"rewards/rejected": -0.07795204222202301,
"step": 200
},
{
"epoch": 0.5,
"learning_rate": 4.6580296144681155e-06,
"logits/chosen": 0.1604190617799759,
"logits/rejected": 0.17792078852653503,
"logps/chosen": -315.1614074707031,
"logps/rejected": -340.53619384765625,
"loss": 0.2754,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.05531097203493118,
"rewards/margins": 0.15012916922569275,
"rewards/rejected": -0.09481821954250336,
"step": 210
},
{
"epoch": 0.53,
"learning_rate": 4.603133832077953e-06,
"logits/chosen": 0.11915634572505951,
"logits/rejected": 0.15653367340564728,
"logps/chosen": -351.16986083984375,
"logps/rejected": -354.53607177734375,
"loss": 0.2738,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.06388933956623077,
"rewards/margins": 0.1507207453250885,
"rewards/rejected": -0.08683140575885773,
"step": 220
},
{
"epoch": 0.55,
"learning_rate": 4.544528684281056e-06,
"logits/chosen": 0.09443524479866028,
"logits/rejected": 0.1415812075138092,
"logps/chosen": -355.2025451660156,
"logps/rejected": -349.1300354003906,
"loss": 0.276,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.021877283230423927,
"rewards/margins": 0.1259470283985138,
"rewards/rejected": -0.10406973212957382,
"step": 230
},
{
"epoch": 0.58,
"learning_rate": 4.482317534878901e-06,
"logits/chosen": 0.08314280211925507,
"logits/rejected": 0.11439633369445801,
"logps/chosen": -333.59295654296875,
"logps/rejected": -341.5171203613281,
"loss": 0.2668,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.016557829454541206,
"rewards/margins": 0.11629464477300644,
"rewards/rejected": -0.09973680973052979,
"step": 240
},
{
"epoch": 0.6,
"learning_rate": 4.416610107695043e-06,
"logits/chosen": 0.11690554767847061,
"logits/rejected": 0.06475332379341125,
"logps/chosen": -331.7200012207031,
"logps/rejected": -341.45245361328125,
"loss": 0.2819,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.025893816724419594,
"rewards/margins": 0.13103850185871124,
"rewards/rejected": -0.15693232417106628,
"step": 250
},
{
"epoch": 0.62,
"learning_rate": 4.3475222930516484e-06,
"logits/chosen": 0.08940346539020538,
"logits/rejected": 0.12766343355178833,
"logps/chosen": -333.33343505859375,
"logps/rejected": -372.55755615234375,
"loss": 0.2833,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.026656050235033035,
"rewards/margins": 0.16600963473320007,
"rewards/rejected": -0.19266566634178162,
"step": 260
},
{
"epoch": 0.65,
"learning_rate": 4.2751759433699745e-06,
"logits/chosen": 0.04847298935055733,
"logits/rejected": 0.11083607375621796,
"logps/chosen": -342.9352722167969,
"logps/rejected": -357.6617736816406,
"loss": 0.274,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.021627375856041908,
"rewards/margins": 0.12919363379478455,
"rewards/rejected": -0.1508210003376007,
"step": 270
},
{
"epoch": 0.67,
"learning_rate": 4.199698658255298e-06,
"logits/chosen": 0.056878913193941116,
"logits/rejected": 0.14858202636241913,
"logps/chosen": -370.22637939453125,
"logps/rejected": -398.57159423828125,
"loss": 0.2715,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.021515587344765663,
"rewards/margins": 0.1492767035961151,
"rewards/rejected": -0.17079228162765503,
"step": 280
},
{
"epoch": 0.7,
"learning_rate": 4.121223559445343e-06,
"logits/chosen": 0.03415738046169281,
"logits/rejected": 0.12577436864376068,
"logps/chosen": -352.68072509765625,
"logps/rejected": -383.16204833984375,
"loss": 0.264,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.03958406671881676,
"rewards/margins": 0.1690487265586853,
"rewards/rejected": -0.20863279700279236,
"step": 290
},
{
"epoch": 0.72,
"learning_rate": 4.039889056019159e-06,
"logits/chosen": 0.02515377476811409,
"logits/rejected": 0.10390216112136841,
"logps/chosen": -353.2736511230469,
"logps/rejected": -353.888671875,
"loss": 0.2461,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.031048249453306198,
"rewards/margins": 0.1348181664943695,
"rewards/rejected": -0.1658664047718048,
"step": 300
},
{
"epoch": 0.74,
"learning_rate": 3.955838600280535e-06,
"logits/chosen": 0.025213222950696945,
"logits/rejected": 0.1410323679447174,
"logps/chosen": -387.21856689453125,
"logps/rejected": -373.70355224609375,
"loss": 0.2703,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.010617800056934357,
"rewards/margins": 0.19538867473602295,
"rewards/rejected": -0.184770867228508,
"step": 310
},
{
"epoch": 0.77,
"learning_rate": 3.869220434746509e-06,
"logits/chosen": 0.06151404231786728,
"logits/rejected": 0.1290605366230011,
"logps/chosen": -345.41571044921875,
"logps/rejected": -370.25592041015625,
"loss": 0.2703,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.019938651472330093,
"rewards/margins": 0.16865777969360352,
"rewards/rejected": -0.1885964572429657,
"step": 320
},
{
"epoch": 0.79,
"learning_rate": 3.7801873306872315e-06,
"logits/chosen": 0.06525089591741562,
"logits/rejected": 0.12144273519515991,
"logps/chosen": -340.03277587890625,
"logps/rejected": -371.6439514160156,
"loss": 0.2577,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.02320241369307041,
"rewards/margins": 0.17125853896141052,
"rewards/rejected": -0.14805614948272705,
"step": 330
},
{
"epoch": 0.82,
"learning_rate": 3.688896318678322e-06,
"logits/chosen": 0.055392809212207794,
"logits/rejected": 0.12697988748550415,
"logps/chosen": -349.14556884765625,
"logps/rejected": -333.9625549316406,
"loss": 0.2748,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.016882654279470444,
"rewards/margins": 0.16859912872314453,
"rewards/rejected": -0.1517164707183838,
"step": 340
},
{
"epoch": 0.84,
"learning_rate": 3.5955084116409382e-06,
"logits/chosen": 0.08919240534305573,
"logits/rejected": 0.1610582321882248,
"logps/chosen": -367.30621337890625,
"logps/rejected": -346.13873291015625,
"loss": 0.2664,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.04106982424855232,
"rewards/margins": 0.14082172513008118,
"rewards/rejected": -0.1818915605545044,
"step": 350
},
{
"epoch": 0.86,
"learning_rate": 3.5001883208580668e-06,
"logits/chosen": 0.056862883269786835,
"logits/rejected": 0.14601710438728333,
"logps/chosen": -383.3697204589844,
"logps/rejected": -388.45147705078125,
"loss": 0.2359,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.005547699984163046,
"rewards/margins": 0.20355132222175598,
"rewards/rejected": -0.20909900963306427,
"step": 360
},
{
"epoch": 0.89,
"learning_rate": 3.403104165467883e-06,
"logits/chosen": 0.047759585082530975,
"logits/rejected": 0.1289873570203781,
"logps/chosen": -363.989990234375,
"logps/rejected": -361.4288330078125,
"loss": 0.2491,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.030249441042542458,
"rewards/margins": 0.1802445650100708,
"rewards/rejected": -0.2104939967393875,
"step": 370
},
{
"epoch": 0.91,
"learning_rate": 3.30442717594657e-06,
"logits/chosen": 0.06461011618375778,
"logits/rejected": 0.14733566343784332,
"logps/chosen": -350.331298828125,
"logps/rejected": -334.6890563964844,
"loss": 0.2754,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.04477550461888313,
"rewards/margins": 0.12478353828191757,
"rewards/rejected": -0.1695590317249298,
"step": 380
},
{
"epoch": 0.94,
"learning_rate": 3.2043313921035747e-06,
"logits/chosen": 0.07650026679039001,
"logits/rejected": 0.10351625829935074,
"logps/chosen": -319.55328369140625,
"logps/rejected": -328.97625732421875,
"loss": 0.2601,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.017551960423588753,
"rewards/margins": 0.1492632031440735,
"rewards/rejected": -0.1668151617050171,
"step": 390
},
{
"epoch": 0.96,
"learning_rate": 3.102993356121938e-06,
"logits/chosen": 0.045068711042404175,
"logits/rejected": 0.133053719997406,
"logps/chosen": -376.1606750488281,
"logps/rejected": -360.3962097167969,
"loss": 0.2547,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.012314733117818832,
"rewards/margins": 0.18502004444599152,
"rewards/rejected": -0.19733479619026184,
"step": 400
},
{
"epoch": 0.98,
"learning_rate": 3.0005918011851245e-06,
"logits/chosen": 0.03985997289419174,
"logits/rejected": 0.1656588464975357,
"logps/chosen": -379.48199462890625,
"logps/rejected": -362.08380126953125,
"loss": 0.273,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.00483871391043067,
"rewards/margins": 0.1729108989238739,
"rewards/rejected": -0.16807220876216888,
"step": 410
},
{
"epoch": 1.01,
"learning_rate": 2.8973073362395e-06,
"logits/chosen": 0.06932957470417023,
"logits/rejected": 0.11695323139429092,
"logps/chosen": -350.8485107421875,
"logps/rejected": -359.5559387207031,
"loss": 0.2562,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.015226135030388832,
"rewards/margins": 0.13259340822696686,
"rewards/rejected": -0.14781954884529114,
"step": 420
},
{
"epoch": 1.03,
"learning_rate": 2.7933221274484725e-06,
"logits/chosen": 0.022776301950216293,
"logits/rejected": 0.1463911086320877,
"logps/chosen": -344.72900390625,
"logps/rejected": -374.57110595703125,
"loss": 0.2546,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.01165957935154438,
"rewards/margins": 0.17338308691978455,
"rewards/rejected": -0.1617235392332077,
"step": 430
},
{
"epoch": 1.06,
"learning_rate": 2.6888195769001147e-06,
"logits/chosen": 0.011232647113502026,
"logits/rejected": 0.08440439403057098,
"logps/chosen": -315.56158447265625,
"logps/rejected": -370.6732177734375,
"loss": 0.2635,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.0027101226150989532,
"rewards/margins": 0.18474070727825165,
"rewards/rejected": -0.1874508261680603,
"step": 440
},
{
"epoch": 1.08,
"learning_rate": 2.583983999134951e-06,
"logits/chosen": 0.033940933644771576,
"logits/rejected": 0.12383987754583359,
"logps/chosen": -353.528076171875,
"logps/rejected": -358.25433349609375,
"loss": 0.2647,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.01790205016732216,
"rewards/margins": 0.16596254706382751,
"rewards/rejected": -0.18386459350585938,
"step": 450
},
{
"epoch": 1.1,
"learning_rate": 2.479000296064417e-06,
"logits/chosen": 0.03699932247400284,
"logits/rejected": 0.13089559972286224,
"logps/chosen": -375.724609375,
"logps/rejected": -400.3955383300781,
"loss": 0.2481,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.041518934071063995,
"rewards/margins": 0.1739250123500824,
"rewards/rejected": -0.21544396877288818,
"step": 460
},
{
"epoch": 1.13,
"learning_rate": 2.374053630853358e-06,
"logits/chosen": 0.07867871224880219,
"logits/rejected": 0.0793570876121521,
"logps/chosen": -392.0462646484375,
"logps/rejected": -398.4570617675781,
"loss": 0.2589,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.03614164516329765,
"rewards/margins": 0.18701379001140594,
"rewards/rejected": -0.2231554538011551,
"step": 470
},
{
"epoch": 1.15,
"learning_rate": 2.269329101341745e-06,
"logits/chosen": 0.04767027124762535,
"logits/rejected": 0.10338594764471054,
"logps/chosen": -311.9175109863281,
"logps/rejected": -353.84375,
"loss": 0.253,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.022265803068876266,
"rewards/margins": 0.21186105906963348,
"rewards/rejected": -0.18959525227546692,
"step": 480
},
{
"epoch": 1.18,
"learning_rate": 2.1650114135816052e-06,
"logits/chosen": 0.04343586042523384,
"logits/rejected": 0.14493630826473236,
"logps/chosen": -368.74066162109375,
"logps/rejected": -401.21746826171875,
"loss": 0.254,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0024279176723212004,
"rewards/margins": 0.1874578297138214,
"rewards/rejected": -0.18988573551177979,
"step": 490
},
{
"epoch": 1.2,
"learning_rate": 2.06128455606496e-06,
"logits/chosen": 0.04143913835287094,
"logits/rejected": 0.06632859259843826,
"logps/chosen": -320.82281494140625,
"logps/rejected": -348.89923095703125,
"loss": 0.2438,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.0026562472339719534,
"rewards/margins": 0.18748678267002106,
"rewards/rejected": -0.19014303386211395,
"step": 500
},
{
"epoch": 1.22,
"learning_rate": 1.958331475217357e-06,
"logits/chosen": 0.03532598540186882,
"logits/rejected": 0.07111676037311554,
"logps/chosen": -345.3083801269531,
"logps/rejected": -391.5373840332031,
"loss": 0.2428,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.011091398075222969,
"rewards/margins": 0.18126052618026733,
"rewards/rejected": -0.19235190749168396,
"step": 510
},
{
"epoch": 1.25,
"learning_rate": 1.856333752729311e-06,
"logits/chosen": 0.06463773548603058,
"logits/rejected": 0.07833746820688248,
"logps/chosen": -303.89508056640625,
"logps/rejected": -328.54095458984375,
"loss": 0.2549,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.030626490712165833,
"rewards/margins": 0.14131976664066315,
"rewards/rejected": -0.17194625735282898,
"step": 520
},
{
"epoch": 1.27,
"learning_rate": 1.7554712852947915e-06,
"logits/chosen": 0.017867419868707657,
"logits/rejected": 0.13077208399772644,
"logps/chosen": -354.83990478515625,
"logps/rejected": -369.40447998046875,
"loss": 0.2688,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.022668231278657913,
"rewards/margins": 0.164995938539505,
"rewards/rejected": -0.1876641809940338,
"step": 530
},
{
"epoch": 1.3,
"learning_rate": 1.6559219673215784e-06,
"logits/chosen": 0.07014649361371994,
"logits/rejected": 0.11957643926143646,
"logps/chosen": -341.1030578613281,
"logps/rejected": -360.0315246582031,
"loss": 0.2559,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.0031127408146858215,
"rewards/margins": 0.17289015650749207,
"rewards/rejected": -0.16977740824222565,
"step": 540
},
{
"epoch": 1.32,
"learning_rate": 1.5578613771731214e-06,
"logits/chosen": 0.044239241629838943,
"logits/rejected": 0.11994221061468124,
"logps/chosen": -347.32757568359375,
"logps/rejected": -388.6127624511719,
"loss": 0.244,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.0042419894598424435,
"rewards/margins": 0.21681733429431915,
"rewards/rejected": -0.22105932235717773,
"step": 550
},
{
"epoch": 1.34,
"learning_rate": 1.4614624674952843e-06,
"logits/chosen": 0.07131338119506836,
"logits/rejected": 0.14118310809135437,
"logps/chosen": -381.21112060546875,
"logps/rejected": -375.3702087402344,
"loss": 0.2594,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.01365007646381855,
"rewards/margins": 0.16313722729682922,
"rewards/rejected": -0.17678730189800262,
"step": 560
},
{
"epoch": 1.37,
"learning_rate": 1.3668952601741442e-06,
"logits/chosen": 0.019948173314332962,
"logits/rejected": 0.14301837980747223,
"logps/chosen": -359.31829833984375,
"logps/rejected": -386.3388366699219,
"loss": 0.2421,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.003145938040688634,
"rewards/margins": 0.17547301948070526,
"rewards/rejected": -0.17861898243427277,
"step": 570
},
{
"epoch": 1.39,
"learning_rate": 1.2743265464628787e-06,
"logits/chosen": 0.04147445410490036,
"logits/rejected": 0.07641445100307465,
"logps/chosen": -358.9191589355469,
"logps/rejected": -354.82989501953125,
"loss": 0.2574,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.03237663954496384,
"rewards/margins": 0.14051951467990875,
"rewards/rejected": -0.17289616167545319,
"step": 580
},
{
"epoch": 1.42,
"learning_rate": 1.1839195928066101e-06,
"logits/chosen": 0.010291008278727531,
"logits/rejected": 0.08601720631122589,
"logps/chosen": -338.0829162597656,
"logps/rejected": -349.2616882324219,
"loss": 0.2504,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.012054244987666607,
"rewards/margins": 0.18035855889320374,
"rewards/rejected": -0.19241279363632202,
"step": 590
},
{
"epoch": 1.44,
"learning_rate": 1.0958338528840893e-06,
"logits/chosen": 0.07830692082643509,
"logits/rejected": 0.1112513542175293,
"logps/chosen": -318.32928466796875,
"logps/rejected": -351.01531982421875,
"loss": 0.2642,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.012662211433053017,
"rewards/margins": 0.15172497928142548,
"rewards/rejected": -0.16438719630241394,
"step": 600
},
{
"epoch": 1.46,
"learning_rate": 1.0102246863740498e-06,
"logits/chosen": 0.013798505067825317,
"logits/rejected": 0.13072696328163147,
"logps/chosen": -326.76336669921875,
"logps/rejected": -380.63458251953125,
"loss": 0.2398,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.0045418571680784225,
"rewards/margins": 0.19731177389621735,
"rewards/rejected": -0.20185360312461853,
"step": 610
},
{
"epoch": 1.49,
"learning_rate": 9.272430849423175e-07,
"logits/chosen": 0.041550200432538986,
"logits/rejected": 0.12003109604120255,
"logps/chosen": -350.9006652832031,
"logps/rejected": -404.7802734375,
"loss": 0.2245,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.015362398698925972,
"rewards/margins": 0.22952251136302948,
"rewards/rejected": -0.21416012942790985,
"step": 620
},
{
"epoch": 1.51,
"learning_rate": 8.470354059328919e-07,
"logits/chosen": 0.104413703083992,
"logits/rejected": 0.11118074506521225,
"logps/chosen": -336.5838928222656,
"logps/rejected": -373.56085205078125,
"loss": 0.2452,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.010279458947479725,
"rewards/margins": 0.2295042723417282,
"rewards/rejected": -0.21922484040260315,
"step": 630
},
{
"epoch": 1.54,
"learning_rate": 7.697431142327633e-07,
"logits/chosen": 0.07976067811250687,
"logits/rejected": 0.12730778753757477,
"logps/chosen": -348.73443603515625,
"logps/rejected": -358.34088134765625,
"loss": 0.2338,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.030282145366072655,
"rewards/margins": 0.16269627213478088,
"rewards/rejected": -0.1929783970117569,
"step": 640
},
{
"epoch": 1.56,
"learning_rate": 6.955025327656839e-07,
"logits/chosen": 0.04196876287460327,
"logits/rejected": 0.11756552755832672,
"logps/chosen": -327.8496398925781,
"logps/rejected": -355.4369201660156,
"loss": 0.2558,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.0020761913619935513,
"rewards/margins": 0.17507974803447723,
"rewards/rejected": -0.17300358414649963,
"step": 650
},
{
"epoch": 1.58,
"learning_rate": 6.244446020550182e-07,
"logits/chosen": 0.05316174030303955,
"logits/rejected": 0.10895484685897827,
"logps/chosen": -354.5049133300781,
"logps/rejected": -411.59765625,
"loss": 0.2319,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.0010157767683267593,
"rewards/margins": 0.21365651488304138,
"rewards/rejected": -0.2146722972393036,
"step": 660
},
{
"epoch": 1.61,
"learning_rate": 5.566946492796766e-07,
"logits/chosen": 0.07230822741985321,
"logits/rejected": 0.09754084050655365,
"logps/chosen": -368.22802734375,
"logps/rejected": -368.54974365234375,
"loss": 0.2451,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.02271811105310917,
"rewards/margins": 0.14353466033935547,
"rewards/rejected": -0.16625277698040009,
"step": 670
},
{
"epoch": 1.63,
"learning_rate": 4.923721672305148e-07,
"logits/chosen": 0.04747115820646286,
"logits/rejected": 0.10951533168554306,
"logps/chosen": -373.25653076171875,
"logps/rejected": -403.66619873046875,
"loss": 0.262,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.2638385164318606e-05,
"rewards/margins": 0.20511355996131897,
"rewards/rejected": -0.2051461637020111,
"step": 680
},
{
"epoch": 1.66,
"learning_rate": 4.3159060355700943e-07,
"logits/chosen": 0.007146243005990982,
"logits/rejected": 0.15595687925815582,
"logps/chosen": -360.5429382324219,
"logps/rejected": -360.84271240234375,
"loss": 0.2528,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.026043469086289406,
"rewards/margins": 0.19069012999534607,
"rewards/rejected": -0.21673360466957092,
"step": 690
},
{
"epoch": 1.68,
"learning_rate": 3.7445716067596506e-07,
"logits/chosen": -0.016133427619934082,
"logits/rejected": 0.06616418063640594,
"logps/chosen": -315.7747497558594,
"logps/rejected": -344.2303771972656,
"loss": 0.242,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.012198897078633308,
"rewards/margins": 0.2178380936384201,
"rewards/rejected": -0.20563916862010956,
"step": 700
},
{
"epoch": 1.7,
"learning_rate": 3.2107260669512334e-07,
"logits/chosen": 0.06611919403076172,
"logits/rejected": 0.08203423768281937,
"logps/chosen": -342.01263427734375,
"logps/rejected": -353.5125427246094,
"loss": 0.2461,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.01212338637560606,
"rewards/margins": 0.17198148369789124,
"rewards/rejected": -0.18410487473011017,
"step": 710
},
{
"epoch": 1.73,
"learning_rate": 2.7153109768518926e-07,
"logits/chosen": 0.05342602729797363,
"logits/rejected": 0.11405602842569351,
"logps/chosen": -393.02593994140625,
"logps/rejected": -416.9335021972656,
"loss": 0.244,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.015018805861473083,
"rewards/margins": 0.2232932150363922,
"rewards/rejected": -0.2383120059967041,
"step": 720
},
{
"epoch": 1.75,
"learning_rate": 2.2592001161370392e-07,
"logits/chosen": 0.059743158519268036,
"logits/rejected": 0.08855228126049042,
"logps/chosen": -365.6115417480469,
"logps/rejected": -373.24310302734375,
"loss": 0.2413,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.007994825020432472,
"rewards/margins": 0.19029465317726135,
"rewards/rejected": -0.19828948378562927,
"step": 730
},
{
"epoch": 1.78,
"learning_rate": 1.8431979423369607e-07,
"logits/chosen": 0.01501550804823637,
"logits/rejected": 0.09877587854862213,
"logps/chosen": -335.7201232910156,
"logps/rejected": -356.1680603027344,
"loss": 0.2601,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.013049180619418621,
"rewards/margins": 0.1567631959915161,
"rewards/rejected": -0.16981235146522522,
"step": 740
},
{
"epoch": 1.8,
"learning_rate": 1.468038171988881e-07,
"logits/chosen": -0.008327131159603596,
"logits/rejected": 0.04639572650194168,
"logps/chosen": -354.1353759765625,
"logps/rejected": -387.98297119140625,
"loss": 0.2595,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.02448558434844017,
"rewards/margins": 0.1901397705078125,
"rewards/rejected": -0.21462532877922058,
"step": 750
},
{
"epoch": 1.82,
"learning_rate": 1.1343824865573422e-07,
"logits/chosen": 0.01856027916073799,
"logits/rejected": 0.07309429347515106,
"logps/chosen": -321.44903564453125,
"logps/rejected": -341.5816955566406,
"loss": 0.2495,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.022448932752013206,
"rewards/margins": 0.17198805510997772,
"rewards/rejected": -0.19443701207637787,
"step": 760
},
{
"epoch": 1.85,
"learning_rate": 8.428193654051036e-08,
"logits/chosen": 0.04589134082198143,
"logits/rejected": 0.10319966077804565,
"logps/chosen": -388.9933776855469,
"logps/rejected": -376.8731994628906,
"loss": 0.2475,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.008324380032718182,
"rewards/margins": 0.20527882874011993,
"rewards/rejected": -0.19695445895195007,
"step": 770
},
{
"epoch": 1.87,
"learning_rate": 5.9386304787299175e-08,
"logits/chosen": 0.03318192437291145,
"logits/rejected": 0.1395682990550995,
"logps/chosen": -377.56622314453125,
"logps/rejected": -377.5900573730469,
"loss": 0.2477,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.0049881902523338795,
"rewards/margins": 0.2095176726579666,
"rewards/rejected": -0.2145058661699295,
"step": 780
},
{
"epoch": 1.9,
"learning_rate": 3.8795262629929e-08,
"logits/chosen": 0.03711915761232376,
"logits/rejected": 0.07861719280481339,
"logps/chosen": -311.10015869140625,
"logps/rejected": -340.22918701171875,
"loss": 0.2288,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.007546453736722469,
"rewards/margins": 0.215033620595932,
"rewards/rejected": -0.20748718082904816,
"step": 790
},
{
"epoch": 1.92,
"learning_rate": 2.2545127157831416e-08,
"logits/chosen": 0.06011080741882324,
"logits/rejected": 0.08075010776519775,
"logps/chosen": -342.993408203125,
"logps/rejected": -338.7896728515625,
"loss": 0.252,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.03219890594482422,
"rewards/margins": 0.15845921635627747,
"rewards/rejected": -0.1906580924987793,
"step": 800
},
{
"epoch": 1.94,
"learning_rate": 1.0664559262413831e-08,
"logits/chosen": 0.06324592232704163,
"logits/rejected": 0.15417756140232086,
"logps/chosen": -383.63238525390625,
"logps/rejected": -373.19720458984375,
"loss": 0.2445,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.013102272525429726,
"rewards/margins": 0.21051840484142303,
"rewards/rejected": -0.2236206978559494,
"step": 810
},
{
"epoch": 1.97,
"learning_rate": 3.1745130869123564e-09,
"logits/chosen": 0.02718031406402588,
"logits/rejected": 0.09324290603399277,
"logps/chosen": -342.188232421875,
"logps/rejected": -382.42657470703125,
"loss": 0.2445,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.02895962819457054,
"rewards/margins": 0.1746593415737152,
"rewards/rejected": -0.20361897349357605,
"step": 820
},
{
"epoch": 1.99,
"learning_rate": 8.819906889168117e-11,
"logits/chosen": 0.07415173202753067,
"logits/rejected": 0.12375295162200928,
"logps/chosen": -362.17572021484375,
"logps/rejected": -372.21044921875,
"loss": 0.2579,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.023642729967832565,
"rewards/margins": 0.1827639937400818,
"rewards/rejected": -0.20640675723552704,
"step": 830
},
{
"epoch": 2.0,
"step": 832,
"total_flos": 0.0,
"train_loss": 0.27172684411589915,
"train_runtime": 11567.6763,
"train_samples_per_second": 3.458,
"train_steps_per_second": 0.072
}
],
"logging_steps": 10,
"max_steps": 832,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}