Jimmy19991222's picture
Upload folder using huggingface_hub
9ff0ce4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9982631930527722,
"eval_steps": 400,
"global_step": 467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01068804275217101,
"grad_norm": 53.52218298444476,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -1.0146243572235107,
"logits/rejected": -0.9850981831550598,
"logps/chosen": -0.27403339743614197,
"logps/rejected": -0.2716384530067444,
"loss": 3.0444,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -2.7403340339660645,
"rewards/margins": -0.02394939959049225,
"rewards/rejected": -2.7163848876953125,
"step": 5
},
{
"epoch": 0.02137608550434202,
"grad_norm": 39.10999969888965,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -1.0449364185333252,
"logits/rejected": -0.9776930809020996,
"logps/chosen": -0.29451489448547363,
"logps/rejected": -0.2995792329311371,
"loss": 3.0211,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -2.9451489448547363,
"rewards/margins": 0.05064352601766586,
"rewards/rejected": -2.9957923889160156,
"step": 10
},
{
"epoch": 0.03206412825651302,
"grad_norm": 53.821066581509214,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -0.9672248959541321,
"logits/rejected": -0.9867329597473145,
"logps/chosen": -0.26386433839797974,
"logps/rejected": -0.30063143372535706,
"loss": 3.0404,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.638643503189087,
"rewards/margins": 0.3676711320877075,
"rewards/rejected": -3.006314516067505,
"step": 15
},
{
"epoch": 0.04275217100868404,
"grad_norm": 86.6542555553414,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -0.9602643847465515,
"logits/rejected": -0.9344671964645386,
"logps/chosen": -0.2776374816894531,
"logps/rejected": -0.29131022095680237,
"loss": 2.9793,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.7763748168945312,
"rewards/margins": 0.13672712445259094,
"rewards/rejected": -2.913102149963379,
"step": 20
},
{
"epoch": 0.053440213760855046,
"grad_norm": 56.919799993589805,
"learning_rate": 5.319148936170212e-07,
"logits/chosen": -1.0135596990585327,
"logits/rejected": -0.9844949841499329,
"logps/chosen": -0.2717221677303314,
"logps/rejected": -0.2782990336418152,
"loss": 3.124,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.717221736907959,
"rewards/margins": 0.06576814502477646,
"rewards/rejected": -2.7829902172088623,
"step": 25
},
{
"epoch": 0.06412825651302605,
"grad_norm": 45.796379698409524,
"learning_rate": 6.382978723404255e-07,
"logits/chosen": -0.9898878931999207,
"logits/rejected": -0.9455238580703735,
"logps/chosen": -0.2733747959136963,
"logps/rejected": -0.279060035943985,
"loss": 2.8977,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -2.733747959136963,
"rewards/margins": 0.05685253068804741,
"rewards/rejected": -2.790600299835205,
"step": 30
},
{
"epoch": 0.07481629926519706,
"grad_norm": 64.64288788170485,
"learning_rate": 7.446808510638297e-07,
"logits/chosen": -1.0491113662719727,
"logits/rejected": -0.9738750457763672,
"logps/chosen": -0.2941775918006897,
"logps/rejected": -0.32069069147109985,
"loss": 2.9119,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.9417757987976074,
"rewards/margins": 0.2651310861110687,
"rewards/rejected": -3.206906795501709,
"step": 35
},
{
"epoch": 0.08550434201736808,
"grad_norm": 60.56769615337976,
"learning_rate": 8.51063829787234e-07,
"logits/chosen": -1.0074384212493896,
"logits/rejected": -0.963466465473175,
"logps/chosen": -0.2797192931175232,
"logps/rejected": -0.3225395083427429,
"loss": 2.9345,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.7971930503845215,
"rewards/margins": 0.4282020032405853,
"rewards/rejected": -3.2253952026367188,
"step": 40
},
{
"epoch": 0.09619238476953908,
"grad_norm": 48.675093440338955,
"learning_rate": 9.574468085106384e-07,
"logits/chosen": -1.0469945669174194,
"logits/rejected": -1.0040814876556396,
"logps/chosen": -0.33255186676979065,
"logps/rejected": -0.38402628898620605,
"loss": 2.9815,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -3.3255183696746826,
"rewards/margins": 0.5147446393966675,
"rewards/rejected": -3.8402628898620605,
"step": 45
},
{
"epoch": 0.10688042752171009,
"grad_norm": 92.08652708998007,
"learning_rate": 9.998741174712533e-07,
"logits/chosen": -1.038892388343811,
"logits/rejected": -0.988103985786438,
"logps/chosen": -0.34245526790618896,
"logps/rejected": -0.38594862818717957,
"loss": 3.0508,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -3.4245529174804688,
"rewards/margins": 0.4349338412284851,
"rewards/rejected": -3.8594863414764404,
"step": 50
},
{
"epoch": 0.11756847027388109,
"grad_norm": 72.54827446103837,
"learning_rate": 9.991050648838675e-07,
"logits/chosen": -1.0567952394485474,
"logits/rejected": -1.0215675830841064,
"logps/chosen": -0.28753459453582764,
"logps/rejected": -0.3490275740623474,
"loss": 2.7982,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.8753461837768555,
"rewards/margins": 0.6149295568466187,
"rewards/rejected": -3.4902758598327637,
"step": 55
},
{
"epoch": 0.1282565130260521,
"grad_norm": 55.56312267177659,
"learning_rate": 9.97637968732563e-07,
"logits/chosen": -1.0922194719314575,
"logits/rejected": -1.059291958808899,
"logps/chosen": -0.3225264847278595,
"logps/rejected": -0.3470703959465027,
"loss": 2.8716,
"rewards/accuracies": 0.5625,
"rewards/chosen": -3.22526478767395,
"rewards/margins": 0.2454390972852707,
"rewards/rejected": -3.4707038402557373,
"step": 60
},
{
"epoch": 0.13894455577822312,
"grad_norm": 54.05440384507174,
"learning_rate": 9.954748808839674e-07,
"logits/chosen": -0.9975064992904663,
"logits/rejected": -0.9689160585403442,
"logps/chosen": -0.37468865513801575,
"logps/rejected": -0.43205341696739197,
"loss": 2.7901,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -3.746886730194092,
"rewards/margins": 0.5736472010612488,
"rewards/rejected": -4.3205342292785645,
"step": 65
},
{
"epoch": 0.14963259853039412,
"grad_norm": 37.66775098927071,
"learning_rate": 9.926188266120295e-07,
"logits/chosen": -1.0229814052581787,
"logits/rejected": -0.9982998967170715,
"logps/chosen": -0.3514581620693207,
"logps/rejected": -0.4274352192878723,
"loss": 2.8718,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -3.5145821571350098,
"rewards/margins": 0.7597699761390686,
"rewards/rejected": -4.274352073669434,
"step": 70
},
{
"epoch": 0.16032064128256512,
"grad_norm": 51.934633835606974,
"learning_rate": 9.890738003669027e-07,
"logits/chosen": -0.9838461875915527,
"logits/rejected": -0.9134309887886047,
"logps/chosen": -0.35928577184677124,
"logps/rejected": -0.4099213182926178,
"loss": 2.8345,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -3.592857837677002,
"rewards/margins": 0.5063551664352417,
"rewards/rejected": -4.099213123321533,
"step": 75
},
{
"epoch": 0.17100868403473615,
"grad_norm": 46.83578017177419,
"learning_rate": 9.848447601883433e-07,
"logits/chosen": -0.9681940078735352,
"logits/rejected": -0.9539217948913574,
"logps/chosen": -0.353752076625824,
"logps/rejected": -0.4523216187953949,
"loss": 2.7878,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -3.53752064704895,
"rewards/margins": 0.985695481300354,
"rewards/rejected": -4.523216247558594,
"step": 80
},
{
"epoch": 0.18169672678690715,
"grad_norm": 54.174949491419966,
"learning_rate": 9.799376207714444e-07,
"logits/chosen": -0.9862138628959656,
"logits/rejected": -0.9641338586807251,
"logps/chosen": -0.3405635952949524,
"logps/rejected": -0.39860305190086365,
"loss": 2.6715,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -3.4056358337402344,
"rewards/margins": 0.5803946852684021,
"rewards/rejected": -3.9860305786132812,
"step": 85
},
{
"epoch": 0.19238476953907815,
"grad_norm": 62.18682762469074,
"learning_rate": 9.743592451943998e-07,
"logits/chosen": -1.028374195098877,
"logits/rejected": -0.9928615689277649,
"logps/chosen": -0.4192899763584137,
"logps/rejected": -0.5028694868087769,
"loss": 2.8803,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -4.192899703979492,
"rewards/margins": 0.8357950448989868,
"rewards/rejected": -5.028695106506348,
"step": 90
},
{
"epoch": 0.20307281229124916,
"grad_norm": 56.712862810919404,
"learning_rate": 9.681174353198686e-07,
"logits/chosen": -1.102429747581482,
"logits/rejected": -1.017956256866455,
"logps/chosen": -0.4515204429626465,
"logps/rejected": -0.49105948209762573,
"loss": 2.7854,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -4.515204429626465,
"rewards/margins": 0.39539000391960144,
"rewards/rejected": -4.910594463348389,
"step": 95
},
{
"epoch": 0.21376085504342018,
"grad_norm": 77.56651991727357,
"learning_rate": 9.612209208833646e-07,
"logits/chosen": -1.0002816915512085,
"logits/rejected": -0.9756115078926086,
"logps/chosen": -0.4405655860900879,
"logps/rejected": -0.5030835866928101,
"loss": 2.8381,
"rewards/accuracies": 0.625,
"rewards/chosen": -4.405655860900879,
"rewards/margins": 0.6251801252365112,
"rewards/rejected": -5.0308356285095215,
"step": 100
},
{
"epoch": 0.22444889779559118,
"grad_norm": 63.78609875386195,
"learning_rate": 9.536793472839324e-07,
"logits/chosen": -1.0079588890075684,
"logits/rejected": -0.9540907144546509,
"logps/chosen": -0.41310757398605347,
"logps/rejected": -0.5235550999641418,
"loss": 2.7704,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -4.131075859069824,
"rewards/margins": 1.1044747829437256,
"rewards/rejected": -5.235550880432129,
"step": 105
},
{
"epoch": 0.23513694054776219,
"grad_norm": 59.92913033519696,
"learning_rate": 9.455032620941839e-07,
"logits/chosen": -0.9624613523483276,
"logits/rejected": -0.9022065997123718,
"logps/chosen": -0.4771413207054138,
"logps/rejected": -0.6054214239120483,
"loss": 2.6684,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -4.771413326263428,
"rewards/margins": 1.2828001976013184,
"rewards/rejected": -6.0542144775390625,
"step": 110
},
{
"epoch": 0.2458249832999332,
"grad_norm": 57.71552130623015,
"learning_rate": 9.367041003085648e-07,
"logits/chosen": -1.0269968509674072,
"logits/rejected": -0.9661616086959839,
"logps/chosen": -0.5121074914932251,
"logps/rejected": -0.578630268573761,
"loss": 2.5559,
"rewards/accuracies": 0.625,
"rewards/chosen": -5.121075630187988,
"rewards/margins": 0.6652273535728455,
"rewards/rejected": -5.7863030433654785,
"step": 115
},
{
"epoch": 0.2565130260521042,
"grad_norm": 64.09249680400335,
"learning_rate": 9.272941683504808e-07,
"logits/chosen": -0.998211681842804,
"logits/rejected": -0.9050429463386536,
"logps/chosen": -0.5254617929458618,
"logps/rejected": -0.7217136025428772,
"loss": 2.4049,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -5.2546186447143555,
"rewards/margins": 1.9625177383422852,
"rewards/rejected": -7.217136383056641,
"step": 120
},
{
"epoch": 0.26720106880427524,
"grad_norm": 54.10213565718134,
"learning_rate": 9.172866268606513e-07,
"logits/chosen": -1.06635320186615,
"logits/rejected": -1.0216171741485596,
"logps/chosen": -0.5953704714775085,
"logps/rejected": -0.6902128458023071,
"loss": 2.3251,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -5.953704833984375,
"rewards/margins": 0.9484230875968933,
"rewards/rejected": -6.902127742767334,
"step": 125
},
{
"epoch": 0.27788911155644624,
"grad_norm": 87.63946362541415,
"learning_rate": 9.066954722907638e-07,
"logits/chosen": -1.0916361808776855,
"logits/rejected": -1.085458517074585,
"logps/chosen": -0.588487446308136,
"logps/rejected": -0.8501450419425964,
"loss": 2.1826,
"rewards/accuracies": 0.75,
"rewards/chosen": -5.884873867034912,
"rewards/margins": 2.6165759563446045,
"rewards/rejected": -8.501450538635254,
"step": 130
},
{
"epoch": 0.28857715430861725,
"grad_norm": 66.78226800807278,
"learning_rate": 8.955355173281707e-07,
"logits/chosen": -1.067176342010498,
"logits/rejected": -1.0195186138153076,
"logps/chosen": -0.6727192401885986,
"logps/rejected": -0.8245170712471008,
"loss": 2.1861,
"rewards/accuracies": 0.71875,
"rewards/chosen": -6.7271928787231445,
"rewards/margins": 1.5179781913757324,
"rewards/rejected": -8.245170593261719,
"step": 135
},
{
"epoch": 0.29926519706078825,
"grad_norm": 77.9071558548112,
"learning_rate": 8.838223701790055e-07,
"logits/chosen": -1.1569595336914062,
"logits/rejected": -1.1336597204208374,
"logps/chosen": -0.785293698310852,
"logps/rejected": -0.9337224960327148,
"loss": 2.1564,
"rewards/accuracies": 0.71875,
"rewards/chosen": -7.852936744689941,
"rewards/margins": 1.4842884540557861,
"rewards/rejected": -9.337224960327148,
"step": 140
},
{
"epoch": 0.30995323981295925,
"grad_norm": 109.08809267522398,
"learning_rate": 8.71572412738697e-07,
"logits/chosen": -1.0582095384597778,
"logits/rejected": -1.03193998336792,
"logps/chosen": -0.827114462852478,
"logps/rejected": -1.0477594137191772,
"loss": 2.0118,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -8.27114486694336,
"rewards/margins": 2.206449508666992,
"rewards/rejected": -10.477594375610352,
"step": 145
},
{
"epoch": 0.32064128256513025,
"grad_norm": 72.02715367718524,
"learning_rate": 8.588027776804058e-07,
"logits/chosen": -1.0821495056152344,
"logits/rejected": -1.0622715950012207,
"logps/chosen": -0.9100320935249329,
"logps/rejected": -1.1453698873519897,
"loss": 2.0273,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -9.100319862365723,
"rewards/margins": 2.353379011154175,
"rewards/rejected": -11.453699111938477,
"step": 150
},
{
"epoch": 0.33132932531730125,
"grad_norm": 70.83089987980944,
"learning_rate": 8.455313244934324e-07,
"logits/chosen": -1.092185139656067,
"logits/rejected": -1.070657730102539,
"logps/chosen": -0.9754332304000854,
"logps/rejected": -1.2774028778076172,
"loss": 2.0633,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -9.754331588745117,
"rewards/margins": 3.0196967124938965,
"rewards/rejected": -12.774029731750488,
"step": 155
},
{
"epoch": 0.3420173680694723,
"grad_norm": 89.85353120616982,
"learning_rate": 8.317766145051057e-07,
"logits/chosen": -1.107634425163269,
"logits/rejected": -1.0893046855926514,
"logps/chosen": -1.0988253355026245,
"logps/rejected": -1.4862325191497803,
"loss": 2.0523,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -10.988253593444824,
"rewards/margins": 3.8740711212158203,
"rewards/rejected": -14.862322807312012,
"step": 160
},
{
"epoch": 0.3527054108216433,
"grad_norm": 60.873789300571126,
"learning_rate": 8.175578849210894e-07,
"logits/chosen": -1.1269104480743408,
"logits/rejected": -1.1010853052139282,
"logps/chosen": -1.0796130895614624,
"logps/rejected": -1.4461021423339844,
"loss": 1.8838,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -10.796131134033203,
"rewards/margins": 3.6648898124694824,
"rewards/rejected": -14.461019515991211,
"step": 165
},
{
"epoch": 0.3633934535738143,
"grad_norm": 77.23211870911884,
"learning_rate": 8.028950219204099e-07,
"logits/chosen": -1.1438876390457153,
"logits/rejected": -1.1206210851669312,
"logps/chosen": -1.0647801160812378,
"logps/rejected": -1.4476187229156494,
"loss": 1.8488,
"rewards/accuracies": 0.78125,
"rewards/chosen": -10.647802352905273,
"rewards/margins": 3.828385591506958,
"rewards/rejected": -14.476186752319336,
"step": 170
},
{
"epoch": 0.3740814963259853,
"grad_norm": 97.35090322598491,
"learning_rate": 7.878085328428368e-07,
"logits/chosen": -1.157462239265442,
"logits/rejected": -1.1056431531906128,
"logps/chosen": -1.1336826086044312,
"logps/rejected": -1.3956897258758545,
"loss": 1.7083,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -11.336827278137207,
"rewards/margins": 2.620070695877075,
"rewards/rejected": -13.956896781921387,
"step": 175
},
{
"epoch": 0.3847695390781563,
"grad_norm": 70.60533034676232,
"learning_rate": 7.723195175075135e-07,
"logits/chosen": -1.1124871969223022,
"logits/rejected": -1.0904567241668701,
"logps/chosen": -1.0966602563858032,
"logps/rejected": -1.4549492597579956,
"loss": 1.6569,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -10.966601371765137,
"rewards/margins": 3.5828919410705566,
"rewards/rejected": -14.549494743347168,
"step": 180
},
{
"epoch": 0.3954575818303273,
"grad_norm": 84.92007593834019,
"learning_rate": 7.564496387029531e-07,
"logits/chosen": -1.1567202806472778,
"logits/rejected": -1.0984870195388794,
"logps/chosen": -1.1207507848739624,
"logps/rejected": -1.5250511169433594,
"loss": 1.6817,
"rewards/accuracies": 0.84375,
"rewards/chosen": -11.20750904083252,
"rewards/margins": 4.043001651763916,
"rewards/rejected": -15.250509262084961,
"step": 185
},
{
"epoch": 0.4061456245824983,
"grad_norm": 82.82215861540205,
"learning_rate": 7.402210918896689e-07,
"logits/chosen": -1.1628299951553345,
"logits/rejected": -1.170377492904663,
"logps/chosen": -1.235033392906189,
"logps/rejected": -1.7156970500946045,
"loss": 1.5387,
"rewards/accuracies": 0.84375,
"rewards/chosen": -12.350334167480469,
"rewards/margins": 4.806637763977051,
"rewards/rejected": -17.156970977783203,
"step": 190
},
{
"epoch": 0.4168336673346693,
"grad_norm": 70.50682719627838,
"learning_rate": 7.236565741578162e-07,
"logits/chosen": -1.1164333820343018,
"logits/rejected": -1.0961400270462036,
"logps/chosen": -1.2593460083007812,
"logps/rejected": -1.6189504861831665,
"loss": 1.6047,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -12.593461036682129,
"rewards/margins": 3.5960440635681152,
"rewards/rejected": -16.189504623413086,
"step": 195
},
{
"epoch": 0.42752171008684037,
"grad_norm": 97.28442308133118,
"learning_rate": 7.067792524832603e-07,
"logits/chosen": -1.1036303043365479,
"logits/rejected": -1.0911258459091187,
"logps/chosen": -1.324706792831421,
"logps/rejected": -1.7423721551895142,
"loss": 1.5626,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -13.247068405151367,
"rewards/margins": 4.176652908325195,
"rewards/rejected": -17.42371940612793,
"step": 200
},
{
"epoch": 0.43820975283901137,
"grad_norm": 85.54406338680343,
"learning_rate": 6.896127313264642e-07,
"logits/chosen": -1.154517650604248,
"logits/rejected": -1.101162314414978,
"logps/chosen": -1.40175461769104,
"logps/rejected": -1.8435806035995483,
"loss": 1.7321,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -14.017547607421875,
"rewards/margins": 4.418261528015137,
"rewards/rejected": -18.435808181762695,
"step": 205
},
{
"epoch": 0.44889779559118237,
"grad_norm": 103.7420052940262,
"learning_rate": 6.721810196195174e-07,
"logits/chosen": -1.1762125492095947,
"logits/rejected": -1.1645376682281494,
"logps/chosen": -1.4059008359909058,
"logps/rejected": -1.826703429222107,
"loss": 1.6602,
"rewards/accuracies": 0.84375,
"rewards/chosen": -14.05901050567627,
"rewards/margins": 4.208024024963379,
"rewards/rejected": -18.267032623291016,
"step": 210
},
{
"epoch": 0.45958583834335337,
"grad_norm": 115.44925865991426,
"learning_rate": 6.545084971874736e-07,
"logits/chosen": -1.1282669305801392,
"logits/rejected": -1.1098558902740479,
"logps/chosen": -1.4084670543670654,
"logps/rejected": -1.8751609325408936,
"loss": 1.5529,
"rewards/accuracies": 0.8125,
"rewards/chosen": -14.084672927856445,
"rewards/margins": 4.666939735412598,
"rewards/rejected": -18.751609802246094,
"step": 215
},
{
"epoch": 0.47027388109552437,
"grad_norm": 94.83729222797992,
"learning_rate": 6.3661988065096e-07,
"logits/chosen": -1.191811442375183,
"logits/rejected": -1.1707171201705933,
"logps/chosen": -1.4761518239974976,
"logps/rejected": -1.9565551280975342,
"loss": 1.5052,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -14.761518478393555,
"rewards/margins": 4.804032325744629,
"rewards/rejected": -19.5655517578125,
"step": 220
},
{
"epoch": 0.48096192384769537,
"grad_norm": 69.08600083744463,
"learning_rate": 6.185401888577487e-07,
"logits/chosen": -1.171081304550171,
"logits/rejected": -1.1351138353347778,
"logps/chosen": -1.4978052377700806,
"logps/rejected": -1.93888258934021,
"loss": 1.4742,
"rewards/accuracies": 0.78125,
"rewards/chosen": -14.978052139282227,
"rewards/margins": 4.410772800445557,
"rewards/rejected": -19.388826370239258,
"step": 225
},
{
"epoch": 0.4916499665998664,
"grad_norm": 72.13177261697588,
"learning_rate": 6.002947078916364e-07,
"logits/chosen": -1.245228886604309,
"logits/rejected": -1.1923692226409912,
"logps/chosen": -1.4302809238433838,
"logps/rejected": -1.8505923748016357,
"loss": 1.4317,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -14.302810668945312,
"rewards/margins": 4.203113555908203,
"rewards/rejected": -18.505924224853516,
"step": 230
},
{
"epoch": 0.5023380093520374,
"grad_norm": 78.71892029667256,
"learning_rate": 5.819089557075688e-07,
"logits/chosen": -1.2821385860443115,
"logits/rejected": -1.2519080638885498,
"logps/chosen": -1.4527919292449951,
"logps/rejected": -1.9279251098632812,
"loss": 1.4134,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -14.527920722961426,
"rewards/margins": 4.751331329345703,
"rewards/rejected": -19.279251098632812,
"step": 235
},
{
"epoch": 0.5130260521042084,
"grad_norm": 87.04649782214463,
"learning_rate": 5.634086464424742e-07,
"logits/chosen": -1.245603322982788,
"logits/rejected": -1.247234582901001,
"logps/chosen": -1.3713314533233643,
"logps/rejected": -1.8449758291244507,
"loss": 1.4346,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -13.7133150100708,
"rewards/margins": 4.736443042755127,
"rewards/rejected": -18.449758529663086,
"step": 240
},
{
"epoch": 0.5237140948563794,
"grad_norm": 120.6181547874012,
"learning_rate": 5.448196544517167e-07,
"logits/chosen": -1.3496326208114624,
"logits/rejected": -1.2907614707946777,
"logps/chosen": -1.440033197402954,
"logps/rejected": -2.0060055255889893,
"loss": 1.4071,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -14.4003324508667,
"rewards/margins": 5.659722328186035,
"rewards/rejected": -20.060054779052734,
"step": 245
},
{
"epoch": 0.5344021376085505,
"grad_norm": 127.16635817286267,
"learning_rate": 5.26167978121472e-07,
"logits/chosen": -1.2846823930740356,
"logits/rejected": -1.2672080993652344,
"logps/chosen": -1.5308005809783936,
"logps/rejected": -2.1003577709198,
"loss": 1.3208,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.308004379272461,
"rewards/margins": 5.695572853088379,
"rewards/rejected": -21.003578186035156,
"step": 250
},
{
"epoch": 0.5450901803607214,
"grad_norm": 306.6500775815346,
"learning_rate": 5.074797035076318e-07,
"logits/chosen": -1.3492941856384277,
"logits/rejected": -1.3214812278747559,
"logps/chosen": -1.6527442932128906,
"logps/rejected": -2.1238582134246826,
"loss": 1.4957,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -16.527442932128906,
"rewards/margins": 4.711141586303711,
"rewards/rejected": -21.238582611083984,
"step": 255
},
{
"epoch": 0.5557782231128925,
"grad_norm": 89.88872208917493,
"learning_rate": 4.887809678520975e-07,
"logits/chosen": -1.3080346584320068,
"logits/rejected": -1.2766286134719849,
"logps/chosen": -1.5681380033493042,
"logps/rejected": -2.0582587718963623,
"loss": 1.4255,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -15.681379318237305,
"rewards/margins": 4.901208877563477,
"rewards/rejected": -20.58258819580078,
"step": 260
},
{
"epoch": 0.5664662658650634,
"grad_norm": 86.94866969630735,
"learning_rate": 4.700979230274829e-07,
"logits/chosen": -1.2753899097442627,
"logits/rejected": -1.2558867931365967,
"logps/chosen": -1.668534278869629,
"logps/rejected": -2.181380271911621,
"loss": 1.4204,
"rewards/accuracies": 0.8125,
"rewards/chosen": -16.68534278869629,
"rewards/margins": 5.1284589767456055,
"rewards/rejected": -21.813800811767578,
"step": 265
},
{
"epoch": 0.5771543086172345,
"grad_norm": 144.84472573271995,
"learning_rate": 4.514566989613559e-07,
"logits/chosen": -1.2718496322631836,
"logits/rejected": -1.2400305271148682,
"logps/chosen": -1.4821763038635254,
"logps/rejected": -2.010958194732666,
"loss": 1.3379,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -14.82176399230957,
"rewards/margins": 5.287820816040039,
"rewards/rejected": -20.109582901000977,
"step": 270
},
{
"epoch": 0.5878423513694054,
"grad_norm": 73.43309027045284,
"learning_rate": 4.328833670911724e-07,
"logits/chosen": -1.2501633167266846,
"logits/rejected": -1.2101550102233887,
"logps/chosen": -1.4654467105865479,
"logps/rejected": -1.9191405773162842,
"loss": 1.4963,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -14.654467582702637,
"rewards/margins": 4.536937713623047,
"rewards/rejected": -19.191404342651367,
"step": 275
},
{
"epoch": 0.5985303941215765,
"grad_norm": 118.57888241178858,
"learning_rate": 4.144039039010124e-07,
"logits/chosen": -1.3355966806411743,
"logits/rejected": -1.3093878030776978,
"logps/chosen": -1.5047754049301147,
"logps/rejected": -2.050473213195801,
"loss": 1.3782,
"rewards/accuracies": 0.8125,
"rewards/chosen": -15.047755241394043,
"rewards/margins": 5.456977844238281,
"rewards/rejected": -20.50473403930664,
"step": 280
},
{
"epoch": 0.6092184368737475,
"grad_norm": 100.79377019073691,
"learning_rate": 3.960441545911204e-07,
"logits/chosen": -1.307716965675354,
"logits/rejected": -1.2712657451629639,
"logps/chosen": -1.5421284437179565,
"logps/rejected": -2.111297845840454,
"loss": 1.1768,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -15.421285629272461,
"rewards/margins": 5.6916913986206055,
"rewards/rejected": -21.112977981567383,
"step": 285
},
{
"epoch": 0.6199064796259185,
"grad_norm": 102.72150408454053,
"learning_rate": 3.778297969310529e-07,
"logits/chosen": -1.333150863647461,
"logits/rejected": -1.2860305309295654,
"logps/chosen": -1.5572869777679443,
"logps/rejected": -2.028750419616699,
"loss": 1.3993,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -15.572871208190918,
"rewards/margins": 4.714633464813232,
"rewards/rejected": -20.287504196166992,
"step": 290
},
{
"epoch": 0.6305945223780896,
"grad_norm": 96.32710532692002,
"learning_rate": 3.5978630534699865e-07,
"logits/chosen": -1.2499698400497437,
"logits/rejected": -1.2331962585449219,
"logps/chosen": -1.5715104341506958,
"logps/rejected": -2.08168625831604,
"loss": 1.2236,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -15.715105056762695,
"rewards/margins": 5.101758003234863,
"rewards/rejected": -20.81686019897461,
"step": 295
},
{
"epoch": 0.6412825651302605,
"grad_norm": 86.57462147935358,
"learning_rate": 3.4193891529348795e-07,
"logits/chosen": -1.1837140321731567,
"logits/rejected": -1.156435251235962,
"logps/chosen": -1.662043809890747,
"logps/rejected": -2.1141371726989746,
"loss": 1.6291,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -16.620437622070312,
"rewards/margins": 4.52093505859375,
"rewards/rejected": -21.141372680664062,
"step": 300
},
{
"epoch": 0.6519706078824316,
"grad_norm": 80.28274687652879,
"learning_rate": 3.243125879593286e-07,
"logits/chosen": -1.2831798791885376,
"logits/rejected": -1.2358052730560303,
"logps/chosen": -1.636275053024292,
"logps/rejected": -2.093479871749878,
"loss": 1.3641,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -16.362751007080078,
"rewards/margins": 4.572048664093018,
"rewards/rejected": -20.93480110168457,
"step": 305
},
{
"epoch": 0.6626586506346025,
"grad_norm": 96.95154393343023,
"learning_rate": 3.069319753571269e-07,
"logits/chosen": -1.3118140697479248,
"logits/rejected": -1.2903715372085571,
"logps/chosen": -1.6529546976089478,
"logps/rejected": -2.148355007171631,
"loss": 1.4766,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -16.529544830322266,
"rewards/margins": 4.954004764556885,
"rewards/rejected": -21.483551025390625,
"step": 310
},
{
"epoch": 0.6733466933867736,
"grad_norm": 91.51736686071692,
"learning_rate": 2.898213858452173e-07,
"logits/chosen": -1.3063311576843262,
"logits/rejected": -1.2485519647598267,
"logps/chosen": -1.6333932876586914,
"logps/rejected": -2.1507859230041504,
"loss": 1.3963,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -16.333934783935547,
"rewards/margins": 5.173925876617432,
"rewards/rejected": -21.507858276367188,
"step": 315
},
{
"epoch": 0.6840347361389446,
"grad_norm": 97.07913178610919,
"learning_rate": 2.730047501302266e-07,
"logits/chosen": -1.2934232950210571,
"logits/rejected": -1.2893450260162354,
"logps/chosen": -1.6584867238998413,
"logps/rejected": -2.2650082111358643,
"loss": 1.3115,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -16.584867477416992,
"rewards/margins": 6.065215110778809,
"rewards/rejected": -22.650081634521484,
"step": 320
},
{
"epoch": 0.6947227788911156,
"grad_norm": 65.85264295945626,
"learning_rate": 2.5650558779781635e-07,
"logits/chosen": -1.326992392539978,
"logits/rejected": -1.2749508619308472,
"logps/chosen": -1.7087081670761108,
"logps/rejected": -2.377331256866455,
"loss": 1.307,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -17.087081909179688,
"rewards/margins": 6.6862287521362305,
"rewards/rejected": -23.773311614990234,
"step": 325
},
{
"epoch": 0.7054108216432866,
"grad_norm": 71.08108071468983,
"learning_rate": 2.403469744184154e-07,
"logits/chosen": -1.2321017980575562,
"logits/rejected": -1.1879392862319946,
"logps/chosen": -1.6843183040618896,
"logps/rejected": -2.170222759246826,
"loss": 1.3597,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -16.843183517456055,
"rewards/margins": 4.859041690826416,
"rewards/rejected": -21.702226638793945,
"step": 330
},
{
"epoch": 0.7160988643954576,
"grad_norm": 85.80290375242986,
"learning_rate": 2.2455150927394878e-07,
"logits/chosen": -1.2848079204559326,
"logits/rejected": -1.2643808126449585,
"logps/chosen": -1.6716737747192383,
"logps/rejected": -2.2179079055786133,
"loss": 1.2118,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -16.716739654541016,
"rewards/margins": 5.46234130859375,
"rewards/rejected": -22.179079055786133,
"step": 335
},
{
"epoch": 0.7267869071476286,
"grad_norm": 106.87884023285183,
"learning_rate": 2.0914128375069722e-07,
"logits/chosen": -1.3009603023529053,
"logits/rejected": -1.2630964517593384,
"logps/chosen": -1.5984188318252563,
"logps/rejected": -2.146073579788208,
"loss": 1.3799,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.984187126159668,
"rewards/margins": 5.476546287536621,
"rewards/rejected": -21.460735321044922,
"step": 340
},
{
"epoch": 0.7374749498997996,
"grad_norm": 81.57738599240237,
"learning_rate": 1.9413785044249676e-07,
"logits/chosen": -1.3159044981002808,
"logits/rejected": -1.2908227443695068,
"logps/chosen": -1.6741054058074951,
"logps/rejected": -2.3151228427886963,
"loss": 1.4007,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -16.741056442260742,
"rewards/margins": 6.4101715087890625,
"rewards/rejected": -23.151227951049805,
"step": 345
},
{
"epoch": 0.7481629926519706,
"grad_norm": 137.46788470613842,
"learning_rate": 1.7956219300748792e-07,
"logits/chosen": -1.3111270666122437,
"logits/rejected": -1.3133299350738525,
"logps/chosen": -1.55172860622406,
"logps/rejected": -2.0665595531463623,
"loss": 1.3291,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -15.51728630065918,
"rewards/margins": 5.148309707641602,
"rewards/rejected": -20.66559410095215,
"step": 350
},
{
"epoch": 0.7588510354041417,
"grad_norm": 73.0747912837978,
"learning_rate": 1.6543469682057104e-07,
"logits/chosen": -1.2305195331573486,
"logits/rejected": -1.2432688474655151,
"logps/chosen": -1.5626884698867798,
"logps/rejected": -2.1072001457214355,
"loss": 1.159,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.626884460449219,
"rewards/margins": 5.445114612579346,
"rewards/rejected": -21.071998596191406,
"step": 355
},
{
"epoch": 0.7695390781563126,
"grad_norm": 82.19549372560476,
"learning_rate": 1.5177512046261666e-07,
"logits/chosen": -1.2950479984283447,
"logits/rejected": -1.2918254137039185,
"logps/chosen": -1.5626431703567505,
"logps/rejected": -2.192157030105591,
"loss": 1.3653,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -15.626432418823242,
"rewards/margins": 6.295140266418457,
"rewards/rejected": -21.921573638916016,
"step": 360
},
{
"epoch": 0.7802271209084837,
"grad_norm": 82.26556152038766,
"learning_rate": 1.3860256808630427e-07,
"logits/chosen": -1.3408092260360718,
"logits/rejected": -1.2676836252212524,
"logps/chosen": -1.621119737625122,
"logps/rejected": -2.2568397521972656,
"loss": 1.2936,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -16.211196899414062,
"rewards/margins": 6.35720157623291,
"rewards/rejected": -22.56839942932129,
"step": 365
},
{
"epoch": 0.7909151636606546,
"grad_norm": 97.91298047906564,
"learning_rate": 1.2593546269723647e-07,
"logits/chosen": -1.2643686532974243,
"logits/rejected": -1.2498524188995361,
"logps/chosen": -1.578148603439331,
"logps/rejected": -2.067432403564453,
"loss": 1.3095,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.781486511230469,
"rewards/margins": 4.892836093902588,
"rewards/rejected": -20.6743221282959,
"step": 370
},
{
"epoch": 0.8016032064128257,
"grad_norm": 76.40375667456833,
"learning_rate": 1.1379152038770029e-07,
"logits/chosen": -1.2894870042800903,
"logits/rejected": -1.2930238246917725,
"logps/chosen": -1.7195911407470703,
"logps/rejected": -2.293926954269409,
"loss": 1.2661,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -17.195911407470703,
"rewards/margins": 5.7433576583862305,
"rewards/rejected": -22.939268112182617,
"step": 375
},
{
"epoch": 0.8122912491649966,
"grad_norm": 128.55014662844385,
"learning_rate": 1.0218772555910954e-07,
"logits/chosen": -1.3014891147613525,
"logits/rejected": -1.2802826166152954,
"logps/chosen": -1.5879671573638916,
"logps/rejected": -2.113447666168213,
"loss": 1.4202,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -15.879669189453125,
"rewards/margins": 5.25480842590332,
"rewards/rejected": -21.134477615356445,
"step": 380
},
{
"epoch": 0.8229792919171677,
"grad_norm": 77.57546829061782,
"learning_rate": 9.114030716778432e-08,
"logits/chosen": -1.310450792312622,
"logits/rejected": -1.2848607301712036,
"logps/chosen": -1.6349436044692993,
"logps/rejected": -2.3224172592163086,
"loss": 1.1354,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -16.349435806274414,
"rewards/margins": 6.8747382164001465,
"rewards/rejected": -23.224172592163086,
"step": 385
},
{
"epoch": 0.8336673346693386,
"grad_norm": 75.76498018298135,
"learning_rate": 8.066471602728803e-08,
"logits/chosen": -1.3069926500320435,
"logits/rejected": -1.289568305015564,
"logps/chosen": -1.699163794517517,
"logps/rejected": -2.2884535789489746,
"loss": 1.3105,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -16.99163818359375,
"rewards/margins": 5.8928985595703125,
"rewards/rejected": -22.884536743164062,
"step": 390
},
{
"epoch": 0.8443553774215097,
"grad_norm": 74.3951066976334,
"learning_rate": 7.077560319906694e-08,
"logits/chosen": -1.3087493181228638,
"logits/rejected": -1.2855933904647827,
"logps/chosen": -1.6192277669906616,
"logps/rejected": -2.1721370220184326,
"loss": 1.2688,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -16.192277908325195,
"rewards/margins": 5.5290937423706055,
"rewards/rejected": -21.721370697021484,
"step": 395
},
{
"epoch": 0.8550434201736807,
"grad_norm": 63.10639530225684,
"learning_rate": 6.148679950161672e-08,
"logits/chosen": -1.3169952630996704,
"logits/rejected": -1.2985506057739258,
"logps/chosen": -1.6467092037200928,
"logps/rejected": -2.164301633834839,
"loss": 1.2114,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -16.467090606689453,
"rewards/margins": 5.17592716217041,
"rewards/rejected": -21.643016815185547,
"step": 400
},
{
"epoch": 0.8550434201736807,
"eval_logits/chosen": -1.4850261211395264,
"eval_logits/rejected": -1.493988037109375,
"eval_logps/chosen": -1.664995551109314,
"eval_logps/rejected": -2.2206830978393555,
"eval_loss": 1.2490928173065186,
"eval_rewards/accuracies": 0.8414633870124817,
"eval_rewards/chosen": -16.64995574951172,
"eval_rewards/margins": 5.556875228881836,
"eval_rewards/rejected": -22.206830978393555,
"eval_runtime": 95.4555,
"eval_samples_per_second": 20.544,
"eval_steps_per_second": 1.289,
"step": 400
},
{
"epoch": 0.8657314629258517,
"grad_norm": 102.60713365281785,
"learning_rate": 5.2811296166831666e-08,
"logits/chosen": -1.267327904701233,
"logits/rejected": -1.2850300073623657,
"logps/chosen": -1.7324796915054321,
"logps/rejected": -2.2837843894958496,
"loss": 1.2554,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -17.32479476928711,
"rewards/margins": 5.513047218322754,
"rewards/rejected": -22.83784294128418,
"step": 405
},
{
"epoch": 0.8764195056780227,
"grad_norm": 159.07261192162792,
"learning_rate": 4.4761226670592066e-08,
"logits/chosen": -1.2908105850219727,
"logits/rejected": -1.2769014835357666,
"logps/chosen": -1.6668212413787842,
"logps/rejected": -2.2075092792510986,
"loss": 1.3804,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.668210983276367,
"rewards/margins": 5.406882286071777,
"rewards/rejected": -22.075092315673828,
"step": 410
},
{
"epoch": 0.8871075484301937,
"grad_norm": 76.85488373819665,
"learning_rate": 3.734784976300165e-08,
"logits/chosen": -1.2890928983688354,
"logits/rejected": -1.2320820093154907,
"logps/chosen": -1.5973718166351318,
"logps/rejected": -2.237947940826416,
"loss": 1.4163,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -15.973716735839844,
"rewards/margins": 6.40576171875,
"rewards/rejected": -22.37947654724121,
"step": 415
},
{
"epoch": 0.8977955911823647,
"grad_norm": 92.42320617715352,
"learning_rate": 3.058153372200695e-08,
"logits/chosen": -1.3191107511520386,
"logits/rejected": -1.2656759023666382,
"logps/chosen": -1.5610657930374146,
"logps/rejected": -2.152204990386963,
"loss": 1.2658,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -15.610658645629883,
"rewards/margins": 5.911390781402588,
"rewards/rejected": -21.522048950195312,
"step": 420
},
{
"epoch": 0.9084836339345357,
"grad_norm": 102.84147971960329,
"learning_rate": 2.4471741852423233e-08,
"logits/chosen": -1.3186463117599487,
"logits/rejected": -1.3073859214782715,
"logps/chosen": -1.736202597618103,
"logps/rejected": -2.2703185081481934,
"loss": 1.4248,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -17.362024307250977,
"rewards/margins": 5.34116268157959,
"rewards/rejected": -22.70318603515625,
"step": 425
},
{
"epoch": 0.9191716766867067,
"grad_norm": 108.04777919102577,
"learning_rate": 1.9027019250647036e-08,
"logits/chosen": -1.2982522249221802,
"logits/rejected": -1.2813619375228882,
"logps/chosen": -1.7414271831512451,
"logps/rejected": -2.3307671546936035,
"loss": 1.2802,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.41427230834961,
"rewards/margins": 5.893403053283691,
"rewards/rejected": -23.30767250061035,
"step": 430
},
{
"epoch": 0.9298597194388778,
"grad_norm": 85.6236171514638,
"learning_rate": 1.4254980853566246e-08,
"logits/chosen": -1.262458324432373,
"logits/rejected": -1.2183687686920166,
"logps/chosen": -1.5962882041931152,
"logps/rejected": -2.182863712310791,
"loss": 1.222,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -15.962882995605469,
"rewards/margins": 5.865753650665283,
"rewards/rejected": -21.828638076782227,
"step": 435
},
{
"epoch": 0.9405477621910487,
"grad_norm": 88.93173263482028,
"learning_rate": 1.016230078838226e-08,
"logits/chosen": -1.2786242961883545,
"logits/rejected": -1.2167497873306274,
"logps/chosen": -1.7170331478118896,
"logps/rejected": -2.2510578632354736,
"loss": 1.2694,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -17.170331954956055,
"rewards/margins": 5.340245723724365,
"rewards/rejected": -22.510578155517578,
"step": 440
},
{
"epoch": 0.9512358049432198,
"grad_norm": 80.06878550984797,
"learning_rate": 6.754703038239329e-09,
"logits/chosen": -1.229853868484497,
"logits/rejected": -1.2106773853302002,
"logps/chosen": -1.688746690750122,
"logps/rejected": -2.332123279571533,
"loss": 1.1496,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -16.88746452331543,
"rewards/margins": 6.433764457702637,
"rewards/rejected": -23.321231842041016,
"step": 445
},
{
"epoch": 0.9619238476953907,
"grad_norm": 87.87225651237878,
"learning_rate": 4.036953436716895e-09,
"logits/chosen": -1.3426064252853394,
"logits/rejected": -1.3202402591705322,
"logps/chosen": -1.6350570917129517,
"logps/rejected": -2.1853957176208496,
"loss": 1.3199,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -16.350570678710938,
"rewards/margins": 5.503388404846191,
"rewards/rejected": -21.853958129882812,
"step": 450
},
{
"epoch": 0.9726118904475618,
"grad_norm": 97.584405727653,
"learning_rate": 2.0128530023804656e-09,
"logits/chosen": -1.3094408512115479,
"logits/rejected": -1.2717828750610352,
"logps/chosen": -1.6554279327392578,
"logps/rejected": -2.304875373840332,
"loss": 1.0871,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -16.554279327392578,
"rewards/margins": 6.494471549987793,
"rewards/rejected": -23.048751831054688,
"step": 455
},
{
"epoch": 0.9832999331997327,
"grad_norm": 94.99231466494224,
"learning_rate": 6.852326227130833e-10,
"logits/chosen": -1.3035484552383423,
"logits/rejected": -1.2918545007705688,
"logps/chosen": -1.7271077632904053,
"logps/rejected": -2.327470541000366,
"loss": 1.2419,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -17.27107810974121,
"rewards/margins": 6.003628730773926,
"rewards/rejected": -23.27470588684082,
"step": 460
},
{
"epoch": 0.9939879759519038,
"grad_norm": 86.3036149732278,
"learning_rate": 5.594909486328348e-11,
"logits/chosen": -1.286787986755371,
"logits/rejected": -1.2908694744110107,
"logps/chosen": -1.7435226440429688,
"logps/rejected": -2.3610475063323975,
"loss": 1.3748,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -17.435226440429688,
"rewards/margins": 6.175250053405762,
"rewards/rejected": -23.610477447509766,
"step": 465
},
{
"epoch": 0.9982631930527722,
"step": 467,
"total_flos": 0.0,
"train_loss": 1.8360214427400707,
"train_runtime": 11486.9698,
"train_samples_per_second": 5.213,
"train_steps_per_second": 0.041
}
],
"logging_steps": 5,
"max_steps": 467,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}