Meta-Llama-3-8B-Instruct-MI-1e-6 / trainer_state.json
tengxiao1
TX
1f70f5a
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9982631930527722,
"eval_steps": 400,
"global_step": 467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01068804275217101,
"grad_norm": 7.6839051021356335,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": 0.030031317844986916,
"logits/rejected": -0.005169146694242954,
"logps/chosen": -0.2549566626548767,
"logps/rejected": -0.26970332860946655,
"loss": 1.2612,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.2549566626548767,
"rewards/margins": 0.014746698550879955,
"rewards/rejected": -0.26970332860946655,
"step": 5
},
{
"epoch": 0.02137608550434202,
"grad_norm": 6.455969735668311,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -0.030284494161605835,
"logits/rejected": -0.02130025625228882,
"logps/chosen": -0.27859872579574585,
"logps/rejected": -0.2721685469150543,
"loss": 1.2728,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.27859872579574585,
"rewards/margins": -0.006430179812014103,
"rewards/rejected": -0.2721685469150543,
"step": 10
},
{
"epoch": 0.03206412825651302,
"grad_norm": 6.3108235358652385,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": 0.05979523807764053,
"logits/rejected": 0.05295870825648308,
"logps/chosen": -0.2907688021659851,
"logps/rejected": -0.305183470249176,
"loss": 1.2799,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.2907688021659851,
"rewards/margins": 0.014414620585739613,
"rewards/rejected": -0.305183470249176,
"step": 15
},
{
"epoch": 0.04275217100868404,
"grad_norm": 6.093982428242224,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": 0.03507867455482483,
"logits/rejected": 0.03253958001732826,
"logps/chosen": -0.261181116104126,
"logps/rejected": -0.27250123023986816,
"loss": 1.2661,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.261181116104126,
"rewards/margins": 0.011320129036903381,
"rewards/rejected": -0.27250123023986816,
"step": 20
},
{
"epoch": 0.053440213760855046,
"grad_norm": 5.3174858162797864,
"learning_rate": 5.319148936170212e-07,
"logits/chosen": 0.025609856471419334,
"logits/rejected": 0.009822970256209373,
"logps/chosen": -0.2632735073566437,
"logps/rejected": -0.28461751341819763,
"loss": 1.2665,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.2632735073566437,
"rewards/margins": 0.02134399674832821,
"rewards/rejected": -0.28461751341819763,
"step": 25
},
{
"epoch": 0.06412825651302605,
"grad_norm": 5.716113428524292,
"learning_rate": 6.382978723404255e-07,
"logits/chosen": -0.0970643013715744,
"logits/rejected": -0.07809214293956757,
"logps/chosen": -0.2763083279132843,
"logps/rejected": -0.2804708778858185,
"loss": 1.2658,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.2763083279132843,
"rewards/margins": 0.0041625602170825005,
"rewards/rejected": -0.2804708778858185,
"step": 30
},
{
"epoch": 0.07481629926519706,
"grad_norm": 4.978401481602216,
"learning_rate": 7.446808510638297e-07,
"logits/chosen": -0.0406271293759346,
"logits/rejected": -0.07336937636137009,
"logps/chosen": -0.29073840379714966,
"logps/rejected": -0.293030709028244,
"loss": 1.2797,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.29073840379714966,
"rewards/margins": 0.0022922889329493046,
"rewards/rejected": -0.293030709028244,
"step": 35
},
{
"epoch": 0.08550434201736808,
"grad_norm": 6.722458816777254,
"learning_rate": 8.51063829787234e-07,
"logits/chosen": -0.12586958706378937,
"logits/rejected": -0.1309432089328766,
"logps/chosen": -0.2860681414604187,
"logps/rejected": -0.29788246750831604,
"loss": 1.2768,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.2860681414604187,
"rewards/margins": 0.01181434839963913,
"rewards/rejected": -0.29788246750831604,
"step": 40
},
{
"epoch": 0.09619238476953908,
"grad_norm": 9.862978906902363,
"learning_rate": 9.574468085106384e-07,
"logits/chosen": -0.0928565114736557,
"logits/rejected": -0.08263979852199554,
"logps/chosen": -0.28130441904067993,
"logps/rejected": -0.306749552488327,
"loss": 1.2561,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.28130441904067993,
"rewards/margins": 0.0254451222717762,
"rewards/rejected": -0.306749552488327,
"step": 45
},
{
"epoch": 0.10688042752171009,
"grad_norm": 6.916554774701635,
"learning_rate": 9.998741174712533e-07,
"logits/chosen": -0.1109582781791687,
"logits/rejected": -0.13332585990428925,
"logps/chosen": -0.2940751612186432,
"logps/rejected": -0.33872976899147034,
"loss": 1.2504,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.2940751612186432,
"rewards/margins": 0.04465465992689133,
"rewards/rejected": -0.33872976899147034,
"step": 50
},
{
"epoch": 0.11756847027388109,
"grad_norm": 6.886280103270556,
"learning_rate": 9.991050648838675e-07,
"logits/chosen": -0.07307116687297821,
"logits/rejected": -0.09735921025276184,
"logps/chosen": -0.29753613471984863,
"logps/rejected": -0.36083537340164185,
"loss": 1.2622,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.29753613471984863,
"rewards/margins": 0.06329929828643799,
"rewards/rejected": -0.36083537340164185,
"step": 55
},
{
"epoch": 0.1282565130260521,
"grad_norm": 6.581763598962665,
"learning_rate": 9.97637968732563e-07,
"logits/chosen": -0.08233795315027237,
"logits/rejected": -0.08182443678379059,
"logps/chosen": -0.3031911253929138,
"logps/rejected": -0.34265732765197754,
"loss": 1.2444,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.3031911253929138,
"rewards/margins": 0.03946622088551521,
"rewards/rejected": -0.34265732765197754,
"step": 60
},
{
"epoch": 0.13894455577822312,
"grad_norm": 9.155624459526475,
"learning_rate": 9.954748808839674e-07,
"logits/chosen": -0.12494422495365143,
"logits/rejected": -0.10039836168289185,
"logps/chosen": -0.31139710545539856,
"logps/rejected": -0.40564531087875366,
"loss": 1.2551,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.31139710545539856,
"rewards/margins": 0.0942481979727745,
"rewards/rejected": -0.40564531087875366,
"step": 65
},
{
"epoch": 0.14963259853039412,
"grad_norm": 6.234129539834448,
"learning_rate": 9.926188266120295e-07,
"logits/chosen": -0.12459670007228851,
"logits/rejected": -0.1347140073776245,
"logps/chosen": -0.2900499701499939,
"logps/rejected": -0.34025058150291443,
"loss": 1.2499,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.2900499701499939,
"rewards/margins": 0.05020058900117874,
"rewards/rejected": -0.34025058150291443,
"step": 70
},
{
"epoch": 0.16032064128256512,
"grad_norm": 6.758490065806782,
"learning_rate": 9.890738003669027e-07,
"logits/chosen": -0.04522291570901871,
"logits/rejected": -0.10806401073932648,
"logps/chosen": -0.36918264627456665,
"logps/rejected": -0.37469878792762756,
"loss": 1.255,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.36918264627456665,
"rewards/margins": 0.005516159348189831,
"rewards/rejected": -0.37469878792762756,
"step": 75
},
{
"epoch": 0.17100868403473615,
"grad_norm": 10.191842225644526,
"learning_rate": 9.848447601883433e-07,
"logits/chosen": -0.08387650549411774,
"logits/rejected": -0.07887469977140427,
"logps/chosen": -0.33048170804977417,
"logps/rejected": -0.4197458326816559,
"loss": 1.2368,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.33048170804977417,
"rewards/margins": 0.0892641618847847,
"rewards/rejected": -0.4197458326816559,
"step": 80
},
{
"epoch": 0.18169672678690715,
"grad_norm": 9.352000749502443,
"learning_rate": 9.799376207714444e-07,
"logits/chosen": -0.13144788146018982,
"logits/rejected": -0.13928399980068207,
"logps/chosen": -0.3154647946357727,
"logps/rejected": -0.38948363065719604,
"loss": 1.2319,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.3154647946357727,
"rewards/margins": 0.07401885092258453,
"rewards/rejected": -0.38948363065719604,
"step": 85
},
{
"epoch": 0.19238476953907815,
"grad_norm": 6.072420839953192,
"learning_rate": 9.743592451943998e-07,
"logits/chosen": -0.1306827962398529,
"logits/rejected": -0.1379804015159607,
"logps/chosen": -0.33726412057876587,
"logps/rejected": -0.3857743442058563,
"loss": 1.2455,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.33726412057876587,
"rewards/margins": 0.04851023852825165,
"rewards/rejected": -0.3857743442058563,
"step": 90
},
{
"epoch": 0.20307281229124916,
"grad_norm": 8.382063384630092,
"learning_rate": 9.681174353198686e-07,
"logits/chosen": -0.12879344820976257,
"logits/rejected": -0.14443747699260712,
"logps/chosen": -0.35025691986083984,
"logps/rejected": -0.40873831510543823,
"loss": 1.2414,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.35025691986083984,
"rewards/margins": 0.058481425046920776,
"rewards/rejected": -0.40873831510543823,
"step": 95
},
{
"epoch": 0.21376085504342018,
"grad_norm": 8.051181065225872,
"learning_rate": 9.612209208833646e-07,
"logits/chosen": -0.0478597953915596,
"logits/rejected": -0.0929640680551529,
"logps/chosen": -0.366682231426239,
"logps/rejected": -0.42526760697364807,
"loss": 1.2454,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.366682231426239,
"rewards/margins": 0.05858539417386055,
"rewards/rejected": -0.42526760697364807,
"step": 100
},
{
"epoch": 0.22444889779559118,
"grad_norm": 8.423595722166493,
"learning_rate": 9.536793472839324e-07,
"logits/chosen": -0.024932144209742546,
"logits/rejected": -0.029018620029091835,
"logps/chosen": -0.3645615577697754,
"logps/rejected": -0.4377099871635437,
"loss": 1.2321,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.3645615577697754,
"rewards/margins": 0.07314839214086533,
"rewards/rejected": -0.4377099871635437,
"step": 105
},
{
"epoch": 0.23513694054776219,
"grad_norm": 8.583118717745613,
"learning_rate": 9.455032620941839e-07,
"logits/chosen": -0.08025398850440979,
"logits/rejected": -0.11082024872303009,
"logps/chosen": -0.37390726804733276,
"logps/rejected": -0.44736775755882263,
"loss": 1.2231,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.37390726804733276,
"rewards/margins": 0.07346051186323166,
"rewards/rejected": -0.44736775755882263,
"step": 110
},
{
"epoch": 0.2458249832999332,
"grad_norm": 8.813654782582985,
"learning_rate": 9.367041003085648e-07,
"logits/chosen": -0.05264202877879143,
"logits/rejected": -0.08138756453990936,
"logps/chosen": -0.3586878478527069,
"logps/rejected": -0.4460626244544983,
"loss": 1.2357,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.3586878478527069,
"rewards/margins": 0.08737480640411377,
"rewards/rejected": -0.4460626244544983,
"step": 115
},
{
"epoch": 0.2565130260521042,
"grad_norm": 18.26289480767173,
"learning_rate": 9.272941683504808e-07,
"logits/chosen": -0.0790601596236229,
"logits/rejected": -0.10665098577737808,
"logps/chosen": -0.3613402545452118,
"logps/rejected": -0.43312540650367737,
"loss": 1.2356,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.3613402545452118,
"rewards/margins": 0.07178511470556259,
"rewards/rejected": -0.43312540650367737,
"step": 120
},
{
"epoch": 0.26720106880427524,
"grad_norm": 8.650831994920619,
"learning_rate": 9.172866268606513e-07,
"logits/chosen": -0.1340969204902649,
"logits/rejected": -0.1336316168308258,
"logps/chosen": -0.3937236964702606,
"logps/rejected": -0.5055503249168396,
"loss": 1.2281,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.3937236964702606,
"rewards/margins": 0.11182668060064316,
"rewards/rejected": -0.5055503249168396,
"step": 125
},
{
"epoch": 0.27788911155644624,
"grad_norm": 7.496527663898527,
"learning_rate": 9.066954722907638e-07,
"logits/chosen": -0.1845657378435135,
"logits/rejected": -0.24125418066978455,
"logps/chosen": -0.38387566804885864,
"logps/rejected": -0.452624648809433,
"loss": 1.2223,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.38387566804885864,
"rewards/margins": 0.06874893605709076,
"rewards/rejected": -0.452624648809433,
"step": 130
},
{
"epoch": 0.28857715430861725,
"grad_norm": 6.7313487554996545,
"learning_rate": 8.955355173281707e-07,
"logits/chosen": -0.1565982848405838,
"logits/rejected": -0.15710704028606415,
"logps/chosen": -0.3900142312049866,
"logps/rejected": -0.5559936761856079,
"loss": 1.2183,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.3900142312049866,
"rewards/margins": 0.16597944498062134,
"rewards/rejected": -0.5559936761856079,
"step": 135
},
{
"epoch": 0.29926519706078825,
"grad_norm": 6.459204367259454,
"learning_rate": 8.838223701790055e-07,
"logits/chosen": -0.19400301575660706,
"logits/rejected": -0.23119351267814636,
"logps/chosen": -0.37699031829833984,
"logps/rejected": -0.4609736502170563,
"loss": 1.2272,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.37699031829833984,
"rewards/margins": 0.0839833989739418,
"rewards/rejected": -0.4609736502170563,
"step": 140
},
{
"epoch": 0.30995323981295925,
"grad_norm": 9.060033916313321,
"learning_rate": 8.71572412738697e-07,
"logits/chosen": -0.07448846101760864,
"logits/rejected": -0.16315212845802307,
"logps/chosen": -0.4142521917819977,
"logps/rejected": -0.5054866075515747,
"loss": 1.2179,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.4142521917819977,
"rewards/margins": 0.09123442322015762,
"rewards/rejected": -0.5054866075515747,
"step": 145
},
{
"epoch": 0.32064128256513025,
"grad_norm": 5.736273343276732,
"learning_rate": 8.588027776804058e-07,
"logits/chosen": -0.10538250207901001,
"logits/rejected": -0.1413673311471939,
"logps/chosen": -0.41122564673423767,
"logps/rejected": -0.5390647053718567,
"loss": 1.2124,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.41122564673423767,
"rewards/margins": 0.12783899903297424,
"rewards/rejected": -0.5390647053718567,
"step": 150
},
{
"epoch": 0.33132932531730125,
"grad_norm": 6.063903579743197,
"learning_rate": 8.455313244934324e-07,
"logits/chosen": -0.11919336020946503,
"logits/rejected": -0.1652189940214157,
"logps/chosen": -0.4010487496852875,
"logps/rejected": -0.5092573165893555,
"loss": 1.2134,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.4010487496852875,
"rewards/margins": 0.1082085520029068,
"rewards/rejected": -0.5092573165893555,
"step": 155
},
{
"epoch": 0.3420173680694723,
"grad_norm": 9.279674751992077,
"learning_rate": 8.317766145051057e-07,
"logits/chosen": -0.08387523144483566,
"logits/rejected": -0.14803090691566467,
"logps/chosen": -0.43099141120910645,
"logps/rejected": -0.5257623791694641,
"loss": 1.2144,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.43099141120910645,
"rewards/margins": 0.09477093070745468,
"rewards/rejected": -0.5257623791694641,
"step": 160
},
{
"epoch": 0.3527054108216433,
"grad_norm": 8.059439047004146,
"learning_rate": 8.175578849210894e-07,
"logits/chosen": -0.16660968959331512,
"logits/rejected": -0.18010763823986053,
"logps/chosen": -0.4078282415866852,
"logps/rejected": -0.5577529668807983,
"loss": 1.201,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4078282415866852,
"rewards/margins": 0.14992472529411316,
"rewards/rejected": -0.5577529668807983,
"step": 165
},
{
"epoch": 0.3633934535738143,
"grad_norm": 8.63786807542291,
"learning_rate": 8.028950219204099e-07,
"logits/chosen": -0.052240192890167236,
"logits/rejected": -0.06318216025829315,
"logps/chosen": -0.4152770936489105,
"logps/rejected": -0.6054114103317261,
"loss": 1.203,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.4152770936489105,
"rewards/margins": 0.19013427197933197,
"rewards/rejected": -0.6054114103317261,
"step": 170
},
{
"epoch": 0.3740814963259853,
"grad_norm": 10.518951318209595,
"learning_rate": 7.878085328428368e-07,
"logits/chosen": -0.11517999321222305,
"logits/rejected": -0.1849624663591385,
"logps/chosen": -0.40625524520874023,
"logps/rejected": -0.5451288819313049,
"loss": 1.212,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.40625524520874023,
"rewards/margins": 0.1388736218214035,
"rewards/rejected": -0.5451288819313049,
"step": 175
},
{
"epoch": 0.3847695390781563,
"grad_norm": 8.417946401352186,
"learning_rate": 7.723195175075135e-07,
"logits/chosen": -0.06935660541057587,
"logits/rejected": -0.1301901787519455,
"logps/chosen": -0.4220534861087799,
"logps/rejected": -0.5573703050613403,
"loss": 1.2083,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.4220534861087799,
"rewards/margins": 0.13531681895256042,
"rewards/rejected": -0.5573703050613403,
"step": 180
},
{
"epoch": 0.3954575818303273,
"grad_norm": 7.995331136842524,
"learning_rate": 7.564496387029531e-07,
"logits/chosen": -0.08758817613124847,
"logits/rejected": -0.14539772272109985,
"logps/chosen": -0.4507700502872467,
"logps/rejected": -0.5500799417495728,
"loss": 1.1962,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.4507700502872467,
"rewards/margins": 0.09930990636348724,
"rewards/rejected": -0.5500799417495728,
"step": 185
},
{
"epoch": 0.4061456245824983,
"grad_norm": 6.542669653809171,
"learning_rate": 7.402210918896689e-07,
"logits/chosen": -0.0885528177022934,
"logits/rejected": -0.1532018929719925,
"logps/chosen": -0.39389023184776306,
"logps/rejected": -0.5325407981872559,
"loss": 1.2053,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.39389023184776306,
"rewards/margins": 0.13865062594413757,
"rewards/rejected": -0.5325407981872559,
"step": 190
},
{
"epoch": 0.4168336673346693,
"grad_norm": 8.450137364374076,
"learning_rate": 7.236565741578162e-07,
"logits/chosen": -0.07352186739444733,
"logits/rejected": -0.15036916732788086,
"logps/chosen": -0.42540302872657776,
"logps/rejected": -0.5537828207015991,
"loss": 1.2084,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.42540302872657776,
"rewards/margins": 0.12837986648082733,
"rewards/rejected": -0.5537828207015991,
"step": 195
},
{
"epoch": 0.42752171008684037,
"grad_norm": 8.694643710416935,
"learning_rate": 7.067792524832603e-07,
"logits/chosen": -0.09027515351772308,
"logits/rejected": -0.1412956416606903,
"logps/chosen": -0.40201014280319214,
"logps/rejected": -0.5238697528839111,
"loss": 1.1957,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.40201014280319214,
"rewards/margins": 0.12185963243246078,
"rewards/rejected": -0.5238697528839111,
"step": 200
},
{
"epoch": 0.43820975283901137,
"grad_norm": 8.952421817833013,
"learning_rate": 6.896127313264642e-07,
"logits/chosen": -0.028889209032058716,
"logits/rejected": -0.11139396578073502,
"logps/chosen": -0.41819238662719727,
"logps/rejected": -0.5573530197143555,
"loss": 1.192,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.41819238662719727,
"rewards/margins": 0.13916069269180298,
"rewards/rejected": -0.5573530197143555,
"step": 205
},
{
"epoch": 0.44889779559118237,
"grad_norm": 8.187027602318263,
"learning_rate": 6.721810196195174e-07,
"logits/chosen": -0.21568699181079865,
"logits/rejected": -0.21867302060127258,
"logps/chosen": -0.43785786628723145,
"logps/rejected": -0.5911111235618591,
"loss": 1.185,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.43785786628723145,
"rewards/margins": 0.1532532274723053,
"rewards/rejected": -0.5911111235618591,
"step": 210
},
{
"epoch": 0.45958583834335337,
"grad_norm": 5.9916116804065584,
"learning_rate": 6.545084971874736e-07,
"logits/chosen": -0.12575657665729523,
"logits/rejected": -0.1455686092376709,
"logps/chosen": -0.4271029531955719,
"logps/rejected": -0.5920487642288208,
"loss": 1.1742,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4271029531955719,
"rewards/margins": 0.1649458110332489,
"rewards/rejected": -0.5920487642288208,
"step": 215
},
{
"epoch": 0.47027388109552437,
"grad_norm": 10.450264869504016,
"learning_rate": 6.3661988065096e-07,
"logits/chosen": -0.18063326179981232,
"logits/rejected": -0.23072651028633118,
"logps/chosen": -0.44927778840065,
"logps/rejected": -0.6041940450668335,
"loss": 1.1923,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.44927778840065,
"rewards/margins": 0.1549161970615387,
"rewards/rejected": -0.6041940450668335,
"step": 220
},
{
"epoch": 0.48096192384769537,
"grad_norm": 9.723089980517251,
"learning_rate": 6.185401888577487e-07,
"logits/chosen": -0.17207232117652893,
"logits/rejected": -0.20382137596607208,
"logps/chosen": -0.4381163716316223,
"logps/rejected": -0.604756236076355,
"loss": 1.192,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.4381163716316223,
"rewards/margins": 0.16663983464241028,
"rewards/rejected": -0.604756236076355,
"step": 225
},
{
"epoch": 0.4916499665998664,
"grad_norm": 9.054770077776112,
"learning_rate": 6.002947078916364e-07,
"logits/chosen": -0.14859004318714142,
"logits/rejected": -0.15977120399475098,
"logps/chosen": -0.4646037220954895,
"logps/rejected": -0.6633389592170715,
"loss": 1.1958,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.4646037220954895,
"rewards/margins": 0.198735311627388,
"rewards/rejected": -0.6633389592170715,
"step": 230
},
{
"epoch": 0.5023380093520374,
"grad_norm": 17.895526950620496,
"learning_rate": 5.819089557075688e-07,
"logits/chosen": -0.12733140587806702,
"logits/rejected": -0.18139609694480896,
"logps/chosen": -0.4529820382595062,
"logps/rejected": -0.592745304107666,
"loss": 1.1902,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.4529820382595062,
"rewards/margins": 0.1397632509469986,
"rewards/rejected": -0.592745304107666,
"step": 235
},
{
"epoch": 0.5130260521042084,
"grad_norm": 9.140622097956182,
"learning_rate": 5.634086464424742e-07,
"logits/chosen": -0.12596455216407776,
"logits/rejected": -0.18648605048656464,
"logps/chosen": -0.4592694640159607,
"logps/rejected": -0.6695916056632996,
"loss": 1.1965,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.4592694640159607,
"rewards/margins": 0.21032221615314484,
"rewards/rejected": -0.6695916056632996,
"step": 240
},
{
"epoch": 0.5237140948563794,
"grad_norm": 7.85051968813952,
"learning_rate": 5.448196544517167e-07,
"logits/chosen": -0.15113191306591034,
"logits/rejected": -0.20874838531017303,
"logps/chosen": -0.448641836643219,
"logps/rejected": -0.6651071906089783,
"loss": 1.2037,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.448641836643219,
"rewards/margins": 0.21646539866924286,
"rewards/rejected": -0.6651071906089783,
"step": 245
},
{
"epoch": 0.5344021376085505,
"grad_norm": 8.655989549308861,
"learning_rate": 5.26167978121472e-07,
"logits/chosen": -0.10786600410938263,
"logits/rejected": -0.13217635452747345,
"logps/chosen": -0.42935776710510254,
"logps/rejected": -0.6120445132255554,
"loss": 1.1759,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.42935776710510254,
"rewards/margins": 0.1826867312192917,
"rewards/rejected": -0.6120445132255554,
"step": 250
},
{
"epoch": 0.5450901803607214,
"grad_norm": 9.048614473634702,
"learning_rate": 5.074797035076318e-07,
"logits/chosen": -0.18685856461524963,
"logits/rejected": -0.2535242736339569,
"logps/chosen": -0.441723495721817,
"logps/rejected": -0.6506599187850952,
"loss": 1.1816,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.441723495721817,
"rewards/margins": 0.2089364230632782,
"rewards/rejected": -0.6506599187850952,
"step": 255
},
{
"epoch": 0.5557782231128925,
"grad_norm": 7.6389154059915105,
"learning_rate": 4.887809678520975e-07,
"logits/chosen": -0.19917742908000946,
"logits/rejected": -0.27246275544166565,
"logps/chosen": -0.46937423944473267,
"logps/rejected": -0.6870771646499634,
"loss": 1.1951,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.46937423944473267,
"rewards/margins": 0.21770286560058594,
"rewards/rejected": -0.6870771646499634,
"step": 260
},
{
"epoch": 0.5664662658650634,
"grad_norm": 9.10073399455548,
"learning_rate": 4.700979230274829e-07,
"logits/chosen": -0.11132203042507172,
"logits/rejected": -0.09603560715913773,
"logps/chosen": -0.44644594192504883,
"logps/rejected": -0.6944222450256348,
"loss": 1.1747,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.44644594192504883,
"rewards/margins": 0.24797634780406952,
"rewards/rejected": -0.6944222450256348,
"step": 265
},
{
"epoch": 0.5771543086172345,
"grad_norm": 8.916003107478165,
"learning_rate": 4.514566989613559e-07,
"logits/chosen": -0.15380506217479706,
"logits/rejected": -0.19326263666152954,
"logps/chosen": -0.446754515171051,
"logps/rejected": -0.6446987390518188,
"loss": 1.1864,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.446754515171051,
"rewards/margins": 0.19794420897960663,
"rewards/rejected": -0.6446987390518188,
"step": 270
},
{
"epoch": 0.5878423513694054,
"grad_norm": 10.314754743141027,
"learning_rate": 4.328833670911724e-07,
"logits/chosen": -0.15855953097343445,
"logits/rejected": -0.20561406016349792,
"logps/chosen": -0.4342300295829773,
"logps/rejected": -0.6545408964157104,
"loss": 1.1667,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.4342300295829773,
"rewards/margins": 0.22031080722808838,
"rewards/rejected": -0.6545408964157104,
"step": 275
},
{
"epoch": 0.5985303941215765,
"grad_norm": 9.8305630967928,
"learning_rate": 4.144039039010124e-07,
"logits/chosen": -0.12788251042366028,
"logits/rejected": -0.127780020236969,
"logps/chosen": -0.43477168679237366,
"logps/rejected": -0.7111866474151611,
"loss": 1.1802,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.43477168679237366,
"rewards/margins": 0.27641505002975464,
"rewards/rejected": -0.7111866474151611,
"step": 280
},
{
"epoch": 0.6092184368737475,
"grad_norm": 8.860790080933853,
"learning_rate": 3.960441545911204e-07,
"logits/chosen": -0.161110520362854,
"logits/rejected": -0.17115116119384766,
"logps/chosen": -0.42666491866111755,
"logps/rejected": -0.6175069808959961,
"loss": 1.1752,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.42666491866111755,
"rewards/margins": 0.19084201753139496,
"rewards/rejected": -0.6175069808959961,
"step": 285
},
{
"epoch": 0.6199064796259185,
"grad_norm": 7.06233886841465,
"learning_rate": 3.778297969310529e-07,
"logits/chosen": -0.1376260668039322,
"logits/rejected": -0.16905562579631805,
"logps/chosen": -0.4378681182861328,
"logps/rejected": -0.6595510840415955,
"loss": 1.1705,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.4378681182861328,
"rewards/margins": 0.22168295085430145,
"rewards/rejected": -0.6595510840415955,
"step": 290
},
{
"epoch": 0.6305945223780896,
"grad_norm": 10.466639287529468,
"learning_rate": 3.5978630534699865e-07,
"logits/chosen": -0.2025509625673294,
"logits/rejected": -0.24337442219257355,
"logps/chosen": -0.42942291498184204,
"logps/rejected": -0.618620753288269,
"loss": 1.1967,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.42942291498184204,
"rewards/margins": 0.18919780850410461,
"rewards/rejected": -0.618620753288269,
"step": 295
},
{
"epoch": 0.6412825651302605,
"grad_norm": 8.130739148831358,
"learning_rate": 3.4193891529348795e-07,
"logits/chosen": -0.16753128170967102,
"logits/rejected": -0.21636962890625,
"logps/chosen": -0.4428304135799408,
"logps/rejected": -0.6359456777572632,
"loss": 1.1789,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4428304135799408,
"rewards/margins": 0.19311529397964478,
"rewards/rejected": -0.6359456777572632,
"step": 300
},
{
"epoch": 0.6519706078824316,
"grad_norm": 7.273224925160923,
"learning_rate": 3.243125879593286e-07,
"logits/chosen": -0.15392135083675385,
"logits/rejected": -0.21642914414405823,
"logps/chosen": -0.47176113724708557,
"logps/rejected": -0.6521760821342468,
"loss": 1.1688,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.47176113724708557,
"rewards/margins": 0.18041494488716125,
"rewards/rejected": -0.6521760821342468,
"step": 305
},
{
"epoch": 0.6626586506346025,
"grad_norm": 7.301685407436071,
"learning_rate": 3.069319753571269e-07,
"logits/chosen": -0.08738047629594803,
"logits/rejected": -0.17941518127918243,
"logps/chosen": -0.4444531500339508,
"logps/rejected": -0.6384583711624146,
"loss": 1.1782,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.4444531500339508,
"rewards/margins": 0.19400522112846375,
"rewards/rejected": -0.6384583711624146,
"step": 310
},
{
"epoch": 0.6733466933867736,
"grad_norm": 8.792927929961811,
"learning_rate": 2.898213858452173e-07,
"logits/chosen": -0.12821796536445618,
"logits/rejected": -0.17637769877910614,
"logps/chosen": -0.430549293756485,
"logps/rejected": -0.673394501209259,
"loss": 1.1838,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.430549293756485,
"rewards/margins": 0.24284520745277405,
"rewards/rejected": -0.673394501209259,
"step": 315
},
{
"epoch": 0.6840347361389446,
"grad_norm": 9.135320352888758,
"learning_rate": 2.730047501302266e-07,
"logits/chosen": -0.1663762778043747,
"logits/rejected": -0.1760939061641693,
"logps/chosen": -0.44656792283058167,
"logps/rejected": -0.696045994758606,
"loss": 1.1597,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -0.44656792283058167,
"rewards/margins": 0.2494780570268631,
"rewards/rejected": -0.696045994758606,
"step": 320
},
{
"epoch": 0.6947227788911156,
"grad_norm": 7.683969138295821,
"learning_rate": 2.5650558779781635e-07,
"logits/chosen": -0.10967272520065308,
"logits/rejected": -0.16051502525806427,
"logps/chosen": -0.4491657614707947,
"logps/rejected": -0.6456891298294067,
"loss": 1.1768,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4491657614707947,
"rewards/margins": 0.1965232938528061,
"rewards/rejected": -0.6456891298294067,
"step": 325
},
{
"epoch": 0.7054108216432866,
"grad_norm": 7.237560221559409,
"learning_rate": 2.403469744184154e-07,
"logits/chosen": -0.11157947778701782,
"logits/rejected": -0.1505707949399948,
"logps/chosen": -0.4335803985595703,
"logps/rejected": -0.6525880694389343,
"loss": 1.1697,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.4335803985595703,
"rewards/margins": 0.2190077304840088,
"rewards/rejected": -0.6525880694389343,
"step": 330
},
{
"epoch": 0.7160988643954576,
"grad_norm": 10.892277516023302,
"learning_rate": 2.2455150927394878e-07,
"logits/chosen": -0.07384945452213287,
"logits/rejected": -0.11905944347381592,
"logps/chosen": -0.44785404205322266,
"logps/rejected": -0.6679562926292419,
"loss": 1.165,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.44785404205322266,
"rewards/margins": 0.22010228037834167,
"rewards/rejected": -0.6679562926292419,
"step": 335
},
{
"epoch": 0.7267869071476286,
"grad_norm": 10.223464451109537,
"learning_rate": 2.0914128375069722e-07,
"logits/chosen": -0.1539086103439331,
"logits/rejected": -0.17600053548812866,
"logps/chosen": -0.4725138545036316,
"logps/rejected": -0.6988605260848999,
"loss": 1.1726,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4725138545036316,
"rewards/margins": 0.2263466864824295,
"rewards/rejected": -0.6988605260848999,
"step": 340
},
{
"epoch": 0.7374749498997996,
"grad_norm": 9.502978098916548,
"learning_rate": 1.9413785044249676e-07,
"logits/chosen": -0.21818223595619202,
"logits/rejected": -0.291820764541626,
"logps/chosen": -0.4557631015777588,
"logps/rejected": -0.6338008642196655,
"loss": 1.1847,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4557631015777588,
"rewards/margins": 0.17803780734539032,
"rewards/rejected": -0.6338008642196655,
"step": 345
},
{
"epoch": 0.7481629926519706,
"grad_norm": 8.906197274685542,
"learning_rate": 1.7956219300748792e-07,
"logits/chosen": -0.16545064747333527,
"logits/rejected": -0.2143837958574295,
"logps/chosen": -0.485832542181015,
"logps/rejected": -0.6850191950798035,
"loss": 1.1775,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.485832542181015,
"rewards/margins": 0.19918662309646606,
"rewards/rejected": -0.6850191950798035,
"step": 350
},
{
"epoch": 0.7588510354041417,
"grad_norm": 9.223968907693207,
"learning_rate": 1.6543469682057104e-07,
"logits/chosen": -0.15154987573623657,
"logits/rejected": -0.1329428106546402,
"logps/chosen": -0.43399348855018616,
"logps/rejected": -0.701296865940094,
"loss": 1.1559,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -0.43399348855018616,
"rewards/margins": 0.26730337738990784,
"rewards/rejected": -0.701296865940094,
"step": 355
},
{
"epoch": 0.7695390781563126,
"grad_norm": 8.817413940297333,
"learning_rate": 1.5177512046261666e-07,
"logits/chosen": -0.1650848090648651,
"logits/rejected": -0.18241888284683228,
"logps/chosen": -0.4807559847831726,
"logps/rejected": -0.7194213271141052,
"loss": 1.1709,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.4807559847831726,
"rewards/margins": 0.2386653870344162,
"rewards/rejected": -0.7194213271141052,
"step": 360
},
{
"epoch": 0.7802271209084837,
"grad_norm": 9.841178407980468,
"learning_rate": 1.3860256808630427e-07,
"logits/chosen": -0.11777649074792862,
"logits/rejected": -0.1842351108789444,
"logps/chosen": -0.44159936904907227,
"logps/rejected": -0.6455782651901245,
"loss": 1.1759,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.44159936904907227,
"rewards/margins": 0.20397885143756866,
"rewards/rejected": -0.6455782651901245,
"step": 365
},
{
"epoch": 0.7909151636606546,
"grad_norm": 7.821082078374487,
"learning_rate": 1.2593546269723647e-07,
"logits/chosen": -0.21091553568840027,
"logits/rejected": -0.23493704199790955,
"logps/chosen": -0.47468656301498413,
"logps/rejected": -0.6395518779754639,
"loss": 1.1857,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.47468656301498413,
"rewards/margins": 0.16486527025699615,
"rewards/rejected": -0.6395518779754639,
"step": 370
},
{
"epoch": 0.8016032064128257,
"grad_norm": 8.045168365501775,
"learning_rate": 1.1379152038770029e-07,
"logits/chosen": -0.13924507796764374,
"logits/rejected": -0.159819096326828,
"logps/chosen": -0.4886155128479004,
"logps/rejected": -0.7014551758766174,
"loss": 1.1647,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.4886155128479004,
"rewards/margins": 0.21283963322639465,
"rewards/rejected": -0.7014551758766174,
"step": 375
},
{
"epoch": 0.8122912491649966,
"grad_norm": 6.9748624620101465,
"learning_rate": 1.0218772555910954e-07,
"logits/chosen": -0.11520252376794815,
"logits/rejected": -0.1667747050523758,
"logps/chosen": -0.4690398573875427,
"logps/rejected": -0.6327452659606934,
"loss": 1.174,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.4690398573875427,
"rewards/margins": 0.16370537877082825,
"rewards/rejected": -0.6327452659606934,
"step": 380
},
{
"epoch": 0.8229792919171677,
"grad_norm": 9.914022835602436,
"learning_rate": 9.114030716778432e-08,
"logits/chosen": -0.10868623107671738,
"logits/rejected": -0.08764289319515228,
"logps/chosen": -0.46836796402931213,
"logps/rejected": -0.7122930288314819,
"loss": 1.1774,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.46836796402931213,
"rewards/margins": 0.2439250648021698,
"rewards/rejected": -0.7122930288314819,
"step": 385
},
{
"epoch": 0.8336673346693386,
"grad_norm": 8.145036242622055,
"learning_rate": 8.066471602728803e-08,
"logits/chosen": -0.06981998682022095,
"logits/rejected": -0.1801852583885193,
"logps/chosen": -0.448079913854599,
"logps/rejected": -0.650068461894989,
"loss": 1.1708,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.448079913854599,
"rewards/margins": 0.20198853313922882,
"rewards/rejected": -0.650068461894989,
"step": 390
},
{
"epoch": 0.8443553774215097,
"grad_norm": 9.222740510393672,
"learning_rate": 7.077560319906694e-08,
"logits/chosen": -0.13624027371406555,
"logits/rejected": -0.1553780883550644,
"logps/chosen": -0.4544126093387604,
"logps/rejected": -0.6581717133522034,
"loss": 1.157,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.4544126093387604,
"rewards/margins": 0.20375914871692657,
"rewards/rejected": -0.6581717133522034,
"step": 395
},
{
"epoch": 0.8550434201736807,
"grad_norm": 7.23560205761774,
"learning_rate": 6.148679950161672e-08,
"logits/chosen": -0.1308642476797104,
"logits/rejected": -0.12576356530189514,
"logps/chosen": -0.4730139374732971,
"logps/rejected": -0.6597884893417358,
"loss": 1.1796,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.4730139374732971,
"rewards/margins": 0.18677455186843872,
"rewards/rejected": -0.6597884893417358,
"step": 400
},
{
"epoch": 0.8550434201736807,
"eval_logits/chosen": 0.07805733382701874,
"eval_logits/rejected": 0.055379413068294525,
"eval_logps/chosen": -0.46297329664230347,
"eval_logps/rejected": -0.67759770154953,
"eval_loss": 1.1742559671401978,
"eval_rewards/accuracies": 0.7682926654815674,
"eval_rewards/chosen": -0.46297329664230347,
"eval_rewards/margins": 0.21462443470954895,
"eval_rewards/rejected": -0.67759770154953,
"eval_runtime": 422.9,
"eval_samples_per_second": 4.637,
"eval_steps_per_second": 0.291,
"step": 400
},
{
"epoch": 0.8657314629258517,
"grad_norm": 14.062888091631285,
"learning_rate": 5.2811296166831666e-08,
"logits/chosen": -0.14197275042533875,
"logits/rejected": -0.19198641180992126,
"logps/chosen": -0.4838024973869324,
"logps/rejected": -0.6774718165397644,
"loss": 1.1821,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.4838024973869324,
"rewards/margins": 0.19366928935050964,
"rewards/rejected": -0.6774718165397644,
"step": 405
},
{
"epoch": 0.8764195056780227,
"grad_norm": 9.708416047557408,
"learning_rate": 4.4761226670592066e-08,
"logits/chosen": -0.19223374128341675,
"logits/rejected": -0.2349742203950882,
"logps/chosen": -0.4497530460357666,
"logps/rejected": -0.6498802900314331,
"loss": 1.1835,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.4497530460357666,
"rewards/margins": 0.20012721419334412,
"rewards/rejected": -0.6498802900314331,
"step": 410
},
{
"epoch": 0.8871075484301937,
"grad_norm": 10.370830255142504,
"learning_rate": 3.734784976300165e-08,
"logits/chosen": -0.0883495882153511,
"logits/rejected": -0.15084786713123322,
"logps/chosen": -0.4854651093482971,
"logps/rejected": -0.7252976298332214,
"loss": 1.1824,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.4854651093482971,
"rewards/margins": 0.23983249068260193,
"rewards/rejected": -0.7252976298332214,
"step": 415
},
{
"epoch": 0.8977955911823647,
"grad_norm": 10.050934008810426,
"learning_rate": 3.058153372200695e-08,
"logits/chosen": -0.11994824558496475,
"logits/rejected": -0.1958351880311966,
"logps/chosen": -0.4402541220188141,
"logps/rejected": -0.6553457379341125,
"loss": 1.169,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.4402541220188141,
"rewards/margins": 0.21509161591529846,
"rewards/rejected": -0.6553457379341125,
"step": 420
},
{
"epoch": 0.9084836339345357,
"grad_norm": 10.234554818131295,
"learning_rate": 2.4471741852423233e-08,
"logits/chosen": -0.023924821987748146,
"logits/rejected": -0.06178613379597664,
"logps/chosen": -0.4181637763977051,
"logps/rejected": -0.6148607134819031,
"loss": 1.1693,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.4181637763977051,
"rewards/margins": 0.1966969072818756,
"rewards/rejected": -0.6148607134819031,
"step": 425
},
{
"epoch": 0.9191716766867067,
"grad_norm": 7.946863491070272,
"learning_rate": 1.9027019250647036e-08,
"logits/chosen": -0.06870210915803909,
"logits/rejected": -0.1558951586484909,
"logps/chosen": -0.4686119556427002,
"logps/rejected": -0.6619764566421509,
"loss": 1.1734,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.4686119556427002,
"rewards/margins": 0.19336441159248352,
"rewards/rejected": -0.6619764566421509,
"step": 430
},
{
"epoch": 0.9298597194388778,
"grad_norm": 7.127358424532503,
"learning_rate": 1.4254980853566246e-08,
"logits/chosen": -0.16497072577476501,
"logits/rejected": -0.2038412094116211,
"logps/chosen": -0.4426957666873932,
"logps/rejected": -0.6900163888931274,
"loss": 1.1724,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.4426957666873932,
"rewards/margins": 0.2473207414150238,
"rewards/rejected": -0.6900163888931274,
"step": 435
},
{
"epoch": 0.9405477621910487,
"grad_norm": 8.700791903961722,
"learning_rate": 1.016230078838226e-08,
"logits/chosen": -0.16181275248527527,
"logits/rejected": -0.19335032999515533,
"logps/chosen": -0.4476253092288971,
"logps/rejected": -0.6875879168510437,
"loss": 1.1756,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.4476253092288971,
"rewards/margins": 0.2399626225233078,
"rewards/rejected": -0.6875879168510437,
"step": 440
},
{
"epoch": 0.9512358049432198,
"grad_norm": 7.814249452801736,
"learning_rate": 6.754703038239329e-09,
"logits/chosen": -0.1357225775718689,
"logits/rejected": -0.13643740117549896,
"logps/chosen": -0.44572919607162476,
"logps/rejected": -0.6457980871200562,
"loss": 1.168,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.44572919607162476,
"rewards/margins": 0.20006892085075378,
"rewards/rejected": -0.6457980871200562,
"step": 445
},
{
"epoch": 0.9619238476953907,
"grad_norm": 7.707995425556612,
"learning_rate": 4.036953436716895e-09,
"logits/chosen": -0.09782582521438599,
"logits/rejected": -0.12389625608921051,
"logps/chosen": -0.5031002759933472,
"logps/rejected": -0.7217256426811218,
"loss": 1.1607,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.5031002759933472,
"rewards/margins": 0.21862535178661346,
"rewards/rejected": -0.7217256426811218,
"step": 450
},
{
"epoch": 0.9726118904475618,
"grad_norm": 8.180782374754285,
"learning_rate": 2.0128530023804656e-09,
"logits/chosen": -0.049927808344364166,
"logits/rejected": -0.12839142978191376,
"logps/chosen": -0.4278712272644043,
"logps/rejected": -0.662420928478241,
"loss": 1.1491,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4278712272644043,
"rewards/margins": 0.23454967141151428,
"rewards/rejected": -0.662420928478241,
"step": 455
},
{
"epoch": 0.9832999331997327,
"grad_norm": 9.706979128792979,
"learning_rate": 6.852326227130833e-10,
"logits/chosen": 0.007074593100696802,
"logits/rejected": -0.09796188771724701,
"logps/chosen": -0.4365665316581726,
"logps/rejected": -0.6665615439414978,
"loss": 1.183,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.4365665316581726,
"rewards/margins": 0.2299949824810028,
"rewards/rejected": -0.6665615439414978,
"step": 460
},
{
"epoch": 0.9939879759519038,
"grad_norm": 8.36305887438443,
"learning_rate": 5.594909486328348e-11,
"logits/chosen": -0.06538979709148407,
"logits/rejected": -0.11664478480815887,
"logps/chosen": -0.4604206681251526,
"logps/rejected": -0.6352392435073853,
"loss": 1.1796,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.4604206681251526,
"rewards/margins": 0.17481860518455505,
"rewards/rejected": -0.6352392435073853,
"step": 465
},
{
"epoch": 0.9982631930527722,
"step": 467,
"total_flos": 0.0,
"train_loss": 1.2022870587587866,
"train_runtime": 20947.8363,
"train_samples_per_second": 2.858,
"train_steps_per_second": 0.022
}
],
"logging_steps": 5,
"max_steps": 467,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}