lambda-llama-3-8b-dpo-test / trainer_state.json
tanliboy's picture
Model save
b7cb8bf verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984301412872841,
"eval_steps": 100,
"global_step": 477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0020931449502878076,
"grad_norm": 4.875121866371553,
"learning_rate": 4.166666666666666e-09,
"logits/chosen": -2.238138437271118,
"logits/rejected": -2.554456949234009,
"logps/chosen": -443.7523193359375,
"logps/rejected": -491.8927001953125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.020931449502878074,
"grad_norm": 5.553929970393955,
"learning_rate": 4.166666666666667e-08,
"logits/chosen": -2.4126930236816406,
"logits/rejected": -2.5005030632019043,
"logps/chosen": -418.43328857421875,
"logps/rejected": -405.0360107421875,
"loss": 0.6929,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.0017023859545588493,
"rewards/margins": 0.00048581857117824256,
"rewards/rejected": 0.0012165673542767763,
"step": 10
},
{
"epoch": 0.04186289900575615,
"grad_norm": 4.513029874801273,
"learning_rate": 8.333333333333334e-08,
"logits/chosen": -2.208683490753174,
"logits/rejected": -2.485910415649414,
"logps/chosen": -428.45208740234375,
"logps/rejected": -408.13763427734375,
"loss": 0.6933,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.0008482746779918671,
"rewards/margins": -0.00037219192017801106,
"rewards/rejected": 0.0012204666854813695,
"step": 20
},
{
"epoch": 0.06279434850863422,
"grad_norm": 4.637552468831084,
"learning_rate": 1.25e-07,
"logits/chosen": -2.224863290786743,
"logits/rejected": -2.4407901763916016,
"logps/chosen": -398.6038818359375,
"logps/rejected": -367.05999755859375,
"loss": 0.6924,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.0041518621146678925,
"rewards/margins": 0.0011339159682393074,
"rewards/rejected": -0.005285778548568487,
"step": 30
},
{
"epoch": 0.0837257980115123,
"grad_norm": 4.657136939144448,
"learning_rate": 1.6666666666666668e-07,
"logits/chosen": -2.3235936164855957,
"logits/rejected": -2.4915928840637207,
"logps/chosen": -372.97442626953125,
"logps/rejected": -390.05841064453125,
"loss": 0.6899,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.019573217257857323,
"rewards/margins": 0.007190874312072992,
"rewards/rejected": -0.026764091104269028,
"step": 40
},
{
"epoch": 0.10465724751439037,
"grad_norm": 4.947790369246717,
"learning_rate": 1.9998927475076103e-07,
"logits/chosen": -2.1541531085968018,
"logits/rejected": -2.355862855911255,
"logps/chosen": -408.7329406738281,
"logps/rejected": -406.50347900390625,
"loss": 0.6855,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.04146841913461685,
"rewards/margins": 0.02013658545911312,
"rewards/rejected": -0.061604999005794525,
"step": 50
},
{
"epoch": 0.12558869701726844,
"grad_norm": 6.135445605235113,
"learning_rate": 1.9961413253717213e-07,
"logits/chosen": -2.120229482650757,
"logits/rejected": -2.287370204925537,
"logps/chosen": -376.740234375,
"logps/rejected": -386.8778381347656,
"loss": 0.678,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.08536554872989655,
"rewards/margins": 0.03690432757139206,
"rewards/rejected": -0.12226986885070801,
"step": 60
},
{
"epoch": 0.14652014652014653,
"grad_norm": 5.2300665585071835,
"learning_rate": 1.9870502626379125e-07,
"logits/chosen": -2.208547830581665,
"logits/rejected": -2.316659927368164,
"logps/chosen": -425.2916564941406,
"logps/rejected": -429.31463623046875,
"loss": 0.6673,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.14128030836582184,
"rewards/margins": 0.05471445247530937,
"rewards/rejected": -0.1959947645664215,
"step": 70
},
{
"epoch": 0.1674515960230246,
"grad_norm": 6.361729619349137,
"learning_rate": 1.9726682903510838e-07,
"logits/chosen": -1.8886642456054688,
"logits/rejected": -2.2390127182006836,
"logps/chosen": -470.6441955566406,
"logps/rejected": -419.4126892089844,
"loss": 0.6583,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2689892053604126,
"rewards/margins": 0.07578183710575104,
"rewards/rejected": -0.34477105736732483,
"step": 80
},
{
"epoch": 0.18838304552590268,
"grad_norm": 7.250967252041406,
"learning_rate": 1.9530725005474194e-07,
"logits/chosen": -2.3355867862701416,
"logits/rejected": -2.404792070388794,
"logps/chosen": -411.76806640625,
"logps/rejected": -441.7333068847656,
"loss": 0.6355,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.42172950506210327,
"rewards/margins": 0.12971071898937225,
"rewards/rejected": -0.5514402985572815,
"step": 90
},
{
"epoch": 0.20931449502878074,
"grad_norm": 7.1454110672964335,
"learning_rate": 1.9283679330160724e-07,
"logits/chosen": -2.2639448642730713,
"logits/rejected": -2.5537800788879395,
"logps/chosen": -477.0587463378906,
"logps/rejected": -489.705810546875,
"loss": 0.6351,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.6349204778671265,
"rewards/margins": 0.18245458602905273,
"rewards/rejected": -0.8173751831054688,
"step": 100
},
{
"epoch": 0.20931449502878074,
"eval_logits/chosen": -2.2922377586364746,
"eval_logits/rejected": -2.4565351009368896,
"eval_logps/chosen": -472.2982177734375,
"eval_logps/rejected": -487.7696533203125,
"eval_loss": 0.6359348893165588,
"eval_rewards/accuracies": 0.6746031641960144,
"eval_rewards/chosen": -0.675361156463623,
"eval_rewards/margins": 0.2425757199525833,
"eval_rewards/rejected": -0.9179368615150452,
"eval_runtime": 88.9262,
"eval_samples_per_second": 22.491,
"eval_steps_per_second": 0.708,
"step": 100
},
{
"epoch": 0.2302459445316588,
"grad_norm": 9.360622478279684,
"learning_rate": 1.898687012251826e-07,
"logits/chosen": -2.217447280883789,
"logits/rejected": -2.3863320350646973,
"logps/chosen": -481.96990966796875,
"logps/rejected": -499.48345947265625,
"loss": 0.6311,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.7452836036682129,
"rewards/margins": 0.209157794713974,
"rewards/rejected": -0.9544414281845093,
"step": 110
},
{
"epoch": 0.25117739403453687,
"grad_norm": 7.953755036427896,
"learning_rate": 1.8641888376168482e-07,
"logits/chosen": -2.2092318534851074,
"logits/rejected": -2.2929816246032715,
"logps/chosen": -454.405517578125,
"logps/rejected": -497.1351623535156,
"loss": 0.6209,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.7448408007621765,
"rewards/margins": 0.29463425278663635,
"rewards/rejected": -1.0394752025604248,
"step": 120
},
{
"epoch": 0.272108843537415,
"grad_norm": 8.821105331401093,
"learning_rate": 1.8250583305165094e-07,
"logits/chosen": -2.2061495780944824,
"logits/rejected": -2.3711869716644287,
"logps/chosen": -472.7056579589844,
"logps/rejected": -487.33880615234375,
"loss": 0.6204,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.7287603616714478,
"rewards/margins": 0.2083979845046997,
"rewards/rejected": -0.9371584057807922,
"step": 130
},
{
"epoch": 0.29304029304029305,
"grad_norm": 9.167325969849378,
"learning_rate": 1.78150524316067e-07,
"logits/chosen": -2.2468433380126953,
"logits/rejected": -2.466036319732666,
"logps/chosen": -501.697021484375,
"logps/rejected": -497.5772399902344,
"loss": 0.6195,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.7497612237930298,
"rewards/margins": 0.30543631315231323,
"rewards/rejected": -1.0551974773406982,
"step": 140
},
{
"epoch": 0.3139717425431711,
"grad_norm": 10.828055616866019,
"learning_rate": 1.7337630342238038e-07,
"logits/chosen": -2.163837432861328,
"logits/rejected": -2.328864574432373,
"logps/chosen": -474.3462829589844,
"logps/rejected": -480.0904846191406,
"loss": 0.621,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7979869246482849,
"rewards/margins": 0.22926858067512512,
"rewards/rejected": -1.0272555351257324,
"step": 150
},
{
"epoch": 0.3349031920460492,
"grad_norm": 9.907119624068729,
"learning_rate": 1.682087617430782e-07,
"logits/chosen": -2.1256282329559326,
"logits/rejected": -2.4207208156585693,
"logps/chosen": -476.00933837890625,
"logps/rejected": -491.25799560546875,
"loss": 0.6148,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.8471584320068359,
"rewards/margins": 0.2906045913696289,
"rewards/rejected": -1.1377630233764648,
"step": 160
},
{
"epoch": 0.35583464154892724,
"grad_norm": 10.130673374633192,
"learning_rate": 1.6267559897763025e-07,
"logits/chosen": -2.240748405456543,
"logits/rejected": -2.3730461597442627,
"logps/chosen": -466.5884704589844,
"logps/rejected": -470.2240295410156,
"loss": 0.6136,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.8948806524276733,
"rewards/margins": 0.24292059242725372,
"rewards/rejected": -1.137801170349121,
"step": 170
},
{
"epoch": 0.37676609105180536,
"grad_norm": 12.664244024162585,
"learning_rate": 1.5680647467311557e-07,
"logits/chosen": -2.3886361122131348,
"logits/rejected": -2.48551344871521,
"logps/chosen": -466.68115234375,
"logps/rejected": -481.260498046875,
"loss": 0.589,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8065212965011597,
"rewards/margins": 0.28530603647232056,
"rewards/rejected": -1.091827392578125,
"step": 180
},
{
"epoch": 0.3976975405546834,
"grad_norm": 15.413041204374277,
"learning_rate": 1.506328492394303e-07,
"logits/chosen": -2.425926685333252,
"logits/rejected": -2.436657190322876,
"logps/chosen": -480.2686462402344,
"logps/rejected": -514.1541137695312,
"loss": 0.6247,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.0268957614898682,
"rewards/margins": 0.26106053590774536,
"rewards/rejected": -1.2879562377929688,
"step": 190
},
{
"epoch": 0.4186289900575615,
"grad_norm": 16.30024056431674,
"learning_rate": 1.4418781531128634e-07,
"logits/chosen": -2.3286993503570557,
"logits/rejected": -2.387202024459839,
"logps/chosen": -454.547119140625,
"logps/rejected": -511.773681640625,
"loss": 0.6101,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.8713696599006653,
"rewards/margins": 0.2568342685699463,
"rewards/rejected": -1.1282037496566772,
"step": 200
},
{
"epoch": 0.4186289900575615,
"eval_logits/chosen": -2.293304443359375,
"eval_logits/rejected": -2.447746753692627,
"eval_logps/chosen": -484.72442626953125,
"eval_logps/rejected": -515.6393432617188,
"eval_loss": 0.5989560484886169,
"eval_rewards/accuracies": 0.7142857313156128,
"eval_rewards/chosen": -0.7996230125427246,
"eval_rewards/margins": 0.39701077342033386,
"eval_rewards/rejected": -1.1966338157653809,
"eval_runtime": 88.7991,
"eval_samples_per_second": 22.523,
"eval_steps_per_second": 0.709,
"step": 200
},
{
"epoch": 0.43956043956043955,
"grad_norm": 12.590959189684769,
"learning_rate": 1.375059203609562e-07,
"logits/chosen": -2.251105785369873,
"logits/rejected": -2.49545955657959,
"logps/chosen": -514.7989501953125,
"logps/rejected": -508.8777770996094,
"loss": 0.6036,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9383622407913208,
"rewards/margins": 0.3089975416660309,
"rewards/rejected": -1.2473597526550293,
"step": 210
},
{
"epoch": 0.4604918890633176,
"grad_norm": 32.27211919256004,
"learning_rate": 1.306229815126159e-07,
"logits/chosen": -2.374002456665039,
"logits/rejected": -2.5104002952575684,
"logps/chosen": -453.17889404296875,
"logps/rejected": -502.31829833984375,
"loss": 0.5905,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.0016330480575562,
"rewards/margins": 0.3531147539615631,
"rewards/rejected": -1.3547478914260864,
"step": 220
},
{
"epoch": 0.48142333856619574,
"grad_norm": 11.074374701972996,
"learning_rate": 1.2357589355094274e-07,
"logits/chosen": -2.240893602371216,
"logits/rejected": -2.4365756511688232,
"logps/chosen": -464.9483947753906,
"logps/rejected": -497.55950927734375,
"loss": 0.6032,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8673335909843445,
"rewards/margins": 0.4288042187690735,
"rewards/rejected": -1.2961379289627075,
"step": 230
},
{
"epoch": 0.5023547880690737,
"grad_norm": 13.608161796310325,
"learning_rate": 1.1640243115310217e-07,
"logits/chosen": -2.263231039047241,
"logits/rejected": -2.374429225921631,
"logps/chosen": -483.5979919433594,
"logps/rejected": -511.7247009277344,
"loss": 0.5829,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8133866190910339,
"rewards/margins": 0.35704511404037476,
"rewards/rejected": -1.1704318523406982,
"step": 240
},
{
"epoch": 0.5232862375719518,
"grad_norm": 14.904992006409358,
"learning_rate": 1.0914104640422679e-07,
"logits/chosen": -2.312152862548828,
"logits/rejected": -2.504575490951538,
"logps/chosen": -487.4195861816406,
"logps/rejected": -509.62213134765625,
"loss": 0.5914,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9289507865905762,
"rewards/margins": 0.4651150703430176,
"rewards/rejected": -1.3940656185150146,
"step": 250
},
{
"epoch": 0.54421768707483,
"grad_norm": 32.859126344847056,
"learning_rate": 1.0183066268176774e-07,
"logits/chosen": -2.452216863632202,
"logits/rejected": -2.5787224769592285,
"logps/chosen": -454.101318359375,
"logps/rejected": -491.07708740234375,
"loss": 0.5958,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.8231406211853027,
"rewards/margins": 0.37211668491363525,
"rewards/rejected": -1.1952574253082275,
"step": 260
},
{
"epoch": 0.565149136577708,
"grad_norm": 16.410575278967542,
"learning_rate": 9.451046601356724e-08,
"logits/chosen": -2.4211385250091553,
"logits/rejected": -2.5718777179718018,
"logps/chosen": -482.42889404296875,
"logps/rejected": -517.08447265625,
"loss": 0.5968,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.7412260174751282,
"rewards/margins": 0.46059027314186096,
"rewards/rejected": -1.201816439628601,
"step": 270
},
{
"epoch": 0.5860805860805861,
"grad_norm": 14.64481409505789,
"learning_rate": 8.721969502803953e-08,
"logits/chosen": -2.414080858230591,
"logits/rejected": -2.641306161880493,
"logps/chosen": -471.8504943847656,
"logps/rejected": -492.3824157714844,
"loss": 0.6088,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.9498642086982727,
"rewards/margins": 0.3709770143032074,
"rewards/rejected": -1.3208411931991577,
"step": 280
},
{
"epoch": 0.6070120355834642,
"grad_norm": 21.87484189841818,
"learning_rate": 7.999743062239557e-08,
"logits/chosen": -2.5216970443725586,
"logits/rejected": -2.5266430377960205,
"logps/chosen": -452.1351623535156,
"logps/rejected": -507.50408935546875,
"loss": 0.5975,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.9256707429885864,
"rewards/margins": 0.38739797472953796,
"rewards/rejected": -1.3130687475204468,
"step": 290
},
{
"epoch": 0.6279434850863422,
"grad_norm": 13.23460942074812,
"learning_rate": 7.28823864763583e-08,
"logits/chosen": -2.3628604412078857,
"logits/rejected": -2.5071964263916016,
"logps/chosen": -530.2737426757812,
"logps/rejected": -534.9356689453125,
"loss": 0.5738,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.9033306241035461,
"rewards/margins": 0.409872442483902,
"rewards/rejected": -1.313202977180481,
"step": 300
},
{
"epoch": 0.6279434850863422,
"eval_logits/chosen": -2.3505780696868896,
"eval_logits/rejected": -2.500311851501465,
"eval_logps/chosen": -511.9820861816406,
"eval_logps/rejected": -562.04541015625,
"eval_loss": 0.5819065570831299,
"eval_rewards/accuracies": 0.7142857313156128,
"eval_rewards/chosen": -1.0721999406814575,
"eval_rewards/margins": 0.5884942412376404,
"eval_rewards/rejected": -1.6606942415237427,
"eval_runtime": 88.8035,
"eval_samples_per_second": 22.522,
"eval_steps_per_second": 0.709,
"step": 300
},
{
"epoch": 0.6488749345892203,
"grad_norm": 23.240653261962176,
"learning_rate": 6.591270153428288e-08,
"logits/chosen": -2.3066353797912598,
"logits/rejected": -2.4188685417175293,
"logps/chosen": -530.1605224609375,
"logps/rejected": -555.5882568359375,
"loss": 0.5816,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0851608514785767,
"rewards/margins": 0.6294665932655334,
"rewards/rejected": -1.7146275043487549,
"step": 310
},
{
"epoch": 0.6698063840920984,
"grad_norm": 12.35925417664361,
"learning_rate": 5.912573556804452e-08,
"logits/chosen": -2.4511845111846924,
"logits/rejected": -2.5960700511932373,
"logps/chosen": -462.8910217285156,
"logps/rejected": -492.77459716796875,
"loss": 0.5721,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9141901135444641,
"rewards/margins": 0.49542441964149475,
"rewards/rejected": -1.4096145629882812,
"step": 320
},
{
"epoch": 0.6907378335949764,
"grad_norm": 19.635922794228048,
"learning_rate": 5.255786891654399e-08,
"logits/chosen": -2.2881722450256348,
"logits/rejected": -2.3245983123779297,
"logps/chosen": -490.61956787109375,
"logps/rejected": -528.5936279296875,
"loss": 0.5831,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0118191242218018,
"rewards/margins": 0.3562072217464447,
"rewards/rejected": -1.3680263757705688,
"step": 330
},
{
"epoch": 0.7116692830978545,
"grad_norm": 34.0341920873177,
"learning_rate": 4.624430747529102e-08,
"logits/chosen": -2.2541534900665283,
"logits/rejected": -2.3677923679351807,
"logps/chosen": -520.711181640625,
"logps/rejected": -555.8665771484375,
"loss": 0.5771,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1455854177474976,
"rewards/margins": 0.44834479689598083,
"rewards/rejected": -1.5939301252365112,
"step": 340
},
{
"epoch": 0.7326007326007326,
"grad_norm": 20.184086200131315,
"learning_rate": 4.0218893981385925e-08,
"logits/chosen": -2.336240291595459,
"logits/rejected": -2.5228190422058105,
"logps/chosen": -490.032470703125,
"logps/rejected": -514.3966064453125,
"loss": 0.5772,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1221544742584229,
"rewards/margins": 0.41546517610549927,
"rewards/rejected": -1.5376195907592773,
"step": 350
},
{
"epoch": 0.7535321821036107,
"grad_norm": 14.840705395348046,
"learning_rate": 3.45139266054715e-08,
"logits/chosen": -2.3588707447052,
"logits/rejected": -2.5286855697631836,
"logps/chosen": -525.8394775390625,
"logps/rejected": -543.2139892578125,
"loss": 0.5961,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9700316190719604,
"rewards/margins": 0.42892080545425415,
"rewards/rejected": -1.3989523649215698,
"step": 360
},
{
"epoch": 0.7744636316064888,
"grad_norm": 12.56992511385935,
"learning_rate": 2.9159985823062993e-08,
"logits/chosen": -2.4362387657165527,
"logits/rejected": -2.588212251663208,
"logps/chosen": -469.63018798828125,
"logps/rejected": -491.34185791015625,
"loss": 0.5787,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9046362638473511,
"rewards/margins": 0.42833614349365234,
"rewards/rejected": -1.332972526550293,
"step": 370
},
{
"epoch": 0.7953950811093669,
"grad_norm": 14.216122099186137,
"learning_rate": 2.4185770493280577e-08,
"logits/chosen": -2.4785826206207275,
"logits/rejected": -2.5475876331329346,
"logps/chosen": -463.3335876464844,
"logps/rejected": -562.8516235351562,
"loss": 0.5816,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.0568846464157104,
"rewards/margins": 0.6403349041938782,
"rewards/rejected": -1.6972196102142334,
"step": 380
},
{
"epoch": 0.8163265306122449,
"grad_norm": 17.166403382209694,
"learning_rate": 1.9617944023656108e-08,
"logits/chosen": -2.3412299156188965,
"logits/rejected": -2.431159257888794,
"logps/chosen": -569.6896362304688,
"logps/rejected": -604.4752197265625,
"loss": 0.5647,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.135259985923767,
"rewards/margins": 0.5612015724182129,
"rewards/rejected": -1.6964616775512695,
"step": 390
},
{
"epoch": 0.837257980115123,
"grad_norm": 25.5326876410102,
"learning_rate": 1.5480991445620538e-08,
"logits/chosen": -2.438910961151123,
"logits/rejected": -2.621582269668579,
"logps/chosen": -477.71551513671875,
"logps/rejected": -516.8345336914062,
"loss": 0.5808,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0198707580566406,
"rewards/margins": 0.485908567905426,
"rewards/rejected": -1.5057791471481323,
"step": 400
},
{
"epoch": 0.837257980115123,
"eval_logits/chosen": -2.4454309940338135,
"eval_logits/rejected": -2.60603404045105,
"eval_logps/chosen": -509.0269470214844,
"eval_logps/rejected": -557.9309692382812,
"eval_loss": 0.5776250958442688,
"eval_rewards/accuracies": 0.7063491940498352,
"eval_rewards/chosen": -1.042648196220398,
"eval_rewards/margins": 0.5769018530845642,
"eval_rewards/rejected": -1.619550108909607,
"eval_runtime": 88.8844,
"eval_samples_per_second": 22.501,
"eval_steps_per_second": 0.709,
"step": 400
},
{
"epoch": 0.858189429618001,
"grad_norm": 12.2363803809367,
"learning_rate": 1.1797088166794e-08,
"logits/chosen": -2.327822208404541,
"logits/rejected": -2.539658308029175,
"logps/chosen": -523.35693359375,
"logps/rejected": -556.1873168945312,
"loss": 0.5837,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0230482816696167,
"rewards/margins": 0.5963117480278015,
"rewards/rejected": -1.6193599700927734,
"step": 410
},
{
"epoch": 0.8791208791208791,
"grad_norm": 17.1630701293683,
"learning_rate": 8.585981103608341e-09,
"logits/chosen": -2.3502843379974365,
"logits/rejected": -2.5074477195739746,
"logps/chosen": -481.4237365722656,
"logps/rejected": -559.5806884765625,
"loss": 0.567,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.0329768657684326,
"rewards/margins": 0.5681900978088379,
"rewards/rejected": -1.6011669635772705,
"step": 420
},
{
"epoch": 0.9000523286237572,
"grad_norm": 16.184790708379772,
"learning_rate": 5.864882831430273e-09,
"logits/chosen": -2.352280378341675,
"logits/rejected": -2.436026096343994,
"logps/chosen": -513.5238647460938,
"logps/rejected": -551.8958129882812,
"loss": 0.5755,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0582252740859985,
"rewards/margins": 0.5332168340682983,
"rewards/rejected": -1.5914418697357178,
"step": 430
},
{
"epoch": 0.9209837781266352,
"grad_norm": 17.526839475687186,
"learning_rate": 3.6483793195745682e-09,
"logits/chosen": -2.3311455249786377,
"logits/rejected": -2.440988063812256,
"logps/chosen": -482.4281311035156,
"logps/rejected": -498.60345458984375,
"loss": 0.5787,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0323375463485718,
"rewards/margins": 0.4054194390773773,
"rewards/rejected": -1.4377570152282715,
"step": 440
},
{
"epoch": 0.9419152276295133,
"grad_norm": 14.705602904039639,
"learning_rate": 1.9483517457776433e-09,
"logits/chosen": -2.2350025177001953,
"logits/rejected": -2.3830924034118652,
"logps/chosen": -490.513427734375,
"logps/rejected": -551.2727661132812,
"loss": 0.579,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0369895696640015,
"rewards/margins": 0.5606644153594971,
"rewards/rejected": -1.597654104232788,
"step": 450
},
{
"epoch": 0.9628466771323915,
"grad_norm": 15.228089724513376,
"learning_rate": 7.739128092312918e-10,
"logits/chosen": -2.281054973602295,
"logits/rejected": -2.4768524169921875,
"logps/chosen": -496.84814453125,
"logps/rejected": -510.46258544921875,
"loss": 0.579,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.0984748601913452,
"rewards/margins": 0.47915878891944885,
"rewards/rejected": -1.5776336193084717,
"step": 460
},
{
"epoch": 0.9837781266352695,
"grad_norm": 17.607957497609636,
"learning_rate": 1.313578835593465e-10,
"logits/chosen": -2.3311634063720703,
"logits/rejected": -2.4415996074676514,
"logps/chosen": -519.3492431640625,
"logps/rejected": -541.9041137695312,
"loss": 0.5694,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.0364539623260498,
"rewards/margins": 0.33034905791282654,
"rewards/rejected": -1.3668031692504883,
"step": 470
},
{
"epoch": 0.9984301412872841,
"step": 477,
"total_flos": 0.0,
"train_loss": 0.6095632167232361,
"train_runtime": 6900.3625,
"train_samples_per_second": 8.86,
"train_steps_per_second": 0.069
}
],
"logging_steps": 10,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}