zephyr-7b-dpo-full / trainer_state.json
RikkiXu's picture
Model save
6f9ad2c verified
raw
history blame
No virus
77.7 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 1346,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 23.545113700609754,
"learning_rate": 3.7037037037037036e-09,
"logits/chosen": -2.017277240753174,
"logits/rejected": -1.9505600929260254,
"logps/chosen": -342.8155212402344,
"logps/rejected": -264.6424865722656,
"loss": 0.6932,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 23.704110924178444,
"learning_rate": 3.7037037037037036e-08,
"logits/chosen": -1.852867603302002,
"logits/rejected": -1.7641547918319702,
"logps/chosen": -243.63710021972656,
"logps/rejected": -215.13551330566406,
"loss": 0.6933,
"rewards/accuracies": 0.4027777910232544,
"rewards/chosen": -0.0004846964729949832,
"rewards/margins": -0.001089173136278987,
"rewards/rejected": 0.0006044767214916646,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 27.48286479448467,
"learning_rate": 7.407407407407407e-08,
"logits/chosen": -1.9755146503448486,
"logits/rejected": -1.8412548303604126,
"logps/chosen": -241.4310302734375,
"logps/rejected": -210.738037109375,
"loss": 0.6927,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0005561274592764676,
"rewards/margins": 0.0004348217917140573,
"rewards/rejected": 0.00012130556569900364,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 23.49895713678948,
"learning_rate": 1.111111111111111e-07,
"logits/chosen": -1.8477449417114258,
"logits/rejected": -1.781266450881958,
"logps/chosen": -277.84527587890625,
"logps/rejected": -244.1582489013672,
"loss": 0.6915,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.005596889648586512,
"rewards/margins": 0.0021990840323269367,
"rewards/rejected": 0.003397804917767644,
"step": 30
},
{
"epoch": 0.03,
"grad_norm": 21.952979365752906,
"learning_rate": 1.4814814814814815e-07,
"logits/chosen": -1.8662084341049194,
"logits/rejected": -1.8252031803131104,
"logps/chosen": -279.81585693359375,
"logps/rejected": -256.37322998046875,
"loss": 0.6867,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.026755522936582565,
"rewards/margins": 0.01376323588192463,
"rewards/rejected": 0.01299228798598051,
"step": 40
},
{
"epoch": 0.04,
"grad_norm": 22.515894719363914,
"learning_rate": 1.8518518518518516e-07,
"logits/chosen": -1.886828064918518,
"logits/rejected": -1.796974539756775,
"logps/chosen": -245.1302490234375,
"logps/rejected": -207.6703338623047,
"loss": 0.68,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.05396401137113571,
"rewards/margins": 0.03148679807782173,
"rewards/rejected": 0.02247721515595913,
"step": 50
},
{
"epoch": 0.04,
"grad_norm": 21.11853715417876,
"learning_rate": 2.222222222222222e-07,
"logits/chosen": -1.8658056259155273,
"logits/rejected": -1.7990939617156982,
"logps/chosen": -245.4588623046875,
"logps/rejected": -228.79067993164062,
"loss": 0.6687,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.0710381492972374,
"rewards/margins": 0.053314320743083954,
"rewards/rejected": 0.01772383041679859,
"step": 60
},
{
"epoch": 0.05,
"grad_norm": 21.639022509531838,
"learning_rate": 2.5925925925925923e-07,
"logits/chosen": -1.8920536041259766,
"logits/rejected": -1.8345096111297607,
"logps/chosen": -223.96511840820312,
"logps/rejected": -196.08775329589844,
"loss": 0.6547,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.06073574349284172,
"rewards/margins": 0.08626440167427063,
"rewards/rejected": -0.02552866004407406,
"step": 70
},
{
"epoch": 0.06,
"grad_norm": 22.179495576107882,
"learning_rate": 2.962962962962963e-07,
"logits/chosen": -1.8825687170028687,
"logits/rejected": -1.847541093826294,
"logps/chosen": -232.0540313720703,
"logps/rejected": -240.20120239257812,
"loss": 0.6407,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.03458085656166077,
"rewards/margins": 0.1154135912656784,
"rewards/rejected": -0.08083274215459824,
"step": 80
},
{
"epoch": 0.07,
"grad_norm": 21.88163995061792,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": -1.9384691715240479,
"logits/rejected": -1.922488808631897,
"logps/chosen": -248.4744415283203,
"logps/rejected": -261.0725402832031,
"loss": 0.6135,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.029202425852417946,
"rewards/margins": 0.2103302925825119,
"rewards/rejected": -0.2395327389240265,
"step": 90
},
{
"epoch": 0.07,
"grad_norm": 27.693123307166786,
"learning_rate": 3.703703703703703e-07,
"logits/chosen": -1.9232885837554932,
"logits/rejected": -1.9198648929595947,
"logps/chosen": -245.3694610595703,
"logps/rejected": -275.853515625,
"loss": 0.5905,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.23111872375011444,
"rewards/margins": 0.2522026598453522,
"rewards/rejected": -0.4833213686943054,
"step": 100
},
{
"epoch": 0.07,
"eval_logits/chosen": -1.787776231765747,
"eval_logits/rejected": -1.7244033813476562,
"eval_logps/chosen": -325.57440185546875,
"eval_logps/rejected": -351.93182373046875,
"eval_loss": 0.6428781747817993,
"eval_rewards/accuracies": 0.671875,
"eval_rewards/chosen": -0.13797907531261444,
"eval_rewards/margins": 0.2060878425836563,
"eval_rewards/rejected": -0.34406691789627075,
"eval_runtime": 97.6555,
"eval_samples_per_second": 20.48,
"eval_steps_per_second": 0.328,
"step": 100
},
{
"epoch": 0.08,
"grad_norm": 33.52938589908786,
"learning_rate": 4.0740740740740737e-07,
"logits/chosen": -1.8354734182357788,
"logits/rejected": -1.7754793167114258,
"logps/chosen": -295.2403869628906,
"logps/rejected": -316.46923828125,
"loss": 0.5723,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.5448485016822815,
"rewards/margins": 0.3984159529209137,
"rewards/rejected": -0.943264365196228,
"step": 110
},
{
"epoch": 0.09,
"grad_norm": 32.42547027840792,
"learning_rate": 4.444444444444444e-07,
"logits/chosen": -1.7011499404907227,
"logits/rejected": -1.708805799484253,
"logps/chosen": -307.11334228515625,
"logps/rejected": -348.78729248046875,
"loss": 0.5442,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5989453196525574,
"rewards/margins": 0.6007151007652283,
"rewards/rejected": -1.1996605396270752,
"step": 120
},
{
"epoch": 0.1,
"grad_norm": 33.08064593315955,
"learning_rate": 4.814814814814814e-07,
"logits/chosen": -1.70786452293396,
"logits/rejected": -1.6745007038116455,
"logps/chosen": -290.42498779296875,
"logps/rejected": -343.42510986328125,
"loss": 0.5139,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.7598094344139099,
"rewards/margins": 0.6571252346038818,
"rewards/rejected": -1.4169347286224365,
"step": 130
},
{
"epoch": 0.1,
"grad_norm": 33.94320124887001,
"learning_rate": 4.999789692194508e-07,
"logits/chosen": -1.8099472522735596,
"logits/rejected": -1.754595398902893,
"logps/chosen": -314.9842224121094,
"logps/rejected": -356.81011962890625,
"loss": 0.5172,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.812475860118866,
"rewards/margins": 0.6942508816719055,
"rewards/rejected": -1.5067269802093506,
"step": 140
},
{
"epoch": 0.11,
"grad_norm": 39.07047935152003,
"learning_rate": 4.998107442045616e-07,
"logits/chosen": -1.6377861499786377,
"logits/rejected": -1.6226139068603516,
"logps/chosen": -304.92840576171875,
"logps/rejected": -393.1883239746094,
"loss": 0.5094,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.8283722996711731,
"rewards/margins": 0.8278924822807312,
"rewards/rejected": -1.6562646627426147,
"step": 150
},
{
"epoch": 0.12,
"grad_norm": 42.785505208166626,
"learning_rate": 4.994744073829293e-07,
"logits/chosen": -1.5746722221374512,
"logits/rejected": -1.4142063856124878,
"logps/chosen": -343.25823974609375,
"logps/rejected": -402.02691650390625,
"loss": 0.5011,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.8369730710983276,
"rewards/margins": 0.8556060791015625,
"rewards/rejected": -1.6925792694091797,
"step": 160
},
{
"epoch": 0.13,
"grad_norm": 48.274083606893925,
"learning_rate": 4.989701850946613e-07,
"logits/chosen": -1.5056556463241577,
"logits/rejected": -1.3766965866088867,
"logps/chosen": -335.7103271484375,
"logps/rejected": -388.94097900390625,
"loss": 0.4643,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.9376843571662903,
"rewards/margins": 0.8313838243484497,
"rewards/rejected": -1.7690680027008057,
"step": 170
},
{
"epoch": 0.13,
"grad_norm": 46.176765511998994,
"learning_rate": 4.982984166595104e-07,
"logits/chosen": -1.4761296510696411,
"logits/rejected": -1.3599636554718018,
"logps/chosen": -408.171630859375,
"logps/rejected": -472.0873107910156,
"loss": 0.4577,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.2097257375717163,
"rewards/margins": 1.240505576133728,
"rewards/rejected": -2.4502310752868652,
"step": 180
},
{
"epoch": 0.14,
"grad_norm": 43.28509926988276,
"learning_rate": 4.974595541485259e-07,
"logits/chosen": -1.3221380710601807,
"logits/rejected": -1.204590082168579,
"logps/chosen": -335.5089416503906,
"logps/rejected": -428.30621337890625,
"loss": 0.4635,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.076790452003479,
"rewards/margins": 1.0969324111938477,
"rewards/rejected": -2.173722743988037,
"step": 190
},
{
"epoch": 0.15,
"grad_norm": 56.09927596713516,
"learning_rate": 4.964541620798307e-07,
"logits/chosen": -1.2160365581512451,
"logits/rejected": -1.118375539779663,
"logps/chosen": -348.90753173828125,
"logps/rejected": -468.21563720703125,
"loss": 0.4495,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.2727657556533813,
"rewards/margins": 1.1830675601959229,
"rewards/rejected": -2.4558334350585938,
"step": 200
},
{
"epoch": 0.15,
"eval_logits/chosen": -1.4371435642242432,
"eval_logits/rejected": -1.366525650024414,
"eval_logps/chosen": -361.1814880371094,
"eval_logps/rejected": -427.2509765625,
"eval_loss": 0.559985339641571,
"eval_rewards/accuracies": 0.74609375,
"eval_rewards/chosen": -0.4940495491027832,
"eval_rewards/margins": 0.6032084226608276,
"eval_rewards/rejected": -1.0972579717636108,
"eval_runtime": 97.4901,
"eval_samples_per_second": 20.515,
"eval_steps_per_second": 0.328,
"step": 200
},
{
"epoch": 0.16,
"grad_norm": 49.36366262587358,
"learning_rate": 4.952829170387241e-07,
"logits/chosen": -1.1800302267074585,
"logits/rejected": -1.0126550197601318,
"logps/chosen": -380.48828125,
"logps/rejected": -450.0765075683594,
"loss": 0.4458,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.3101383447647095,
"rewards/margins": 0.9806028604507446,
"rewards/rejected": -2.290741443634033,
"step": 210
},
{
"epoch": 0.16,
"grad_norm": 57.25684926546983,
"learning_rate": 4.939466072223697e-07,
"logits/chosen": -1.2157623767852783,
"logits/rejected": -1.0489680767059326,
"logps/chosen": -372.591064453125,
"logps/rejected": -468.7542419433594,
"loss": 0.4545,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3519532680511475,
"rewards/margins": 1.1502256393432617,
"rewards/rejected": -2.50217866897583,
"step": 220
},
{
"epoch": 0.17,
"grad_norm": 40.98752146946231,
"learning_rate": 4.924461319093725e-07,
"logits/chosen": -1.1049861907958984,
"logits/rejected": -1.0018864870071411,
"logps/chosen": -361.7793884277344,
"logps/rejected": -487.15460205078125,
"loss": 0.4436,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1743983030319214,
"rewards/margins": 1.1021788120269775,
"rewards/rejected": -2.2765772342681885,
"step": 230
},
{
"epoch": 0.18,
"grad_norm": 57.39176618017778,
"learning_rate": 4.907825008546038e-07,
"logits/chosen": -0.7271394729614258,
"logits/rejected": -0.6813848614692688,
"logps/chosen": -377.90118408203125,
"logps/rejected": -523.9625244140625,
"loss": 0.4333,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4791629314422607,
"rewards/margins": 1.4326350688934326,
"rewards/rejected": -2.9117980003356934,
"step": 240
},
{
"epoch": 0.19,
"grad_norm": 51.26102709104704,
"learning_rate": 4.889568336096795e-07,
"logits/chosen": -0.5312275290489197,
"logits/rejected": -0.37771934270858765,
"logps/chosen": -381.1251220703125,
"logps/rejected": -479.7431640625,
"loss": 0.4272,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5479203462600708,
"rewards/margins": 1.1352421045303345,
"rewards/rejected": -2.6831624507904053,
"step": 250
},
{
"epoch": 0.19,
"grad_norm": 46.69946748969463,
"learning_rate": 4.869703587695508e-07,
"logits/chosen": -0.44748228788375854,
"logits/rejected": -0.18481455743312836,
"logps/chosen": -379.5589904785156,
"logps/rejected": -527.2100830078125,
"loss": 0.4464,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.412641167640686,
"rewards/margins": 1.667824149131775,
"rewards/rejected": -3.080465793609619,
"step": 260
},
{
"epoch": 0.2,
"grad_norm": 40.8957837906737,
"learning_rate": 4.848244131457127e-07,
"logits/chosen": -0.9530747532844543,
"logits/rejected": -0.6137160062789917,
"logps/chosen": -400.1986083984375,
"logps/rejected": -499.60308837890625,
"loss": 0.4211,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.4335994720458984,
"rewards/margins": 1.4832035303115845,
"rewards/rejected": -2.9168028831481934,
"step": 270
},
{
"epoch": 0.21,
"grad_norm": 45.308995144235396,
"learning_rate": 4.825204408665877e-07,
"logits/chosen": -1.2076747417449951,
"logits/rejected": -0.9289032220840454,
"logps/chosen": -426.99114990234375,
"logps/rejected": -532.0573120117188,
"loss": 0.4124,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.4818888902664185,
"rewards/margins": 1.4990845918655396,
"rewards/rejected": -2.980973720550537,
"step": 280
},
{
"epoch": 0.22,
"grad_norm": 57.75176826411474,
"learning_rate": 4.800599924056907e-07,
"logits/chosen": -0.7638604044914246,
"logits/rejected": -0.7332445383071899,
"logps/chosen": -383.2490539550781,
"logps/rejected": -556.2003784179688,
"loss": 0.3833,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5847924947738647,
"rewards/margins": 1.5942741632461548,
"rewards/rejected": -3.1790668964385986,
"step": 290
},
{
"epoch": 0.22,
"grad_norm": 45.582764097748154,
"learning_rate": 4.774447235382259e-07,
"logits/chosen": -0.5798165202140808,
"logits/rejected": -0.5653051733970642,
"logps/chosen": -411.58154296875,
"logps/rejected": -582.2734375,
"loss": 0.3963,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.766920804977417,
"rewards/margins": 1.7389370203018188,
"rewards/rejected": -3.5058579444885254,
"step": 300
},
{
"epoch": 0.22,
"eval_logits/chosen": -1.4608731269836426,
"eval_logits/rejected": -1.2769949436187744,
"eval_logps/chosen": -423.00341796875,
"eval_logps/rejected": -521.115478515625,
"eval_loss": 0.5291498303413391,
"eval_rewards/accuracies": 0.7421875,
"eval_rewards/chosen": -1.1122692823410034,
"eval_rewards/margins": 0.9236345291137695,
"eval_rewards/rejected": -2.0359039306640625,
"eval_runtime": 97.2217,
"eval_samples_per_second": 20.572,
"eval_steps_per_second": 0.329,
"step": 300
},
{
"epoch": 0.23,
"grad_norm": 42.82644939529418,
"learning_rate": 4.7467639422682426e-07,
"logits/chosen": -0.6843788623809814,
"logits/rejected": -0.46269315481185913,
"logps/chosen": -417.7638244628906,
"logps/rejected": -573.83837890625,
"loss": 0.4006,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.8430830240249634,
"rewards/margins": 1.669550895690918,
"rewards/rejected": -3.512633800506592,
"step": 310
},
{
"epoch": 0.24,
"grad_norm": 55.146360598406936,
"learning_rate": 4.7175686743716223e-07,
"logits/chosen": -1.140579104423523,
"logits/rejected": -0.8973017930984497,
"logps/chosen": -419.18048095703125,
"logps/rejected": -527.0257568359375,
"loss": 0.405,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.4635722637176514,
"rewards/margins": 1.3773781061172485,
"rewards/rejected": -2.8409504890441895,
"step": 320
},
{
"epoch": 0.25,
"grad_norm": 45.88101703811544,
"learning_rate": 4.686881078842688e-07,
"logits/chosen": -1.0653458833694458,
"logits/rejected": -0.8751330375671387,
"logps/chosen": -386.37335205078125,
"logps/rejected": -510.29949951171875,
"loss": 0.3899,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.47976553440094,
"rewards/margins": 1.366317868232727,
"rewards/rejected": -2.846083164215088,
"step": 330
},
{
"epoch": 0.25,
"grad_norm": 58.11307992254104,
"learning_rate": 4.654721807103558e-07,
"logits/chosen": -0.5151967406272888,
"logits/rejected": -0.14977958798408508,
"logps/chosen": -400.7736511230469,
"logps/rejected": -529.3316650390625,
"loss": 0.3938,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.7082515954971313,
"rewards/margins": 1.6958554983139038,
"rewards/rejected": -3.404106855392456,
"step": 340
},
{
"epoch": 0.26,
"grad_norm": 48.499175539211535,
"learning_rate": 4.621112500950678e-07,
"logits/chosen": -0.8198322057723999,
"logits/rejected": -0.5934363603591919,
"logps/chosen": -429.72113037109375,
"logps/rejected": -547.5772705078125,
"loss": 0.3843,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.8615728616714478,
"rewards/margins": 1.499329924583435,
"rewards/rejected": -3.3609023094177246,
"step": 350
},
{
"epoch": 0.27,
"grad_norm": 55.599844022581365,
"learning_rate": 4.5860757779908225e-07,
"logits/chosen": -1.0455310344696045,
"logits/rejected": -0.6826554536819458,
"logps/chosen": -413.38739013671875,
"logps/rejected": -542.2623291015625,
"loss": 0.3736,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5897157192230225,
"rewards/margins": 1.6853986978530884,
"rewards/rejected": -3.2751145362854004,
"step": 360
},
{
"epoch": 0.27,
"grad_norm": 74.71151634556864,
"learning_rate": 4.5496352164204304e-07,
"logits/chosen": -0.4619407057762146,
"logits/rejected": -0.23415322601795197,
"logps/chosen": -426.197998046875,
"logps/rejected": -620.7210693359375,
"loss": 0.3997,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.0138180255889893,
"rewards/margins": 2.0114035606384277,
"rewards/rejected": -4.025221347808838,
"step": 370
},
{
"epoch": 0.28,
"grad_norm": 46.835706945950214,
"learning_rate": 4.5118153391584966e-07,
"logits/chosen": -0.7893734574317932,
"logits/rejected": -0.5286726951599121,
"logps/chosen": -348.12554931640625,
"logps/rejected": -483.89215087890625,
"loss": 0.3909,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.0020155906677246,
"rewards/margins": 1.7324419021606445,
"rewards/rejected": -2.734457492828369,
"step": 380
},
{
"epoch": 0.29,
"grad_norm": 51.06658825135186,
"learning_rate": 4.472641597343713e-07,
"logits/chosen": -0.5109713077545166,
"logits/rejected": -0.07112047076225281,
"logps/chosen": -389.3044738769531,
"logps/rejected": -567.7926635742188,
"loss": 0.3846,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.6159217357635498,
"rewards/margins": 1.9207748174667358,
"rewards/rejected": -3.536696672439575,
"step": 390
},
{
"epoch": 0.3,
"grad_norm": 44.181665144710905,
"learning_rate": 4.4321403532069523e-07,
"logits/chosen": -0.5097373127937317,
"logits/rejected": -0.2719523012638092,
"logps/chosen": -353.91278076171875,
"logps/rejected": -517.2376708984375,
"loss": 0.4012,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5704162120819092,
"rewards/margins": 1.8435367345809937,
"rewards/rejected": -3.4139533042907715,
"step": 400
},
{
"epoch": 0.3,
"eval_logits/chosen": -1.3372514247894287,
"eval_logits/rejected": -1.1222751140594482,
"eval_logps/chosen": -417.65863037109375,
"eval_logps/rejected": -516.7505493164062,
"eval_loss": 0.5314938426017761,
"eval_rewards/accuracies": 0.7734375,
"eval_rewards/chosen": -1.058821201324463,
"eval_rewards/margins": 0.9334329962730408,
"eval_rewards/rejected": -1.9922541379928589,
"eval_runtime": 97.4658,
"eval_samples_per_second": 20.52,
"eval_steps_per_second": 0.328,
"step": 400
},
{
"epoch": 0.3,
"grad_norm": 50.26869622592037,
"learning_rate": 4.390338862330631e-07,
"logits/chosen": -0.7592865824699402,
"logits/rejected": -0.4464483857154846,
"logps/chosen": -401.47607421875,
"logps/rejected": -523.3784790039062,
"loss": 0.3803,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7566916942596436,
"rewards/margins": 1.5606569051742554,
"rewards/rejected": -3.3173484802246094,
"step": 410
},
{
"epoch": 0.31,
"grad_norm": 51.57934206296598,
"learning_rate": 4.3472652553068835e-07,
"logits/chosen": -0.6644355654716492,
"logits/rejected": -0.23346371948719025,
"logps/chosen": -404.8458557128906,
"logps/rejected": -540.8956298828125,
"loss": 0.3797,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.7600839138031006,
"rewards/margins": 1.6869585514068604,
"rewards/rejected": -3.4470419883728027,
"step": 420
},
{
"epoch": 0.32,
"grad_norm": 73.04228089758476,
"learning_rate": 4.3029485188068895e-07,
"logits/chosen": 0.10370206832885742,
"logits/rejected": 0.39608412981033325,
"logps/chosen": -385.42498779296875,
"logps/rejected": -570.5172729492188,
"loss": 0.3655,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.839719533920288,
"rewards/margins": 1.714897871017456,
"rewards/rejected": -3.5546176433563232,
"step": 430
},
{
"epoch": 0.33,
"grad_norm": 54.512857623037554,
"learning_rate": 4.257418476074103e-07,
"logits/chosen": -0.023069072514772415,
"logits/rejected": 0.3960541784763336,
"logps/chosen": -423.490478515625,
"logps/rejected": -592.7897338867188,
"loss": 0.3638,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7877943515777588,
"rewards/margins": 2.115088701248169,
"rewards/rejected": -3.9028830528259277,
"step": 440
},
{
"epoch": 0.33,
"grad_norm": 55.7162708155443,
"learning_rate": 4.210705766854504e-07,
"logits/chosen": 0.15324774384498596,
"logits/rejected": 0.521506667137146,
"logps/chosen": -456.01776123046875,
"logps/rejected": -625.3338623046875,
"loss": 0.352,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.096989870071411,
"rewards/margins": 1.874829649925232,
"rewards/rejected": -3.9718196392059326,
"step": 450
},
{
"epoch": 0.34,
"grad_norm": 51.50110954656292,
"learning_rate": 4.16284182677737e-07,
"logits/chosen": 0.3847750127315521,
"logits/rejected": 0.9687877893447876,
"logps/chosen": -421.48321533203125,
"logps/rejected": -571.6495361328125,
"loss": 0.3771,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7761863470077515,
"rewards/margins": 1.777931809425354,
"rewards/rejected": -3.5541183948516846,
"step": 460
},
{
"epoch": 0.35,
"grad_norm": 42.17081561639591,
"learning_rate": 4.113858866200466e-07,
"logits/chosen": 0.5899291634559631,
"logits/rejected": 0.9651363492012024,
"logps/chosen": -411.4060974121094,
"logps/rejected": -587.0046997070312,
"loss": 0.3551,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -1.751307725906372,
"rewards/margins": 1.814639687538147,
"rewards/rejected": -3.5659472942352295,
"step": 470
},
{
"epoch": 0.36,
"grad_norm": 48.02610054790726,
"learning_rate": 4.063789848533865e-07,
"logits/chosen": 0.46232396364212036,
"logits/rejected": 1.0872290134429932,
"logps/chosen": -472.24139404296875,
"logps/rejected": -634.9567260742188,
"loss": 0.374,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.287501573562622,
"rewards/margins": 1.8356859683990479,
"rewards/rejected": -4.123187065124512,
"step": 480
},
{
"epoch": 0.36,
"grad_norm": 45.88835702974933,
"learning_rate": 4.0126684680570074e-07,
"logits/chosen": -0.3817380368709564,
"logits/rejected": 0.1566486358642578,
"logps/chosen": -461.13934326171875,
"logps/rejected": -592.1519165039062,
"loss": 0.334,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -1.8447940349578857,
"rewards/margins": 1.7669038772583008,
"rewards/rejected": -3.6116981506347656,
"step": 490
},
{
"epoch": 0.37,
"grad_norm": 53.85769217498667,
"learning_rate": 3.960529127243902e-07,
"logits/chosen": -0.31509625911712646,
"logits/rejected": -0.04504912719130516,
"logps/chosen": -477.027099609375,
"logps/rejected": -654.2672119140625,
"loss": 0.3559,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -2.053821086883545,
"rewards/margins": 2.070889711380005,
"rewards/rejected": -4.124711036682129,
"step": 500
},
{
"epoch": 0.37,
"eval_logits/chosen": -1.0066841840744019,
"eval_logits/rejected": -0.6833571791648865,
"eval_logps/chosen": -456.0086364746094,
"eval_logps/rejected": -568.9822387695312,
"eval_loss": 0.5275729894638062,
"eval_rewards/accuracies": 0.7578125,
"eval_rewards/chosen": -1.4423211812973022,
"eval_rewards/margins": 1.0722503662109375,
"eval_rewards/rejected": -2.5145716667175293,
"eval_runtime": 97.6519,
"eval_samples_per_second": 20.481,
"eval_steps_per_second": 0.328,
"step": 500
},
{
"epoch": 0.38,
"grad_norm": 53.47947486686438,
"learning_rate": 3.9074069136117594e-07,
"logits/chosen": -0.6587181687355042,
"logits/rejected": -0.11707913875579834,
"logps/chosen": -478.9352111816406,
"logps/rejected": -631.669921875,
"loss": 0.35,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.0555968284606934,
"rewards/margins": 1.9847618341445923,
"rewards/rejected": -4.040358543395996,
"step": 510
},
{
"epoch": 0.39,
"grad_norm": 48.01190508303512,
"learning_rate": 3.8533375761086094e-07,
"logits/chosen": -0.6520954966545105,
"logits/rejected": -0.19666698575019836,
"logps/chosen": -399.66455078125,
"logps/rejected": -589.08251953125,
"loss": 0.3518,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.5765998363494873,
"rewards/margins": 2.0024795532226562,
"rewards/rejected": -3.5790793895721436,
"step": 520
},
{
"epoch": 0.39,
"grad_norm": 58.201909693922666,
"learning_rate": 3.79835750105581e-07,
"logits/chosen": -0.015231219120323658,
"logits/rejected": 0.524590253829956,
"logps/chosen": -425.837890625,
"logps/rejected": -576.46630859375,
"loss": 0.364,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.9596973657608032,
"rewards/margins": 1.918087363243103,
"rewards/rejected": -3.8777847290039062,
"step": 530
},
{
"epoch": 0.4,
"grad_norm": 53.67325387574443,
"learning_rate": 3.742503687661627e-07,
"logits/chosen": 0.3345823585987091,
"logits/rejected": 0.8041492700576782,
"logps/chosen": -436.06170654296875,
"logps/rejected": -628.6650390625,
"loss": 0.3413,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.163074016571045,
"rewards/margins": 2.0728249549865723,
"rewards/rejected": -4.235899925231934,
"step": 540
},
{
"epoch": 0.41,
"grad_norm": 54.5126564713129,
"learning_rate": 3.685813723122372e-07,
"logits/chosen": 0.6497628688812256,
"logits/rejected": 1.1682524681091309,
"logps/chosen": -425.30157470703125,
"logps/rejected": -617.69482421875,
"loss": 0.3365,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -1.9300180673599243,
"rewards/margins": 2.057875394821167,
"rewards/rejected": -3.987893581390381,
"step": 550
},
{
"epoch": 0.42,
"grad_norm": 62.74924566191948,
"learning_rate": 3.6283257573278466e-07,
"logits/chosen": 0.867998480796814,
"logits/rejected": 1.330685019493103,
"logps/chosen": -455.71124267578125,
"logps/rejected": -659.052978515625,
"loss": 0.3223,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -2.0765323638916016,
"rewards/margins": 2.156247615814209,
"rewards/rejected": -4.2327799797058105,
"step": 560
},
{
"epoch": 0.42,
"grad_norm": 48.6969642598068,
"learning_rate": 3.5700784771881224e-07,
"logits/chosen": 1.0166234970092773,
"logits/rejected": 1.6870880126953125,
"logps/chosen": -478.86407470703125,
"logps/rejected": -635.7424926757812,
"loss": 0.3382,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.4357941150665283,
"rewards/margins": 1.9054218530654907,
"rewards/rejected": -4.341216087341309,
"step": 570
},
{
"epoch": 0.43,
"grad_norm": 43.243072977055355,
"learning_rate": 3.511111080598925e-07,
"logits/chosen": 0.6339820623397827,
"logits/rejected": 1.3627948760986328,
"logps/chosen": -447.268798828125,
"logps/rejected": -636.5888671875,
"loss": 0.3276,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.9055280685424805,
"rewards/margins": 2.3114867210388184,
"rewards/rejected": -4.217014312744141,
"step": 580
},
{
"epoch": 0.44,
"grad_norm": 69.40196325230258,
"learning_rate": 3.451463250063146e-07,
"logits/chosen": 0.8395903706550598,
"logits/rejected": 1.488012671470642,
"logps/chosen": -432.853271484375,
"logps/rejected": -630.223876953125,
"loss": 0.3378,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.9797086715698242,
"rewards/margins": 2.143889904022217,
"rewards/rejected": -4.123598098754883,
"step": 590
},
{
"epoch": 0.45,
"grad_norm": 59.19017069860126,
"learning_rate": 3.3911751259862403e-07,
"logits/chosen": 0.9315579533576965,
"logits/rejected": 1.3961995840072632,
"logps/chosen": -493.1189880371094,
"logps/rejected": -684.4100341796875,
"loss": 0.3291,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -2.291141986846924,
"rewards/margins": 2.0969302654266357,
"rewards/rejected": -4.3880720138549805,
"step": 600
},
{
"epoch": 0.45,
"eval_logits/chosen": -0.2334394007921219,
"eval_logits/rejected": 0.188625305891037,
"eval_logps/chosen": -477.9444580078125,
"eval_logps/rejected": -595.6332397460938,
"eval_loss": 0.5102677941322327,
"eval_rewards/accuracies": 0.76953125,
"eval_rewards/chosen": -1.6616793870925903,
"eval_rewards/margins": 1.1194015741348267,
"eval_rewards/rejected": -2.781080961227417,
"eval_runtime": 97.2562,
"eval_samples_per_second": 20.564,
"eval_steps_per_second": 0.329,
"step": 600
},
{
"epoch": 0.45,
"grad_norm": 37.653590501774474,
"learning_rate": 3.3302872796634754e-07,
"logits/chosen": 0.9580332040786743,
"logits/rejected": 1.3357497453689575,
"logps/chosen": -427.964111328125,
"logps/rejected": -620.7327880859375,
"loss": 0.3122,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -1.95559823513031,
"rewards/margins": 2.1169991493225098,
"rewards/rejected": -4.072597503662109,
"step": 610
},
{
"epoch": 0.46,
"grad_norm": 47.96131506831022,
"learning_rate": 3.2688406859772035e-07,
"logits/chosen": 0.8878351449966431,
"logits/rejected": 1.4351171255111694,
"logps/chosen": -489.7989196777344,
"logps/rejected": -665.8047485351562,
"loss": 0.3224,
"rewards/accuracies": 0.84375,
"rewards/chosen": -2.195067882537842,
"rewards/margins": 2.1086602210998535,
"rewards/rejected": -4.3037285804748535,
"step": 620
},
{
"epoch": 0.47,
"grad_norm": 65.32009143781127,
"learning_rate": 3.206876695822541e-07,
"logits/chosen": 1.3710159063339233,
"logits/rejected": 1.7163244485855103,
"logps/chosen": -493.956298828125,
"logps/rejected": -688.6646728515625,
"loss": 0.3129,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -2.438476085662842,
"rewards/margins": 2.2680106163024902,
"rewards/rejected": -4.706486701965332,
"step": 630
},
{
"epoch": 0.48,
"grad_norm": 66.03238810693847,
"learning_rate": 3.144437008280012e-07,
"logits/chosen": 0.709919273853302,
"logits/rejected": 1.0818461179733276,
"logps/chosen": -468.56890869140625,
"logps/rejected": -691.1434326171875,
"loss": 0.3232,
"rewards/accuracies": 0.90625,
"rewards/chosen": -2.252897262573242,
"rewards/margins": 2.3767807483673096,
"rewards/rejected": -4.629677772521973,
"step": 640
},
{
"epoch": 0.48,
"grad_norm": 47.885060646853404,
"learning_rate": 3.0815636425538665e-07,
"logits/chosen": 1.0194989442825317,
"logits/rejected": 1.571274995803833,
"logps/chosen": -446.6681213378906,
"logps/rejected": -611.84033203125,
"loss": 0.3429,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.190187454223633,
"rewards/margins": 2.0423951148986816,
"rewards/rejected": -4.232582092285156,
"step": 650
},
{
"epoch": 0.49,
"grad_norm": 59.75526535732341,
"learning_rate": 3.018298909694986e-07,
"logits/chosen": 1.3580573797225952,
"logits/rejected": 1.913851022720337,
"logps/chosen": -489.56982421875,
"logps/rejected": -673.2572021484375,
"loss": 0.3288,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -2.595083236694336,
"rewards/margins": 2.0307328701019287,
"rewards/rejected": -4.6258158683776855,
"step": 660
},
{
"epoch": 0.5,
"grad_norm": 51.20761564052719,
"learning_rate": 2.954685384127371e-07,
"logits/chosen": 0.8674410581588745,
"logits/rejected": 1.4072096347808838,
"logps/chosen": -482.65789794921875,
"logps/rejected": -649.311279296875,
"loss": 0.301,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -2.194945812225342,
"rewards/margins": 2.093947172164917,
"rewards/rejected": -4.288893222808838,
"step": 670
},
{
"epoch": 0.51,
"grad_norm": 62.65952308868226,
"learning_rate": 2.8907658749974054e-07,
"logits/chosen": 0.9979363679885864,
"logits/rejected": 1.4131087064743042,
"logps/chosen": -457.8363342285156,
"logps/rejected": -703.2235107421875,
"loss": 0.2929,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.264411449432373,
"rewards/margins": 2.5431039333343506,
"rewards/rejected": -4.807515621185303,
"step": 680
},
{
"epoch": 0.51,
"grad_norm": 49.65473672539794,
"learning_rate": 2.8265833973651503e-07,
"logits/chosen": 0.6275979280471802,
"logits/rejected": 1.0561200380325317,
"logps/chosen": -459.69976806640625,
"logps/rejected": -684.1864013671875,
"loss": 0.2859,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.8421128988265991,
"rewards/margins": 2.5259382724761963,
"rewards/rejected": -4.368051528930664,
"step": 690
},
{
"epoch": 0.52,
"grad_norm": 48.72864396453521,
"learning_rate": 2.7621811432570736e-07,
"logits/chosen": 0.8585799336433411,
"logits/rejected": 1.5937745571136475,
"logps/chosen": -518.5455932617188,
"logps/rejected": -734.5382690429688,
"loss": 0.2735,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -2.441080093383789,
"rewards/margins": 2.6617679595947266,
"rewards/rejected": -5.102847576141357,
"step": 700
},
{
"epoch": 0.52,
"eval_logits/chosen": 0.18704134225845337,
"eval_logits/rejected": 0.6721899509429932,
"eval_logps/chosen": -541.279541015625,
"eval_logps/rejected": -687.587158203125,
"eval_loss": 0.5288776159286499,
"eval_rewards/accuracies": 0.76171875,
"eval_rewards/chosen": -2.2950310707092285,
"eval_rewards/margins": 1.40558922290802,
"eval_rewards/rejected": -3.70061993598938,
"eval_runtime": 97.5006,
"eval_samples_per_second": 20.513,
"eval_steps_per_second": 0.328,
"step": 700
},
{
"epoch": 0.53,
"grad_norm": 50.62866425523001,
"learning_rate": 2.6976024525996917e-07,
"logits/chosen": 1.1524347066879272,
"logits/rejected": 1.7467842102050781,
"logps/chosen": -503.6927795410156,
"logps/rejected": -780.6187744140625,
"loss": 0.286,
"rewards/accuracies": 0.90625,
"rewards/chosen": -2.7125723361968994,
"rewards/margins": 2.8134512901306152,
"rewards/rejected": -5.5260233879089355,
"step": 710
},
{
"epoch": 0.53,
"grad_norm": 56.03367218705217,
"learning_rate": 2.6328907840536706e-07,
"logits/chosen": 0.7062090039253235,
"logits/rejected": 1.2199087142944336,
"logps/chosen": -460.45794677734375,
"logps/rejected": -685.5617065429688,
"loss": 0.3244,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -2.43827748298645,
"rewards/margins": 2.252427577972412,
"rewards/rejected": -4.690704822540283,
"step": 720
},
{
"epoch": 0.54,
"grad_norm": 57.82647372234183,
"learning_rate": 2.568089685768038e-07,
"logits/chosen": 0.6572129130363464,
"logits/rejected": 1.0754339694976807,
"logps/chosen": -530.2496337890625,
"logps/rejected": -698.03662109375,
"loss": 0.313,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.59128475189209,
"rewards/margins": 2.117705821990967,
"rewards/rejected": -4.708990573883057,
"step": 730
},
{
"epoch": 0.55,
"grad_norm": 50.473574423912424,
"learning_rate": 2.503242766074156e-07,
"logits/chosen": 0.42826253175735474,
"logits/rejected": 1.0195951461791992,
"logps/chosen": -451.046142578125,
"logps/rejected": -653.2913818359375,
"loss": 0.2898,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.9979403018951416,
"rewards/margins": 2.318507432937622,
"rewards/rejected": -4.316447734832764,
"step": 740
},
{
"epoch": 0.56,
"grad_norm": 61.13648555404995,
"learning_rate": 2.4383936641392136e-07,
"logits/chosen": 0.6429548859596252,
"logits/rejected": 1.103127360343933,
"logps/chosen": -467.82049560546875,
"logps/rejected": -702.5692749023438,
"loss": 0.2975,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -2.0785393714904785,
"rewards/margins": 2.386026382446289,
"rewards/rejected": -4.464566230773926,
"step": 750
},
{
"epoch": 0.56,
"grad_norm": 51.760001565819636,
"learning_rate": 2.3735860205989493e-07,
"logits/chosen": 0.7451823353767395,
"logits/rejected": 1.1489431858062744,
"logps/chosen": -462.767333984375,
"logps/rejected": -706.5615234375,
"loss": 0.2627,
"rewards/accuracies": 0.90625,
"rewards/chosen": -2.312885284423828,
"rewards/margins": 2.6091692447662354,
"rewards/rejected": -4.922054767608643,
"step": 760
},
{
"epoch": 0.57,
"grad_norm": 56.13632726849474,
"learning_rate": 2.308863448189402e-07,
"logits/chosen": 0.5960752367973328,
"logits/rejected": 1.0421712398529053,
"logps/chosen": -498.1941833496094,
"logps/rejected": -695.0504760742188,
"loss": 0.2811,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -2.29612398147583,
"rewards/margins": 2.4551825523376465,
"rewards/rejected": -4.751306533813477,
"step": 770
},
{
"epoch": 0.58,
"grad_norm": 67.7549300842345,
"learning_rate": 2.2442695023974246e-07,
"logits/chosen": 0.6856900453567505,
"logits/rejected": 1.3306076526641846,
"logps/chosen": -444.3168029785156,
"logps/rejected": -679.816650390625,
"loss": 0.2713,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -2.0717947483062744,
"rewards/margins": 2.6752490997314453,
"rewards/rejected": -4.747043609619141,
"step": 780
},
{
"epoch": 0.59,
"grad_norm": 55.628538802719504,
"learning_rate": 2.179847652149729e-07,
"logits/chosen": 0.7401930093765259,
"logits/rejected": 1.288172960281372,
"logps/chosen": -496.6468811035156,
"logps/rejected": -687.7960205078125,
"loss": 0.295,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.4609100818634033,
"rewards/margins": 2.223629951477051,
"rewards/rejected": -4.684540271759033,
"step": 790
},
{
"epoch": 0.59,
"grad_norm": 63.651106043315345,
"learning_rate": 2.115641250560183e-07,
"logits/chosen": 0.8801604509353638,
"logits/rejected": 1.5266039371490479,
"logps/chosen": -473.2115173339844,
"logps/rejected": -701.8800659179688,
"loss": 0.2752,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -2.4201507568359375,
"rewards/margins": 2.4442293643951416,
"rewards/rejected": -4.864380836486816,
"step": 800
},
{
"epoch": 0.59,
"eval_logits/chosen": -0.16280797123908997,
"eval_logits/rejected": 0.2751551866531372,
"eval_logps/chosen": -533.1201782226562,
"eval_logps/rejected": -668.2235717773438,
"eval_loss": 0.5228938460350037,
"eval_rewards/accuracies": 0.765625,
"eval_rewards/chosen": -2.2134366035461426,
"eval_rewards/margins": 1.2935477495193481,
"eval_rewards/rejected": -3.506984233856201,
"eval_runtime": 97.387,
"eval_samples_per_second": 20.537,
"eval_steps_per_second": 0.329,
"step": 800
},
{
"epoch": 0.6,
"grad_norm": 70.2608582618962,
"learning_rate": 2.051693505755042e-07,
"logits/chosen": 0.8354732394218445,
"logits/rejected": 1.2750941514968872,
"logps/chosen": -461.49786376953125,
"logps/rejected": -705.8599853515625,
"loss": 0.2946,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -2.4096267223358154,
"rewards/margins": 2.483677864074707,
"rewards/rejected": -4.893305778503418,
"step": 810
},
{
"epoch": 0.61,
"grad_norm": 49.246802198712466,
"learning_rate": 1.9880474517957542e-07,
"logits/chosen": 0.9254199862480164,
"logits/rejected": 1.563522458076477,
"logps/chosen": -481.2748107910156,
"logps/rejected": -658.328125,
"loss": 0.2674,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -2.385385036468506,
"rewards/margins": 2.1492881774902344,
"rewards/rejected": -4.53467321395874,
"step": 820
},
{
"epoch": 0.62,
"grad_norm": 88.28145029556197,
"learning_rate": 1.9247459197189e-07,
"logits/chosen": 0.8668380975723267,
"logits/rejected": 1.5001232624053955,
"logps/chosen": -488.27685546875,
"logps/rejected": -680.9069213867188,
"loss": 0.2652,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -2.6699295043945312,
"rewards/margins": 2.2055306434631348,
"rewards/rejected": -4.875459671020508,
"step": 830
},
{
"epoch": 0.62,
"grad_norm": 43.13543734061108,
"learning_rate": 1.8618315087127602e-07,
"logits/chosen": 0.6826521754264832,
"logits/rejected": 1.2443543672561646,
"logps/chosen": -499.20892333984375,
"logps/rejected": -706.3511962890625,
"loss": 0.2563,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -2.4423558712005615,
"rewards/margins": 2.461874485015869,
"rewards/rejected": -4.904230117797852,
"step": 840
},
{
"epoch": 0.63,
"grad_norm": 56.63843357010467,
"learning_rate": 1.7993465574499102e-07,
"logits/chosen": 0.5323538184165955,
"logits/rejected": 1.2176125049591064,
"logps/chosen": -463.47857666015625,
"logps/rejected": -663.4465942382812,
"loss": 0.2759,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -2.189335823059082,
"rewards/margins": 2.420409679412842,
"rewards/rejected": -4.609745502471924,
"step": 850
},
{
"epoch": 0.64,
"grad_norm": 56.31423994279339,
"learning_rate": 1.7373331155951233e-07,
"logits/chosen": 0.8688204884529114,
"logits/rejected": 1.4698970317840576,
"logps/chosen": -510.4227600097656,
"logps/rejected": -748.5259399414062,
"loss": 0.2649,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -2.550417900085449,
"rewards/margins": 2.730776309967041,
"rewards/rejected": -5.28119421005249,
"step": 860
},
{
"epoch": 0.65,
"grad_norm": 50.688626621321205,
"learning_rate": 1.6758329155077743e-07,
"logits/chosen": 1.0613950490951538,
"logits/rejected": 1.5818780660629272,
"logps/chosen": -495.5560607910156,
"logps/rejected": -708.2391967773438,
"loss": 0.2711,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.642883777618408,
"rewards/margins": 2.6204209327697754,
"rewards/rejected": -5.263304710388184,
"step": 870
},
{
"epoch": 0.65,
"grad_norm": 46.10359729315069,
"learning_rate": 1.6148873441577662e-07,
"logits/chosen": 1.0479947328567505,
"logits/rejected": 1.5524357557296753,
"logps/chosen": -480.2462463378906,
"logps/rejected": -707.98681640625,
"loss": 0.2699,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.261603355407715,
"rewards/margins": 2.4961774349212646,
"rewards/rejected": -4.757781028747559,
"step": 880
},
{
"epoch": 0.66,
"grad_norm": 41.346767344116245,
"learning_rate": 1.5545374152738934e-07,
"logits/chosen": 1.1905092000961304,
"logits/rejected": 1.6182410717010498,
"logps/chosen": -468.92083740234375,
"logps/rejected": -689.1092529296875,
"loss": 0.2722,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.264604091644287,
"rewards/margins": 2.391749143600464,
"rewards/rejected": -4.65635347366333,
"step": 890
},
{
"epoch": 0.67,
"grad_norm": 60.48896334839974,
"learning_rate": 1.4948237417433775e-07,
"logits/chosen": 1.380293369293213,
"logits/rejected": 2.2697908878326416,
"logps/chosen": -436.1393127441406,
"logps/rejected": -673.2228393554688,
"loss": 0.2492,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -2.151729106903076,
"rewards/margins": 2.624401330947876,
"rewards/rejected": -4.776130676269531,
"step": 900
},
{
"epoch": 0.67,
"eval_logits/chosen": 0.5183509588241577,
"eval_logits/rejected": 1.0725551843643188,
"eval_logps/chosen": -518.2382202148438,
"eval_logps/rejected": -652.8116455078125,
"eval_loss": 0.5152209997177124,
"eval_rewards/accuracies": 0.7734375,
"eval_rewards/chosen": -2.064617395401001,
"eval_rewards/margins": 1.2882475852966309,
"eval_rewards/rejected": -3.352864980697632,
"eval_runtime": 97.3137,
"eval_samples_per_second": 20.552,
"eval_steps_per_second": 0.329,
"step": 900
},
{
"epoch": 0.68,
"grad_norm": 59.39383985362304,
"learning_rate": 1.435786508281158e-07,
"logits/chosen": 1.9009380340576172,
"logits/rejected": 2.567354679107666,
"logps/chosen": -482.70513916015625,
"logps/rejected": -720.0316162109375,
"loss": 0.2499,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -2.3441195487976074,
"rewards/margins": 2.6516547203063965,
"rewards/rejected": -4.995774269104004,
"step": 910
},
{
"epoch": 0.68,
"grad_norm": 58.953283614647454,
"learning_rate": 1.3774654443873174e-07,
"logits/chosen": 1.749333381652832,
"logits/rejected": 2.4905173778533936,
"logps/chosen": -512.65625,
"logps/rejected": -763.8499145507812,
"loss": 0.2542,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -2.783947467803955,
"rewards/margins": 2.989567756652832,
"rewards/rejected": -5.773515224456787,
"step": 920
},
{
"epoch": 0.69,
"grad_norm": 57.229551980352035,
"learning_rate": 1.31989979761085e-07,
"logits/chosen": 1.3056137561798096,
"logits/rejected": 2.2303478717803955,
"logps/chosen": -465.61627197265625,
"logps/rejected": -746.7559814453125,
"loss": 0.2416,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -2.5093438625335693,
"rewards/margins": 3.106735944747925,
"rewards/rejected": -5.616079807281494,
"step": 930
},
{
"epoch": 0.7,
"grad_norm": 53.92751444407525,
"learning_rate": 1.2631283071377618e-07,
"logits/chosen": 1.6224052906036377,
"logits/rejected": 1.9630991220474243,
"logps/chosen": -458.9669494628906,
"logps/rejected": -742.6818237304688,
"loss": 0.2429,
"rewards/accuracies": 0.90625,
"rewards/chosen": -2.4590606689453125,
"rewards/margins": 2.7507693767547607,
"rewards/rejected": -5.209830284118652,
"step": 940
},
{
"epoch": 0.71,
"grad_norm": 48.183067890071925,
"learning_rate": 1.2071891777212744e-07,
"logits/chosen": 1.061023235321045,
"logits/rejected": 1.9151092767715454,
"logps/chosen": -507.06744384765625,
"logps/rejected": -707.039794921875,
"loss": 0.253,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.448425054550171,
"rewards/margins": 2.3641083240509033,
"rewards/rejected": -4.812533855438232,
"step": 950
},
{
"epoch": 0.71,
"grad_norm": 48.31856194766799,
"learning_rate": 1.1521200539716874e-07,
"logits/chosen": 1.2143045663833618,
"logits/rejected": 1.9916166067123413,
"logps/chosen": -500.71038818359375,
"logps/rejected": -771.3677978515625,
"loss": 0.2426,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.3821799755096436,
"rewards/margins": 3.1737558841705322,
"rewards/rejected": -5.555935859680176,
"step": 960
},
{
"epoch": 0.72,
"grad_norm": 57.66373376149326,
"learning_rate": 1.0979579950231821e-07,
"logits/chosen": 1.1112618446350098,
"logits/rejected": 2.246898889541626,
"logps/chosen": -502.126220703125,
"logps/rejected": -734.8248901367188,
"loss": 0.241,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -2.395838737487793,
"rewards/margins": 2.6420400142669678,
"rewards/rejected": -5.03787899017334,
"step": 970
},
{
"epoch": 0.73,
"grad_norm": 55.20670800594472,
"learning_rate": 1.0447394495946291e-07,
"logits/chosen": 1.387683391571045,
"logits/rejected": 2.400949478149414,
"logps/chosen": -515.9779052734375,
"logps/rejected": -765.4949340820312,
"loss": 0.2468,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -2.70845365524292,
"rewards/margins": 2.7117531299591064,
"rewards/rejected": -5.420206546783447,
"step": 980
},
{
"epoch": 0.74,
"grad_norm": 45.9412294534277,
"learning_rate": 9.925002314611841e-08,
"logits/chosen": 1.8099420070648193,
"logits/rejected": 2.5098319053649902,
"logps/chosen": -484.7242736816406,
"logps/rejected": -777.49169921875,
"loss": 0.2383,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.564988374710083,
"rewards/margins": 2.9337170124053955,
"rewards/rejected": -5.498705863952637,
"step": 990
},
{
"epoch": 0.74,
"grad_norm": 64.863814963629,
"learning_rate": 9.412754953531663e-08,
"logits/chosen": 1.5222892761230469,
"logits/rejected": 2.5317773818969727,
"logps/chosen": -507.424072265625,
"logps/rejected": -756.7098388671875,
"loss": 0.262,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -2.656026601791382,
"rewards/margins": 2.7969748973846436,
"rewards/rejected": -5.453001976013184,
"step": 1000
},
{
"epoch": 0.74,
"eval_logits/chosen": 0.6804571151733398,
"eval_logits/rejected": 1.3123811483383179,
"eval_logps/chosen": -556.8264770507812,
"eval_logps/rejected": -703.1602783203125,
"eval_loss": 0.5241079330444336,
"eval_rewards/accuracies": 0.76171875,
"eval_rewards/chosen": -2.4504995346069336,
"eval_rewards/margins": 1.405852198600769,
"eval_rewards/rejected": -3.856351613998413,
"eval_runtime": 97.4441,
"eval_samples_per_second": 20.525,
"eval_steps_per_second": 0.328,
"step": 1000
},
{
"epoch": 0.75,
"grad_norm": 69.68207773392557,
"learning_rate": 8.910997132984479e-08,
"logits/chosen": 1.820955514907837,
"logits/rejected": 2.952479839324951,
"logps/chosen": -544.1399536132812,
"logps/rejected": -808.0184936523438,
"loss": 0.2504,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.861184597015381,
"rewards/margins": 3.071931838989258,
"rewards/rejected": -5.933116436004639,
"step": 1010
},
{
"epoch": 0.76,
"grad_norm": 50.59071094029437,
"learning_rate": 8.42006651424274e-08,
"logits/chosen": 1.8404204845428467,
"logits/rejected": 2.6863815784454346,
"logps/chosen": -461.4169921875,
"logps/rejected": -703.1361083984375,
"loss": 0.2318,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -2.4329962730407715,
"rewards/margins": 2.7300188541412354,
"rewards/rejected": -5.163014888763428,
"step": 1020
},
{
"epoch": 0.77,
"grad_norm": 57.22762908033313,
"learning_rate": 7.940293472341217e-08,
"logits/chosen": 2.013861894607544,
"logits/rejected": 2.7502970695495605,
"logps/chosen": -477.7572326660156,
"logps/rejected": -773.4556884765625,
"loss": 0.2276,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.6210336685180664,
"rewards/margins": 3.139965057373047,
"rewards/rejected": -5.7609992027282715,
"step": 1030
},
{
"epoch": 0.77,
"grad_norm": 55.15868922046573,
"learning_rate": 7.472000873748918e-08,
"logits/chosen": 2.0298519134521484,
"logits/rejected": 2.990135431289673,
"logps/chosen": -528.5840454101562,
"logps/rejected": -781.4909057617188,
"loss": 0.2487,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.6361494064331055,
"rewards/margins": 2.9660372734069824,
"rewards/rejected": -5.602187156677246,
"step": 1040
},
{
"epoch": 0.78,
"grad_norm": 43.438291077124795,
"learning_rate": 7.015503859093927e-08,
"logits/chosen": 2.1326801776885986,
"logits/rejected": 2.5511794090270996,
"logps/chosen": -486.6455078125,
"logps/rejected": -757.7630004882812,
"loss": 0.2148,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.617185592651367,
"rewards/margins": 2.795973062515259,
"rewards/rejected": -5.413158893585205,
"step": 1050
},
{
"epoch": 0.79,
"grad_norm": 63.14016572546011,
"learning_rate": 6.571109631087451e-08,
"logits/chosen": 2.417752742767334,
"logits/rejected": 3.036146402359009,
"logps/chosen": -494.73046875,
"logps/rejected": -811.0126953125,
"loss": 0.2112,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -2.588284492492676,
"rewards/margins": 3.300442934036255,
"rewards/rejected": -5.888727188110352,
"step": 1060
},
{
"epoch": 0.79,
"grad_norm": 58.89863039830767,
"learning_rate": 6.139117247789687e-08,
"logits/chosen": 2.5516977310180664,
"logits/rejected": 3.055995464324951,
"logps/chosen": -535.7842407226562,
"logps/rejected": -800.0374145507812,
"loss": 0.2248,
"rewards/accuracies": 0.90625,
"rewards/chosen": -2.956123113632202,
"rewards/margins": 2.720890998840332,
"rewards/rejected": -5.677014350891113,
"step": 1070
},
{
"epoch": 0.8,
"grad_norm": 41.21215573686561,
"learning_rate": 5.719817421356685e-08,
"logits/chosen": 1.9021530151367188,
"logits/rejected": 2.7421538829803467,
"logps/chosen": -549.5343017578125,
"logps/rejected": -820.0753784179688,
"loss": 0.2033,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -2.7265052795410156,
"rewards/margins": 3.281470537185669,
"rewards/rejected": -6.007976055145264,
"step": 1080
},
{
"epoch": 0.81,
"grad_norm": 58.39711865385947,
"learning_rate": 5.313492322403701e-08,
"logits/chosen": 2.2018539905548096,
"logits/rejected": 2.951138496398926,
"logps/chosen": -533.9331665039062,
"logps/rejected": -891.0558471679688,
"loss": 0.1937,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -2.8866357803344727,
"rewards/margins": 3.6149306297302246,
"rewards/rejected": -6.501566410064697,
"step": 1090
},
{
"epoch": 0.82,
"grad_norm": 51.18256501676837,
"learning_rate": 4.9204153901165805e-08,
"logits/chosen": 1.9893665313720703,
"logits/rejected": 2.7781219482421875,
"logps/chosen": -530.7794189453125,
"logps/rejected": -824.0559692382812,
"loss": 0.2299,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -2.8573508262634277,
"rewards/margins": 3.2189173698425293,
"rewards/rejected": -6.076268196105957,
"step": 1100
},
{
"epoch": 0.82,
"eval_logits/chosen": 0.8391125202178955,
"eval_logits/rejected": 1.4834216833114624,
"eval_logps/chosen": -588.2494506835938,
"eval_logps/rejected": -741.857421875,
"eval_loss": 0.5312901139259338,
"eval_rewards/accuracies": 0.7578125,
"eval_rewards/chosen": -2.7647294998168945,
"eval_rewards/margins": 1.4785932302474976,
"eval_rewards/rejected": -4.243322849273682,
"eval_runtime": 97.5423,
"eval_samples_per_second": 20.504,
"eval_steps_per_second": 0.328,
"step": 1100
},
{
"epoch": 0.82,
"grad_norm": 68.60925195657734,
"learning_rate": 4.540851148239036e-08,
"logits/chosen": 1.7061752080917358,
"logits/rejected": 2.698995351791382,
"logps/chosen": -537.1931762695312,
"logps/rejected": -848.33154296875,
"loss": 0.2129,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -2.7809014320373535,
"rewards/margins": 3.3348469734191895,
"rewards/rejected": -6.115748405456543,
"step": 1110
},
{
"epoch": 0.83,
"grad_norm": 48.80096479357628,
"learning_rate": 4.1750550270596206e-08,
"logits/chosen": 1.531884789466858,
"logits/rejected": 2.923696994781494,
"logps/chosen": -509.5885314941406,
"logps/rejected": -794.9307250976562,
"loss": 0.1954,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.630959987640381,
"rewards/margins": 3.3725571632385254,
"rewards/rejected": -6.003516674041748,
"step": 1120
},
{
"epoch": 0.84,
"grad_norm": 68.79197398198284,
"learning_rate": 3.823273191518234e-08,
"logits/chosen": 1.5292671918869019,
"logits/rejected": 2.3230159282684326,
"logps/chosen": -568.5833740234375,
"logps/rejected": -835.826171875,
"loss": 0.2178,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -3.0106937885284424,
"rewards/margins": 3.2017643451690674,
"rewards/rejected": -6.212458610534668,
"step": 1130
},
{
"epoch": 0.85,
"grad_norm": 59.434543375011025,
"learning_rate": 3.485742375547745e-08,
"logits/chosen": 1.4421080350875854,
"logits/rejected": 2.442089796066284,
"logps/chosen": -553.727294921875,
"logps/rejected": -822.7138671875,
"loss": 0.2009,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.1090734004974365,
"rewards/margins": 2.9853668212890625,
"rewards/rejected": -6.094440460205078,
"step": 1140
},
{
"epoch": 0.85,
"grad_norm": 38.888275757403804,
"learning_rate": 3.162689722762365e-08,
"logits/chosen": 1.5811113119125366,
"logits/rejected": 2.2564284801483154,
"logps/chosen": -543.1163940429688,
"logps/rejected": -842.681640625,
"loss": 0.2095,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -2.9668571949005127,
"rewards/margins": 3.10882830619812,
"rewards/rejected": -6.075685024261475,
"step": 1150
},
{
"epoch": 0.86,
"grad_norm": 42.47551430381964,
"learning_rate": 2.8543326335997904e-08,
"logits/chosen": 1.768690824508667,
"logits/rejected": 2.4484939575195312,
"logps/chosen": -556.0635375976562,
"logps/rejected": -805.807373046875,
"loss": 0.2046,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -2.865739107131958,
"rewards/margins": 2.8989548683166504,
"rewards/rejected": -5.764693737030029,
"step": 1160
},
{
"epoch": 0.87,
"grad_norm": 59.36158165544989,
"learning_rate": 2.560878619020157e-08,
"logits/chosen": 1.9017894268035889,
"logits/rejected": 2.7026009559631348,
"logps/chosen": -521.269287109375,
"logps/rejected": -813.7127685546875,
"loss": 0.1964,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -2.9693474769592285,
"rewards/margins": 3.1322848796844482,
"rewards/rejected": -6.101632595062256,
"step": 1170
},
{
"epoch": 0.88,
"grad_norm": 49.475189963130575,
"learning_rate": 2.2825251608601466e-08,
"logits/chosen": 1.8870357275009155,
"logits/rejected": 2.8944287300109863,
"logps/chosen": -558.059814453125,
"logps/rejected": -868.568359375,
"loss": 0.1891,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -3.1376397609710693,
"rewards/margins": 3.2884891033172607,
"rewards/rejected": -6.426129341125488,
"step": 1180
},
{
"epoch": 0.88,
"grad_norm": 85.599165147591,
"learning_rate": 2.0194595789362474e-08,
"logits/chosen": 1.9095745086669922,
"logits/rejected": 2.530900478363037,
"logps/chosen": -577.1746826171875,
"logps/rejected": -892.88623046875,
"loss": 0.2027,
"rewards/accuracies": 0.9375,
"rewards/chosen": -3.0735995769500732,
"rewards/margins": 3.377427339553833,
"rewards/rejected": -6.451026916503906,
"step": 1190
},
{
"epoch": 0.89,
"grad_norm": 45.52491787365754,
"learning_rate": 1.7718589049866728e-08,
"logits/chosen": 2.376490592956543,
"logits/rejected": 3.1364424228668213,
"logps/chosen": -510.269287109375,
"logps/rejected": -829.1940307617188,
"loss": 0.1974,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.9278645515441895,
"rewards/margins": 3.433408737182617,
"rewards/rejected": -6.361273765563965,
"step": 1200
},
{
"epoch": 0.89,
"eval_logits/chosen": 0.8963963389396667,
"eval_logits/rejected": 1.5457934141159058,
"eval_logps/chosen": -606.617431640625,
"eval_logps/rejected": -764.6512451171875,
"eval_loss": 0.5366576910018921,
"eval_rewards/accuracies": 0.76171875,
"eval_rewards/chosen": -2.948409080505371,
"eval_rewards/margins": 1.5228519439697266,
"eval_rewards/rejected": -4.471261024475098,
"eval_runtime": 97.4355,
"eval_samples_per_second": 20.526,
"eval_steps_per_second": 0.328,
"step": 1200
},
{
"epoch": 0.9,
"grad_norm": 56.7147448955845,
"learning_rate": 1.539889763536645e-08,
"logits/chosen": 1.9441492557525635,
"logits/rejected": 3.0478804111480713,
"logps/chosen": -538.355224609375,
"logps/rejected": -856.01416015625,
"loss": 0.2187,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.842240571975708,
"rewards/margins": 3.5280959606170654,
"rewards/rejected": -6.370336055755615,
"step": 1210
},
{
"epoch": 0.91,
"grad_norm": 60.258963508413004,
"learning_rate": 1.3237082597673172e-08,
"logits/chosen": 2.1856608390808105,
"logits/rejected": 2.853616237640381,
"logps/chosen": -517.0845947265625,
"logps/rejected": -845.6990966796875,
"loss": 0.204,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -3.0185937881469727,
"rewards/margins": 3.2306289672851562,
"rewards/rejected": -6.249222755432129,
"step": 1220
},
{
"epoch": 0.91,
"grad_norm": 71.41232139420377,
"learning_rate": 1.1234598744637502e-08,
"logits/chosen": 1.5448696613311768,
"logits/rejected": 2.610525608062744,
"logps/chosen": -545.0371704101562,
"logps/rejected": -821.2421875,
"loss": 0.2063,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.1403965950012207,
"rewards/margins": 3.1843514442443848,
"rewards/rejected": -6.3247480392456055,
"step": 1230
},
{
"epoch": 0.92,
"grad_norm": 57.959377016977456,
"learning_rate": 9.392793661126414e-09,
"logits/chosen": 1.898782730102539,
"logits/rejected": 2.7061781883239746,
"logps/chosen": -582.9857177734375,
"logps/rejected": -879.3019409179688,
"loss": 0.1979,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.2453556060791016,
"rewards/margins": 3.297309160232544,
"rewards/rejected": -6.542665004730225,
"step": 1240
},
{
"epoch": 0.93,
"grad_norm": 50.86760187147993,
"learning_rate": 7.71290680215711e-09,
"logits/chosen": 2.0340778827667236,
"logits/rejected": 2.8080642223358154,
"logps/chosen": -558.147705078125,
"logps/rejected": -874.9266357421875,
"loss": 0.1974,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -3.0640769004821777,
"rewards/margins": 3.380338668823242,
"rewards/rejected": -6.444415092468262,
"step": 1250
},
{
"epoch": 0.94,
"grad_norm": 61.973766270626015,
"learning_rate": 6.196068658797543e-09,
"logits/chosen": 1.8814232349395752,
"logits/rejected": 2.7813236713409424,
"logps/chosen": -551.5777587890625,
"logps/rejected": -826.7698974609375,
"loss": 0.1971,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.9602150917053223,
"rewards/margins": 3.0024728775024414,
"rewards/rejected": -5.9626874923706055,
"step": 1260
},
{
"epoch": 0.94,
"grad_norm": 67.6695850405579,
"learning_rate": 4.843299997394717e-09,
"logits/chosen": 1.856507658958435,
"logits/rejected": 2.7601516246795654,
"logps/chosen": -540.268310546875,
"logps/rejected": -846.9691162109375,
"loss": 0.2067,
"rewards/accuracies": 0.9375,
"rewards/chosen": -3.077454090118408,
"rewards/margins": 3.414836883544922,
"rewards/rejected": -6.492290496826172,
"step": 1270
},
{
"epoch": 0.95,
"grad_norm": 68.73319089653008,
"learning_rate": 3.655511172643372e-09,
"logits/chosen": 1.932074785232544,
"logits/rejected": 2.437225818634033,
"logps/chosen": -531.4140625,
"logps/rejected": -836.9505615234375,
"loss": 0.1876,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.8276994228363037,
"rewards/margins": 3.25665020942688,
"rewards/rejected": -6.084350109100342,
"step": 1280
},
{
"epoch": 0.96,
"grad_norm": 50.423800165908794,
"learning_rate": 2.633501514956532e-09,
"logits/chosen": 1.9169034957885742,
"logits/rejected": 2.7369441986083984,
"logps/chosen": -586.8289794921875,
"logps/rejected": -896.8014526367188,
"loss": 0.2044,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -3.1295228004455566,
"rewards/margins": 3.5232949256896973,
"rewards/rejected": -6.652817726135254,
"step": 1290
},
{
"epoch": 0.97,
"grad_norm": 57.31903342529662,
"learning_rate": 1.777958792550993e-09,
"logits/chosen": 1.5464543104171753,
"logits/rejected": 2.9688878059387207,
"logps/chosen": -587.2015380859375,
"logps/rejected": -853.0357666015625,
"loss": 0.1842,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -2.988502025604248,
"rewards/margins": 3.156489372253418,
"rewards/rejected": -6.144991397857666,
"step": 1300
},
{
"epoch": 0.97,
"eval_logits/chosen": 0.9558575749397278,
"eval_logits/rejected": 1.609464406967163,
"eval_logps/chosen": -609.159423828125,
"eval_logps/rejected": -767.4317016601562,
"eval_loss": 0.5365558862686157,
"eval_rewards/accuracies": 0.76171875,
"eval_rewards/chosen": -2.9738292694091797,
"eval_rewards/margins": 1.5252362489700317,
"eval_rewards/rejected": -4.499065399169922,
"eval_runtime": 97.3239,
"eval_samples_per_second": 20.55,
"eval_steps_per_second": 0.329,
"step": 1300
},
{
"epoch": 0.97,
"grad_norm": 66.21886288694567,
"learning_rate": 1.0894587486089125e-09,
"logits/chosen": 1.8931999206542969,
"logits/rejected": 2.824298858642578,
"logps/chosen": -563.06201171875,
"logps/rejected": -834.8709716796875,
"loss": 0.2157,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.2370285987854004,
"rewards/margins": 3.035515546798706,
"rewards/rejected": -6.272543430328369,
"step": 1310
},
{
"epoch": 0.98,
"grad_norm": 45.779926433395936,
"learning_rate": 5.684647138277098e-10,
"logits/chosen": 1.7055333852767944,
"logits/rejected": 2.308079719543457,
"logps/chosen": -531.0139770507812,
"logps/rejected": -862.2609252929688,
"loss": 0.1974,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -2.956573486328125,
"rewards/margins": 3.375626802444458,
"rewards/rejected": -6.332200050354004,
"step": 1320
},
{
"epoch": 0.99,
"grad_norm": 58.05458328657747,
"learning_rate": 2.153272946184559e-10,
"logits/chosen": 1.735358476638794,
"logits/rejected": 2.259385585784912,
"logps/chosen": -585.9295043945312,
"logps/rejected": -861.4645385742188,
"loss": 0.1738,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -3.10073184967041,
"rewards/margins": 2.996291399002075,
"rewards/rejected": -6.097023010253906,
"step": 1330
},
{
"epoch": 1.0,
"grad_norm": 46.42702960995785,
"learning_rate": 3.0284137163189004e-11,
"logits/chosen": 2.000138759613037,
"logits/rejected": 2.7859671115875244,
"logps/chosen": -530.1033935546875,
"logps/rejected": -878.3465576171875,
"loss": 0.1884,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -3.1844658851623535,
"rewards/margins": 3.3884029388427734,
"rewards/rejected": -6.572869300842285,
"step": 1340
},
{
"epoch": 1.0,
"step": 1346,
"total_flos": 0.0,
"train_loss": 0.335402155391883,
"train_runtime": 21644.3608,
"train_samples_per_second": 7.959,
"train_steps_per_second": 0.062
}
],
"logging_steps": 10,
"max_steps": 1346,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}